mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-05 08:42:54 +00:00
Compare commits
46 Commits
wasm
...
broken_col
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9d8031664 | ||
|
|
9628413386 | ||
|
|
b2ce65f52d | ||
|
|
0cea706f10 | ||
|
|
ad81e131ec | ||
|
|
71d41ca209 | ||
|
|
bc69dab822 | ||
|
|
72acad0921 | ||
|
|
c9459f74e8 | ||
|
|
327ca2ab02 | ||
|
|
16ca6a0e5c | ||
|
|
8c07ae653d | ||
|
|
3d483f8711 | ||
|
|
56b2e9731f | ||
|
|
08d2cc6c7b | ||
|
|
c85668cabe | ||
|
|
82d87416c2 | ||
|
|
96b2c2971e | ||
|
|
162afd73f6 | ||
|
|
ddfd87fa59 | ||
|
|
24050d0eb5 | ||
|
|
89eb209ece | ||
|
|
9a0b7f9855 | ||
|
|
8e343b1ca3 | ||
|
|
99c0b84036 | ||
|
|
ca74c14647 | ||
|
|
68ee18e4e8 | ||
|
|
8a33ddaca7 | ||
|
|
5637657c2f | ||
|
|
2e3c9a8878 | ||
|
|
78673172d0 | ||
|
|
175b76f119 | ||
|
|
9b79e21bd7 | ||
|
|
5e38ae336f | ||
|
|
8604351f59 | ||
|
|
6a48953d8a | ||
|
|
0804b42afa | ||
|
|
8083bc6eef | ||
|
|
0156f88265 | ||
|
|
a1c07bf457 | ||
|
|
9de74b68d1 | ||
|
|
57c7073867 | ||
|
|
121374b89b | ||
|
|
e44782bf14 | ||
|
|
dfafb24fa6 | ||
|
|
4c6f9541e9 |
24
CHANGELOG.md
24
CHANGELOG.md
@@ -1,9 +1,21 @@
|
||||
Tantivy 0.5.2
|
||||
Tantivy 0.6
|
||||
==========================
|
||||
|
||||
- Removed C code. Tantivy is now pure Rust.
|
||||
- BM25
|
||||
- Approximate field norms encoded over 1 byte.
|
||||
- Compiles on stable rust
|
||||
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
|
||||
- Completely uncompressed
|
||||
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
|
||||
- Add NGram token support (@drusellers)
|
||||
- Add Stopword Filter support (@drusellers)
|
||||
|
||||
Tantivy 0.5.2
|
||||
===========================
|
||||
- bugfix #274
|
||||
- bugfix #280
|
||||
- bugfix #289
|
||||
|
||||
|
||||
Tantivy 0.5.1
|
||||
==========================
|
||||
@@ -81,7 +93,7 @@ Tantivy 0.3
|
||||
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
|
||||
for their contribution to this release.
|
||||
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
for their advise and company :)
|
||||
|
||||
https://gitter.im/tantivy-search/tantivy
|
||||
@@ -89,9 +101,9 @@ https://gitter.im/tantivy-search/tantivy
|
||||
|
||||
Warning:
|
||||
|
||||
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
|
||||
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
|
||||
code and index format.
|
||||
You should not expect backward compatibility before
|
||||
You should not expect backward compatibility before
|
||||
tantivy 1.0.
|
||||
|
||||
|
||||
@@ -117,7 +129,7 @@ Thanks to @KodrAus ! (#108)
|
||||
the natural ordering.
|
||||
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
|
||||
- Misc invisible bug fixes, and code cleanup.
|
||||
- Use
|
||||
- Use
|
||||
|
||||
|
||||
|
||||
|
||||
30
Cargo.toml
30
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.5.1"
|
||||
version = "0.6.0-dev"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -12,12 +12,14 @@ readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.9.1"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = {version="0.2", default-features=false}
|
||||
fst = {version="0.3", default-features=false}
|
||||
atomicwrites = {version="0.1", optional=true}
|
||||
tempfile = "2.1"
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
tempdir = "0.3"
|
||||
@@ -26,6 +28,8 @@ serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
lz4 = "1.20"
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
@@ -36,17 +40,16 @@ error-chain = "0.8"
|
||||
owning_ref = "0.3"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9", features = ["nightly"]}
|
||||
downcast = { version="0.9" }
|
||||
matches = "0.1"
|
||||
snap = "0.2"
|
||||
bitpacking = {path = "../bitpacking"}
|
||||
bitpacking = "0.4"
|
||||
fnv = "1.0.6"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.3"
|
||||
tempfile = "2.1"
|
||||
env_logger = "0.4"
|
||||
|
||||
[profile.release]
|
||||
@@ -55,12 +58,11 @@ debug = false
|
||||
lto = true
|
||||
debug-assertions = false
|
||||
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
streamdict = []
|
||||
simd = ["bitpacking/simd"]
|
||||
mmap = ["fst/mmap", "atomicwrites"]
|
||||
|
||||
unstable = ["simd"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
@@ -69,11 +71,5 @@ travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
name = "simple_search"
|
||||
required-features = ["mmap"]
|
||||
|
||||
|
||||
[[bin]]
|
||||
name = "convert_to_static"
|
||||
path = "./bin/convert_to_static.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "test_static_dir"
|
||||
path = "./bin/test_static_dir.rs"
|
||||
[[example]]
|
||||
name = "custom_tokenizer"
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,4 +1,4 @@
|
||||
Copyright (c) 2016 Paul Masurel
|
||||
Copyright (c) 2018 by Paul Masurel, Google LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
|
||||
@@ -43,15 +43,12 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
||||
|
||||
## Development
|
||||
|
||||
Tantivy requires Rust Nightly because it uses requires the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
|
||||
and [simd](https://github.com/rust-lang/rust/issues/27731).
|
||||
|
||||
|
||||
Tantivy now compiles on stable rust.
|
||||
To check out and run test, you can simply run :
|
||||
|
||||
git clone git@github.com:tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
cargo +nightly build
|
||||
cargo build
|
||||
|
||||
|
||||
## Note on release build and performance
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
extern crate tantivy;
|
||||
use tantivy::directory::write_static_from_directory;
|
||||
|
||||
fn main() {
|
||||
// Prints each argument on a separate line
|
||||
let mut args = env::args();
|
||||
args.next().unwrap();
|
||||
let directory_path= args.next().expect("Expect 2 args.<directory_path> <outputfile>");
|
||||
let output_path = args.next().expect("Expect 2 args.<directory_path> <outputfile>");
|
||||
println!("{} => {}", directory_path, output_path);
|
||||
let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
|
||||
println!("Read all");
|
||||
let mut output = File::create(output_path).unwrap();
|
||||
output.write_all(&buffer[..]).unwrap();
|
||||
output.flush().unwrap();
|
||||
}
|
||||
@@ -1,51 +0,0 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
extern crate tantivy;
|
||||
use tantivy::directory::{StaticDirectory, write_static_from_directory};
|
||||
use tantivy::Index;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::collector::TopCollector;
|
||||
|
||||
|
||||
static DATA: &'static [u8] = include_bytes!("output.bin");
|
||||
|
||||
fn run() -> tantivy::Result<()> {
|
||||
// Prints each argument on a separate line
|
||||
let directory = StaticDirectory::open(DATA).unwrap();
|
||||
let index = Index::open_directory(directory).unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let schema = index.schema();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
let query = query_parser.parse_query("sea whale")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
fn main() {
|
||||
run().unwrap();
|
||||
}
|
||||
226
examples/custom_tokenizer.rs
Normal file
226
examples/custom_tokenizer.rs
Normal file
@@ -0,0 +1,226 @@
|
||||
extern crate tantivy;
|
||||
extern crate tempdir;
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
|
||||
run_example(dir.path()).unwrap();
|
||||
dir.close().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
// The schema declares which fields are in the index,
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// Our first field is title.
|
||||
// In this example we want to use NGram searching
|
||||
// we will set that to 3 characters, so any three
|
||||
// char in the title should be findable.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("ngram3")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
// We want full-text search for it, but we do not
|
||||
// need to be able to be able to retrieve it
|
||||
// for our application.
|
||||
//
|
||||
// We can make our index lighter and
|
||||
// by omitting `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Let's create a brand new index.
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create(index_path, schema.clone())?;
|
||||
|
||||
// here we are registering our custome tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
index
|
||||
.tokenizers()
|
||||
.register("ngram3", NgramTokenizer::new(3, 3, false));
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
// This single `IndexWriter` is already
|
||||
// multithreaded.
|
||||
//
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
// Let's index our documents!
|
||||
// We first need a handle on the title and the body field.
|
||||
|
||||
// ### Create a document "manually".
|
||||
//
|
||||
// We can create a document manually, by setting the fields
|
||||
// one by one in a Document object.
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
// ### Create a document directly from json.
|
||||
//
|
||||
// Alternatively, we can use our schema to parse a
|
||||
// document object directly from json.
|
||||
// The document is a string, but we use the `json` macro
|
||||
// from `serde_json` for the convenience of multi-line support.
|
||||
let json = json!({
|
||||
"title": "Of Mice and Men",
|
||||
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
});
|
||||
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
// The following document has two titles.
|
||||
let json = json!({
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
});
|
||||
let frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
// here. You can check out tantivy's tutorial to index
|
||||
// the English wikipedia. Tantivy's indexing is rather fast.
|
||||
// Indexing 5 million articles of the English wikipedia takes
|
||||
// around 4 minutes on my computer!
|
||||
|
||||
// ### Committing
|
||||
//
|
||||
// At this point our documents are not searchable.
|
||||
//
|
||||
//
|
||||
// We need to call .commit() explicitly to force the
|
||||
// index_writer to finish processing the documents in the queue,
|
||||
// flush the current index to the disk, and advertise
|
||||
// the existence of new documents.
|
||||
//
|
||||
// This call is blocking.
|
||||
index_writer.commit()?;
|
||||
|
||||
// If `.commit()` returns correctly, then all of the
|
||||
// documents that have been added are guaranteed to be
|
||||
// persistently indexed.
|
||||
//
|
||||
// In the scenario of a crash or a power failure,
|
||||
// tantivy behaves as if has rolled back to its last
|
||||
// commit.
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every commit().
|
||||
index.load_searchers()?;
|
||||
|
||||
// Afterwards create one (or more) searchers.
|
||||
//
|
||||
// You should create a searcher
|
||||
// every time you start a "search query".
|
||||
let searcher = index.searcher();
|
||||
|
||||
// The query parser can interpret human queries.
|
||||
// Here, if the user does not specify which
|
||||
// field they want to search, tantivy will search
|
||||
// in both title and body.
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// here we want to get a hit on the 'ken' in Frankenstein
|
||||
let query = query_parser.parse_query("ken")?;
|
||||
|
||||
// A query defines a set of documents, as
|
||||
// well as the way they should be scored.
|
||||
//
|
||||
// A query created by the query parser is scored according
|
||||
// to a metric called Tf-Idf, and will consider
|
||||
// any document matching at least one of our terms.
|
||||
|
||||
// ### Collectors
|
||||
//
|
||||
// We are not interested in all of the documents but
|
||||
// only in the top 10. Keeping track of our top 10 best documents
|
||||
// is the role of the TopCollector.
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
// We can now perform our query.
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
// Our top collector now contains the 10
|
||||
// most relevant doc ids...
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
// Wait for indexing and merging threads to shut down.
|
||||
// Usually this isn't needed, but in `main` we try to
|
||||
// delete the temporary directory and that fails on
|
||||
// Windows if the files are still open.
|
||||
index_writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -5,11 +5,11 @@ extern crate tempdir;
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
use tantivy::Index;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
|
||||
@@ -1,27 +1,39 @@
|
||||
use Result;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::SegmentCollector;
|
||||
use collector::CollectorWrapper;
|
||||
|
||||
/// Collector that does nothing.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
/// be optimized away by the compiler.
|
||||
pub struct DoNothingCollector;
|
||||
impl Collector for DoNothingCollector {
|
||||
type Child = DoNothingCollector;
|
||||
#[inline]
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<DoNothingCollector> {
|
||||
Ok(DoNothingCollector)
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
#[inline]
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for DoNothingCollector {
|
||||
type CollectionResult = ();
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
|
||||
fn finalize(self) -> () {
|
||||
()
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
/// This contraption is only usable if the type of your collectors
|
||||
/// are known at compile time.
|
||||
@@ -30,34 +42,49 @@ pub struct ChainedCollector<Left: Collector, Right: Collector> {
|
||||
right: Right,
|
||||
}
|
||||
|
||||
pub struct ChainedSegmentCollector<Left: SegmentCollector, Right: SegmentCollector> {
|
||||
left: Left,
|
||||
right: Right,
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
/// Adds a collector
|
||||
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
|
||||
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, CollectorWrapper<C>> {
|
||||
ChainedCollector {
|
||||
left: self,
|
||||
right: new_collector,
|
||||
right: CollectorWrapper::new(new_collector),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
|
||||
fn set_segment(
|
||||
type Child = ChainedSegmentCollector<Left::Child, Right::Child>;
|
||||
fn for_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
self.left.set_segment(segment_local_id, segment)?;
|
||||
self.right.set_segment(segment_local_id, segment)?;
|
||||
Ok(())
|
||||
) -> Result<Self::Child> {
|
||||
Ok(ChainedSegmentCollector {
|
||||
left: self.left.for_segment(segment_local_id, segment)?,
|
||||
right: self.right.for_segment(segment_local_id, segment)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.left.requires_scoring() || self.right.requires_scoring()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Left: SegmentCollector, Right: SegmentCollector> SegmentCollector for ChainedSegmentCollector<Left, Right> {
|
||||
type CollectionResult = (Left::CollectionResult, Right::CollectionResult);
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.left.collect(doc, score);
|
||||
self.right.collect(doc, score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.left.requires_scoring() || self.right.requires_scoring()
|
||||
fn finalize(self) -> Self::CollectionResult {
|
||||
(self.left.finalize(), self.right.finalize())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,19 +98,35 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
use collector::{CountCollector, SegmentCollector, TopCollector};
|
||||
use schema::SchemaBuilder;
|
||||
use Index;
|
||||
use Document;
|
||||
|
||||
#[test]
|
||||
fn test_chained_collector() {
|
||||
let schema_builder = SchemaBuilder::new();
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let doc = Document::new();
|
||||
index_writer.add_document(doc);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_readers = searcher.segment_readers();
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
let mut segment_collector = collectors.for_segment(0, &segment_readers[0]).unwrap();
|
||||
segment_collector.collect(1, 0.2);
|
||||
segment_collector.collect(2, 0.1);
|
||||
segment_collector.collect(3, 0.5);
|
||||
collectors.merge_children(vec![segment_collector]);
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use super::Collector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
use SegmentReader;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
use collector::Combinable;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
@@ -21,12 +23,10 @@ impl CountCollector {
|
||||
}
|
||||
|
||||
impl Collector for CountCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
type Child = CountCollector;
|
||||
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<CountCollector> {
|
||||
Ok(CountCollector::default())
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -34,10 +34,28 @@ impl Collector for CountCollector {
|
||||
}
|
||||
}
|
||||
|
||||
impl Combinable for CountCollector {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.count += other.count;
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for CountCollector {
|
||||
type CollectionResult = CountCollector;
|
||||
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
}
|
||||
|
||||
fn finalize(self) -> CountCollector {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use collector::{Collector, CountCollector};
|
||||
use collector::{Collector, CountCollector, SegmentCollector};
|
||||
|
||||
#[test]
|
||||
fn test_count_collector() {
|
||||
|
||||
@@ -1,27 +1,24 @@
|
||||
use std::mem;
|
||||
use collector::Collector;
|
||||
use docset::SkipResult;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use schema::Facet;
|
||||
use schema::Field;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermStreamerBuilder;
|
||||
use std::collections::BTreeSet;
|
||||
use termdict::TermMerger;
|
||||
use docset::SkipResult;
|
||||
use std::{usize, u64};
|
||||
use std::iter::Peekable;
|
||||
use std::{u64, usize};
|
||||
use termdict::TermMerger;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use std::cmp::Ordering;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -196,19 +193,22 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// }
|
||||
/// ```
|
||||
pub struct FacetCollector {
|
||||
facet_ords: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||
segment_counters: Vec<SegmentFacetCounter>,
|
||||
facets: BTreeSet<Facet>,
|
||||
}
|
||||
|
||||
pub struct FacetSegmentCollector {
|
||||
reader: FacetReader,
|
||||
|
||||
facet_ords_buf: Vec<u64>,
|
||||
|
||||
// facet_ord -> collapse facet_id
|
||||
current_segment_collapse_mapping: Vec<usize>,
|
||||
collapse_mapping: Vec<usize>,
|
||||
// collapse facet_id -> count
|
||||
current_segment_counts: Vec<u64>,
|
||||
counts: Vec<u64>,
|
||||
// collapse facet_id -> facet_ord
|
||||
current_collapse_facet_ords: Vec<u64>,
|
||||
|
||||
facets: BTreeSet<Facet>,
|
||||
collapse_facet_ords: Vec<u64>,
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
@@ -242,15 +242,9 @@ impl FacetCollector {
|
||||
/// is of the proper type.
|
||||
pub fn for_field(field: Field) -> FacetCollector {
|
||||
FacetCollector {
|
||||
facet_ords: Vec::with_capacity(255),
|
||||
segment_counters: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
facets: BTreeSet::new(),
|
||||
|
||||
current_segment_collapse_mapping: Vec::new(),
|
||||
current_collapse_facet_ords: Vec::new(),
|
||||
current_segment_counts: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -281,69 +275,11 @@ impl FacetCollector {
|
||||
self.facets.insert(facet);
|
||||
}
|
||||
|
||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||
self.current_segment_collapse_mapping.clear();
|
||||
self.current_collapse_facet_ords.clear();
|
||||
self.current_segment_counts.clear();
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
self.current_collapse_facet_ords.push(0);
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if !facet_streamer.advance() {
|
||||
return;
|
||||
}
|
||||
'outer: loop {
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = self.current_collapse_facet_ords.len();
|
||||
self.current_collapse_facet_ords
|
||||
.push(facet_streamer.term_ord());
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(SegmentFacetCounter {
|
||||
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
/// it also translates the facet ordinals of the last segment.
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
pub fn harvest(self) -> FacetCounts {
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
@@ -381,38 +317,102 @@ impl FacetCollector {
|
||||
})
|
||||
.sum();
|
||||
if count > 0u64 {
|
||||
let bytes = facet_merger.key().to_owned();
|
||||
facet_counts.insert(Facet::from_encoded(bytes), count);
|
||||
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
||||
// may create an corrupted facet if the term dicitonary is corrupted
|
||||
let facet = unsafe { Facet::from_encoded(bytes) };
|
||||
facet_counts.insert(facet, count);
|
||||
}
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetSegmentCollector {
|
||||
fn into_segment_facet_counter(self) -> SegmentFacetCounter {
|
||||
SegmentFacetCounter {
|
||||
facet_reader: self.reader,
|
||||
facet_ords: self.collapse_facet_ords,
|
||||
facet_counts: self.counts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FacetCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.finalize_segment();
|
||||
type Child = FacetSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
self.set_collapse_mapping(&facet_reader);
|
||||
self.current_segment_counts
|
||||
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||
Ok(())
|
||||
|
||||
let mut collapse_mapping = Vec::new();
|
||||
let mut counts = Vec::new();
|
||||
let mut collapse_facet_ords = Vec::new();
|
||||
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
collapse_facet_ords.push(0);
|
||||
{
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if facet_streamer.advance() {
|
||||
'outer: loop {
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = collapse_facet_ords.len();
|
||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
||||
collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
counts.resize(collapse_facet_ords.len(), 0);
|
||||
|
||||
Ok(FacetSegmentCollector {
|
||||
reader: facet_reader,
|
||||
facet_ords_buf: Vec::with_capacity(255),
|
||||
collapse_mapping,
|
||||
counts,
|
||||
collapse_facet_ords,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for FacetSegmentCollector {
|
||||
type CollectionResult = Vec<SegmentFacetCounter>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
};
|
||||
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
|
||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||
for &facet_ord in &self.facet_ords {
|
||||
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||
{
|
||||
for &facet_ord in &self.facet_ords_buf {
|
||||
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
|
||||
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
@@ -421,8 +421,8 @@ impl Collector for FacetCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
fn finalize(self) -> Vec<SegmentFacetCounter> {
|
||||
vec![self.into_segment_facet_counter()]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -432,9 +432,20 @@ pub struct FacetCounts {
|
||||
facet_counts: BTreeMap<Facet, u64>,
|
||||
}
|
||||
|
||||
pub struct FacetChildIterator<'a> {
|
||||
underlying: btree_map::Range<'a, Facet, u64>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
type Item = (&'a Facet, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.underlying.next().map(|(facet, count)| (facet, *count))
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetCounts {
|
||||
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
|
||||
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
@@ -443,15 +454,13 @@ impl FacetCounts {
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
|
||||
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = Facet::from_encoded(facet_after_bytes);
|
||||
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
|
||||
self.facet_counts
|
||||
.range((left_bound, right_bound))
|
||||
.map(|(facet, count)| (facet, *count))
|
||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||
FacetChildIterator { underlying }
|
||||
}
|
||||
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
@@ -483,14 +492,13 @@ impl FacetCounts {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use test::Bencher;
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use query::AllQuery;
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use std::iter;
|
||||
use schema::Field;
|
||||
use core::Index;
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Field;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
@@ -545,8 +553,10 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
#[should_panic(
|
||||
expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet."
|
||||
)]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
@@ -604,6 +614,19 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use collector::FacetCollector;
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Facet;
|
||||
use schema::SchemaBuilder;
|
||||
use test::Bencher;
|
||||
use Index;
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
Defines how the documents matching a search query should be processed.
|
||||
*/
|
||||
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use query::Query;
|
||||
use Searcher;
|
||||
use downcast;
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::CountCollector;
|
||||
|
||||
mod multi_collector;
|
||||
pub use self::multi_collector::MultiCollector;
|
||||
//mod multi_collector;
|
||||
//pub use self::multi_collector::MultiCollector;
|
||||
|
||||
mod top_collector;
|
||||
pub use self::top_collector::TopCollector;
|
||||
@@ -53,31 +56,90 @@ pub use self::chained_collector::chain;
|
||||
///
|
||||
/// Segments are not guaranteed to be visited in any specific order.
|
||||
pub trait Collector {
|
||||
type Child : SegmentCollector + 'static;
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
fn set_segment(
|
||||
fn for_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
) -> Result<Self::Child>;
|
||||
|
||||
/// Returns true iff the collector requires to compute scores for documents.
|
||||
fn requires_scoring(&self) -> bool;
|
||||
|
||||
/// Search works as follows :
|
||||
///
|
||||
/// First the weight object associated to the query is created.
|
||||
///
|
||||
/// Then, the query loops over the segments and for each segment :
|
||||
/// - setup the collector and informs it that the segment being processed has changed.
|
||||
/// - creates a SegmentCollector for collecting documents associated to the segment
|
||||
/// - creates a `Scorer` object associated for this segment
|
||||
/// - iterate through the matched documents and push them to the segment collector.
|
||||
/// - turn the segment collector into a Combinable segment result
|
||||
///
|
||||
/// Combining all of the segment results gives a single Child::CollectionResult, which is returned.
|
||||
///
|
||||
/// The result will be Ok(None) in case of having no segments.
|
||||
fn search(&mut self, searcher: &Searcher, query: &Query) -> Result<Option<<Self::Child as SegmentCollector>::CollectionResult>> {
|
||||
let scoring_enabled = self.requires_scoring();
|
||||
let weight = query.weight(searcher, scoring_enabled)?;
|
||||
let mut results = Vec::new();
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
let mut child: Self::Child = self.for_segment(segment_ord as SegmentLocalId, segment_reader)?;
|
||||
let mut scorer = weight.scorer(segment_reader)?;
|
||||
scorer.collect(&mut child, segment_reader.delete_bitset());
|
||||
results.push(child.finalize());
|
||||
}
|
||||
Ok(results.into_iter().fold1(|x,y| {
|
||||
x.combine_into(y);
|
||||
x
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Combinable {
|
||||
fn combine_into(&mut self, other: Self);
|
||||
}
|
||||
|
||||
impl Combinable for () {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Combinable for Vec<T> {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.extend(other.into_iter());
|
||||
}
|
||||
}
|
||||
|
||||
impl<L: Combinable, R: Combinable> Combinable for (L, R) {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.0.combine_into(other.0);
|
||||
self.1.combine_into(other.1);
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SegmentCollector: downcast::Any + 'static {
|
||||
type CollectionResult: Combinable + downcast::Any + 'static;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
|
||||
/// Turn into the final result
|
||||
fn finalize(self) -> Self::CollectionResult;
|
||||
}
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
type Child = C::Child;
|
||||
|
||||
fn for_segment(
|
||||
&mut self, // TODO Ask Jason : why &mut self here!?
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
C::collect(self, doc, score)
|
||||
) -> Result<C::Child> {
|
||||
(*self).for_segment(segment_local_id, segment)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -85,25 +147,85 @@ impl<'a, C: Collector> Collector for &'a mut C {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CollectorWrapper<'a, TCollector: 'a + Collector>(&'a mut TCollector);
|
||||
|
||||
impl<'a, T: 'a + Collector> CollectorWrapper<'a, T> {
|
||||
pub fn new(collector: &'a mut T) -> CollectorWrapper<'a, T> {
|
||||
CollectorWrapper(collector)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: 'a + Collector> Collector for CollectorWrapper<'a, T> {
|
||||
type Child = T::Child;
|
||||
|
||||
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<T::Child> {
|
||||
self.0.for_segment(segment_local_id, segment)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.0.requires_scoring()
|
||||
}
|
||||
}
|
||||
|
||||
trait UntypedCollector {
|
||||
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>>;
|
||||
}
|
||||
|
||||
|
||||
impl<'a, TCollector:'a + Collector> UntypedCollector for CollectorWrapper<'a, TCollector> {
|
||||
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>> {
|
||||
let segment_collector = self.0.for_segment(segment_local_id, segment)?;
|
||||
Ok(Box::new(segment_collector))
|
||||
}
|
||||
}
|
||||
|
||||
trait UntypedSegmentCollector {
|
||||
fn finalize(self) -> Box<UntypedCombinable>;
|
||||
}
|
||||
|
||||
trait UntypedCombinable {
|
||||
fn combine_into(&mut self, other: Box<UntypedCombinable>);
|
||||
}
|
||||
|
||||
pub struct CombinableWrapper<'a, T: 'a + Combinable>(&'a mut T);
|
||||
|
||||
impl<'a, T: 'a + Combinable> CombinableWrapper<'a, T> {
|
||||
pub fn new(combinable: &'a mut T) -> CombinableWrapper<'a, T> {
|
||||
CombinableWrapper(combinable)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: 'a + Combinable> Combinable for CombinableWrapper<'a, T> {
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.0.combine_into(*::downcast::Downcast::<T>::downcast(other).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use DocId;
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practise, as it does not store
|
||||
/// the segment ordinals
|
||||
pub struct TestCollector {
|
||||
next_offset: DocId,
|
||||
docs: Vec<DocId>,
|
||||
scores: Vec<Score>,
|
||||
}
|
||||
|
||||
pub struct TestSegmentCollector {
|
||||
offset: DocId,
|
||||
segment_max_doc: DocId,
|
||||
docs: Vec<DocId>,
|
||||
scores: Vec<Score>,
|
||||
}
|
||||
@@ -122,8 +244,7 @@ pub mod tests {
|
||||
impl Default for TestCollector {
|
||||
fn default() -> TestCollector {
|
||||
TestCollector {
|
||||
offset: 0,
|
||||
segment_max_doc: 0,
|
||||
next_offset: 0,
|
||||
docs: Vec::new(),
|
||||
scores: Vec::new(),
|
||||
}
|
||||
@@ -131,19 +252,33 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
Ok(())
|
||||
type Child = TestSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<TestSegmentCollector> {
|
||||
let offset = self.next_offset;
|
||||
self.next_offset += reader.max_doc();
|
||||
Ok(TestSegmentCollector {
|
||||
offset,
|
||||
docs: Vec::new(),
|
||||
scores: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TestSegmentCollector {
|
||||
type CollectionResult = Vec<TestSegmentCollector>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.docs.push(doc + self.offset);
|
||||
self.scores.push(score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
fn finalize(self) -> Vec<TestSegmentCollector> {
|
||||
vec![self]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,17 +287,26 @@ pub mod tests {
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u64>,
|
||||
next_counter: usize,
|
||||
field: Field,
|
||||
ff_reader: Option<FastFieldReader<u64>>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct FastFieldSegmentCollectorState {
|
||||
counter: usize,
|
||||
vals: Vec<u64>,
|
||||
}
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
state: FastFieldSegmentCollectorState,
|
||||
reader: FastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
next_counter: 0,
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,20 +316,96 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
type Child = FastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FastFieldSegmentCollector> {
|
||||
let counter = self.next_counter;
|
||||
self.next_counter += 1;
|
||||
Ok(FastFieldSegmentCollector {
|
||||
state: FastFieldSegmentCollectorState::default(),
|
||||
reader: reader.fast_field_reader(self.field)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for FastFieldSegmentCollector {
|
||||
type CollectionResult = Vec<FastFieldSegmentCollectorState>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.reader.get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
fn finalize(self) -> Vec<FastFieldSegmentCollectorState> {
|
||||
vec![self.state]
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
vals: Vec<u8>,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
pub struct BytesFastFieldSegmentCollector {
|
||||
vals: Vec<u8>,
|
||||
reader: BytesFastFieldReader,
|
||||
}
|
||||
|
||||
impl BytesFastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
BytesFastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self) -> Vec<u8> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
type Child = BytesFastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<BytesFastFieldSegmentCollector> {
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type CollectionResult = Vec<Vec<u8>>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let val = self.reader.get_val(doc);
|
||||
self.vals.extend(val);
|
||||
}
|
||||
|
||||
fn finalize(self) -> Vec<Vec<u8>> {
|
||||
vec![self.vals]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use collector::{Collector, CountCollector};
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -1,67 +1,122 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use DocId;
|
||||
use Score;
|
||||
use Result;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use downcast::Downcast;
|
||||
|
||||
/// Multicollector makes it possible to collect on more than one collector.
|
||||
/// It should only be used for use cases where the Collector types is unknown
|
||||
/// at compile time.
|
||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||
pub struct MultiCollector<'a> {
|
||||
collectors: Vec<&'a mut Collector>,
|
||||
collector_wrappers: Vec<Box<UntypedCollector + 'a>>
|
||||
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
/// Constructor
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector { collectors }
|
||||
pub fn new() -> MultiCollector<'a> {
|
||||
MultiCollector {
|
||||
collector_wrappers: Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_collector<TCollector: 'a + Collector>(&mut self, collector: &'a mut TCollector) {
|
||||
let collector_wrapper = CollectorWrapper(collector);
|
||||
self.collector_wrappers.push(Box::new(collector_wrapper));
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
collector.set_segment(segment_local_id, segment)?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
type Child = MultiCollectorChild;
|
||||
|
||||
fn for_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<MultiCollectorChild> {
|
||||
let children = self.collector_wrappers
|
||||
.iter_mut()
|
||||
.map(|collector_wrapper| {
|
||||
collector_wrapper.for_segment(segment_local_id, segment)
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
Ok(MultiCollectorChild {
|
||||
children
|
||||
})
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
for collector in &mut self.collectors {
|
||||
collector.collect(doc, score);
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector_wrappers
|
||||
.iter()
|
||||
.any(|c| c.requires_scoring())
|
||||
}
|
||||
|
||||
fn merge_children(&mut self, children: Vec<MultiCollectorChild>) {
|
||||
let mut per_collector_children: Vec<Vec<Box<SegmentCollector>>> =
|
||||
(0..self.collector_wrappers.len())
|
||||
.map(|_| Vec::with_capacity(children.len()))
|
||||
.collect::<Vec<_>>();
|
||||
for child in children {
|
||||
for (idx, segment_collector) in child.children.into_iter().enumerate() {
|
||||
per_collector_children[idx].push(segment_collector);
|
||||
}
|
||||
}
|
||||
for (collector, children) in self.collector_wrappers.iter_mut().zip(per_collector_children) {
|
||||
collector.merge_children_anys(children);
|
||||
}
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collectors
|
||||
.iter()
|
||||
.any(|collector| collector.requires_scoring())
|
||||
|
||||
}
|
||||
|
||||
pub struct MultiCollectorChild {
|
||||
children: Vec<Box<SegmentCollector>>
|
||||
}
|
||||
|
||||
impl SegmentCollector for MultiCollectorChild {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
for child in &mut self.children {
|
||||
child.collect(doc, score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
use schema::{TEXT, SchemaBuilder};
|
||||
use query::TermQuery;
|
||||
use Index;
|
||||
use Term;
|
||||
use schema::IndexRecordOption;
|
||||
|
||||
#[test]
|
||||
fn test_multi_collector() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(text=>""));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_text(text, "abc");
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
let mut collectors = MultiCollector::new();
|
||||
collectors.add_collector(&mut top_collector);
|
||||
collectors.add_collector(&mut count_collector);
|
||||
collectors.search(&*searcher, &query).unwrap();
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
assert_eq!(count_collector.count(), 5);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
use super::Collector;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use collector::SegmentCollector;
|
||||
use collector::Combinable;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
#[derive(Clone, Copy)]
|
||||
@@ -99,11 +101,34 @@ impl TopCollector {
|
||||
}
|
||||
|
||||
impl Collector for TopCollector {
|
||||
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
self.segment_id = segment_id;
|
||||
Ok(())
|
||||
type Child = TopCollector;
|
||||
|
||||
fn for_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<TopCollector> {
|
||||
Ok(TopCollector {
|
||||
limit: self.limit,
|
||||
heap: BinaryHeap::new(),
|
||||
segment_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl Combinable for TopCollector {
|
||||
// TODO: I think this could be a bit better
|
||||
fn combine_into(&mut self, other: Self) {
|
||||
self.segment_id = other.segment_id;
|
||||
while let Some(doc) = other.heap.pop() {
|
||||
self.collect(doc.doc_address.doc(), doc.score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TopCollector {
|
||||
type CollectionResult = TopCollector;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
@@ -126,8 +151,8 @@ impl Collector for TopCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
fn finalize(self) -> TopCollector {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,7 +162,6 @@ mod tests {
|
||||
use super::*;
|
||||
use DocId;
|
||||
use Score;
|
||||
use collector::Collector;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use common::serialize::BinarySerializable;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
@@ -106,7 +106,8 @@ where
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
} else {
|
||||
@@ -141,7 +142,8 @@ where
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
|
||||
|
||||
@@ -202,15 +202,14 @@ impl BitSet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate test;
|
||||
use tests;
|
||||
use std::collections::HashSet;
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use tests::generate_nonunique_unsorted;
|
||||
use std::collections::BTreeSet;
|
||||
use query::BitSetDocSet;
|
||||
use docset::DocSet;
|
||||
use query::BitSetDocSet;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashSet;
|
||||
use tests;
|
||||
use tests::generate_nonunique_unsorted;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
@@ -353,6 +352,14 @@ mod tests {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use test;
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
@@ -385,5 +392,4 @@ mod tests {
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use std::io::Write;
|
||||
use common::CountingWriter;
|
||||
use std::collections::HashMap;
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use directory::WritePtr;
|
||||
use std::io::{self, Read};
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::io::{self, Read};
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||
pub struct FileAddr {
|
||||
@@ -30,10 +30,7 @@ impl BinarySerializable for FileAddr {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let field = Field::deserialize(reader)?;
|
||||
let idx = VInt::deserialize(reader)?.0 as usize;
|
||||
Ok(FileAddr {
|
||||
field,
|
||||
idx,
|
||||
})
|
||||
Ok(FileAddr { field, idx })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,7 +163,7 @@ impl CompositeFile {
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
|
||||
self.offsets_index
|
||||
.get(&FileAddr { field, idx, })
|
||||
.get(&FileAddr { field, idx })
|
||||
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
}
|
||||
@@ -174,12 +171,12 @@ impl CompositeFile {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::io::Write;
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use common::BinarySerializable;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
mod serialize;
|
||||
mod vint;
|
||||
mod counting_writer;
|
||||
mod composite_file;
|
||||
pub mod bitpacker;
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
mod counting_writer;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::VInt;
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::VInt;
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
@@ -104,8 +104,8 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use common::Endianness;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::io;
|
||||
use common::VInt;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
@@ -135,8 +135,8 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use common::VInt;
|
||||
use super::*;
|
||||
use common::VInt;
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::BinarySerializable;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
|
||||
@@ -8,10 +8,8 @@ const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
|
||||
/// Returns the size in bytes of a compressed block, given `num_bits`.
|
||||
pub fn compressed_block_size(num_bits: u8) -> usize {
|
||||
1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
|
||||
@@ -35,19 +33,21 @@ impl BlockEncoder {
|
||||
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
|
||||
let written_size =
|
||||
1 + self.bitpacker
|
||||
.compress_sorted(offset, block, &mut self.output[1..], num_bits);
|
||||
&self.output[..written_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
|
||||
let num_bits = self.bitpacker.num_bits(block);
|
||||
self.output[0] = num_bits;
|
||||
let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
|
||||
let written_size = 1 + self.bitpacker
|
||||
.compress(block, &mut self.output[1..], num_bits);
|
||||
&self.output[..written_size]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
|
||||
@@ -68,17 +68,23 @@ impl BlockDecoder {
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
|
||||
1 + self.bitpacker.decompress_sorted(
|
||||
offset,
|
||||
&compressed_data[1..],
|
||||
&mut self.output,
|
||||
num_bits,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
|
||||
1 + self.bitpacker
|
||||
.decompress(&compressed_data[1..], &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -174,8 +180,6 @@ impl VIntDecoder for BlockDecoder {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block() {
|
||||
@@ -264,11 +268,34 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::*;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use test::Bencher;
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_block_sorted(&data, 0u32);
|
||||
});
|
||||
@@ -277,7 +304,7 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
@@ -304,7 +331,7 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
@@ -313,12 +340,11 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_uncompress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use compression::compressed_block_size;
|
||||
use compression::BlockDecoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
@@ -13,7 +13,7 @@ pub struct CompressedIntStream {
|
||||
buffer: SourceRead,
|
||||
|
||||
block_decoder: BlockDecoder,
|
||||
cached_addr: usize, // address of the currently decoded block
|
||||
cached_addr: usize, // address of the currently decoded block
|
||||
cached_next_addr: usize, // address following the currently decoded block
|
||||
|
||||
addr: usize, // address of the block associated to the current position
|
||||
@@ -42,7 +42,8 @@ impl CompressedIntStream {
|
||||
// no need to read.
|
||||
self.cached_next_addr
|
||||
} else {
|
||||
let next_addr = addr + self.block_decoder.uncompress_block_unsorted(self.buffer.slice_from(addr));
|
||||
let next_addr = addr + self.block_decoder
|
||||
.uncompress_block_unsorted(self.buffer.slice_from(addr));
|
||||
self.cached_addr = addr;
|
||||
self.cached_next_addr = next_addr;
|
||||
next_addr
|
||||
@@ -101,8 +102,8 @@ pub mod tests {
|
||||
|
||||
use super::CompressedIntStream;
|
||||
use compression::compressed_block_size;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::BlockEncoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
|
||||
@@ -1,34 +1,32 @@
|
||||
use Result;
|
||||
use core::SegmentId;
|
||||
use error::{ErrorKind, ResultExt};
|
||||
use serde_json;
|
||||
use schema::Schema;
|
||||
use std::sync::Arc;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use core::SegmentId;
|
||||
use std::sync::Arc;
|
||||
use Result;
|
||||
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
use super::pool::LeasedItem;
|
||||
use super::pool::Pool;
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::searcher::Searcher;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use core::META_FILEPATH;
|
||||
use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use core::searcher::Searcher;
|
||||
use std::convert::From;
|
||||
use num_cpus;
|
||||
use super::segment::Segment;
|
||||
use core::SegmentReader;
|
||||
use super::pool::Pool;
|
||||
use core::SegmentMeta;
|
||||
use super::pool::LeasedItem;
|
||||
use std::path::Path;
|
||||
use core::IndexMeta;
|
||||
use indexer::DirectoryLock;
|
||||
use IndexWriter;
|
||||
use directory::ManagedDirectory;
|
||||
use core::META_FILEPATH;
|
||||
use super::segment::create_segment;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::DirectoryLock;
|
||||
use num_cpus;
|
||||
use std::path::Path;
|
||||
use tokenizer::TokenizerManager;
|
||||
use IndexWriter;
|
||||
|
||||
const NUM_SEARCHERS: usize = 12;
|
||||
|
||||
@@ -65,7 +63,7 @@ impl Index {
|
||||
/// The index will use the `MMapDirectory`.
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
@@ -85,8 +83,7 @@ impl Index {
|
||||
///
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(test)]
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
@@ -106,6 +103,20 @@ impl Index {
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open_directory<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
Index::open_directory(mmap_directory)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
@@ -113,22 +124,6 @@ impl Index {
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature="mmap")]
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
@@ -231,8 +226,9 @@ impl Index {
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?;
|
||||
let schema = self.schema();
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use schema::Schema;
|
||||
use core::SegmentMeta;
|
||||
use std::fmt;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -45,9 +45,9 @@ impl fmt::Debug for IndexMeta {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json;
|
||||
use super::IndexMeta;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use common::BinarySerializable;
|
||||
use compression::CompressedIntStream;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use postings::FreqReadingOption;
|
||||
use postings::TermInfo;
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use schema::FieldType;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
use common::BinarySerializable;
|
||||
use schema::FieldType;
|
||||
use termdict::TermDictionary;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -23,16 +23,16 @@ use schema::FieldType;
|
||||
/// `InvertedIndexReader` are created by calling
|
||||
/// the `SegmentReader`'s [`.inverted_index(...)`] method
|
||||
pub struct InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl,
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64
|
||||
total_num_tokens: u64,
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionaryImpl,
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
@@ -45,7 +45,7 @@ impl InvertedIndexReader {
|
||||
postings_source: postings_source.slice_from(8),
|
||||
positions_source,
|
||||
record_option,
|
||||
total_num_tokens
|
||||
total_num_tokens,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,11 +56,11 @@ impl InvertedIndexReader {
|
||||
.get_index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::empty(field_type),
|
||||
termdict: TermDictionary::empty(field_type),
|
||||
postings_source: ReadOnlySource::empty(),
|
||||
positions_source: ReadOnlySource::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64
|
||||
total_num_tokens: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ impl InvertedIndexReader {
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn terms(&self) -> &TermDictionaryImpl {
|
||||
pub fn terms(&self) -> &TermDictionary {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
@@ -149,8 +149,6 @@ impl InvertedIndexReader {
|
||||
self.total_num_tokens
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
@@ -166,12 +164,15 @@ impl InvertedIndexReader {
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
pub(crate) fn read_postings_no_deletes(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.get_term_info(term)
|
||||
@@ -179,6 +180,3 @@ impl InvertedIndexReader {
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,24 +1,24 @@
|
||||
pub mod searcher;
|
||||
pub mod index;
|
||||
mod segment_reader;
|
||||
mod segment_id;
|
||||
mod segment_component;
|
||||
mod segment;
|
||||
mod index_meta;
|
||||
mod pool;
|
||||
mod segment_meta;
|
||||
mod inverted_index_reader;
|
||||
mod pool;
|
||||
pub mod searcher;
|
||||
mod segment;
|
||||
mod segment_component;
|
||||
mod segment_id;
|
||||
mod segment_meta;
|
||||
mod segment_reader;
|
||||
|
||||
pub use self::index::Index;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
pub use self::segment::Segment;
|
||||
pub use self::segment::SerializableSegment;
|
||||
pub use self::index::Index;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_meta::SegmentMeta;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use crossbeam::sync::MsQueue;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use crossbeam::sync::MsQueue;
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct GenerationItem<T> {
|
||||
@@ -114,8 +114,8 @@ impl<T> Drop for LeasedItem<T> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::iter;
|
||||
use super::Pool;
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_pool() {
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
use Result;
|
||||
use core::SegmentReader;
|
||||
use schema::Document;
|
||||
use collector::Collector;
|
||||
use query::Query;
|
||||
use DocAddress;
|
||||
use schema::{Field, Term};
|
||||
use termdict::{TermDictionary, TermMerger};
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
use core::SegmentReader;
|
||||
use query::Query;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::{Field, Term};
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use termdict::TermMerger;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
@@ -16,10 +17,18 @@ use core::InvertedIndexReader;
|
||||
/// the destruction of the `Searcher`.
|
||||
///
|
||||
pub struct Searcher {
|
||||
schema: Schema,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
schema,
|
||||
segment_readers,
|
||||
}
|
||||
}
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
@@ -30,6 +39,11 @@ impl Searcher {
|
||||
segment_reader.doc(doc_id)
|
||||
}
|
||||
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the index.
|
||||
pub fn num_docs(&self) -> u64 {
|
||||
self.segment_readers
|
||||
@@ -59,7 +73,7 @@ impl Searcher {
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
|
||||
query.search(self, collector)
|
||||
collector.search(self, query)
|
||||
}
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
@@ -92,12 +106,6 @@ impl FieldSearcher {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher { segment_readers }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let segment_ids = self.segment_readers
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use Result;
|
||||
use std::path::PathBuf;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
use core::SegmentId;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use super::SegmentComponent;
|
||||
use core::Index;
|
||||
use std::result;
|
||||
use directory::Directory;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use directory::error::{OpenReadError, OpenWriteError};
|
||||
use directory::Directory;
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::result;
|
||||
use Result;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
@@ -111,8 +111,8 @@ mod tests {
|
||||
|
||||
use core::SegmentComponent;
|
||||
use directory::Directory;
|
||||
use std::collections::HashSet;
|
||||
use schema::SchemaBuilder;
|
||||
use std::collections::HashSet;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::slice;
|
||||
|
||||
/// Enum describing each component of a tantivy segment.
|
||||
/// Each component is stored in its own file,
|
||||
/// using the pattern `segment_uuid`.`component_extension`,
|
||||
@@ -26,7 +28,7 @@ pub enum SegmentComponent {
|
||||
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use uuid::Uuid;
|
||||
use std::fmt;
|
||||
use std::cmp::{Ord, Ordering};
|
||||
use std::fmt;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentId;
|
||||
use super::SegmentComponent;
|
||||
use std::path::PathBuf;
|
||||
use core::SegmentId;
|
||||
use std::collections::HashSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
|
||||
@@ -1,31 +1,30 @@
|
||||
use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentComponent;
|
||||
use std::sync::RwLock;
|
||||
use common::HasLen;
|
||||
use core::SegmentMeta;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
use store::StoreReader;
|
||||
use schema::Document;
|
||||
use DocId;
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
use common::CompositeFile;
|
||||
use std::fmt;
|
||||
use common::HasLen;
|
||||
use core::InvertedIndexReader;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use error::ErrorKind;
|
||||
use termdict::TermDictionaryImpl;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Schema;
|
||||
use termdict::TermDictionary;
|
||||
use fastfield::{FastValue, MultiValueIntFastFieldReader};
|
||||
use schema::Cardinality;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use schema::Document;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use store::StoreReader;
|
||||
use termdict::TermDictionary;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -76,6 +75,11 @@ impl SegmentReader {
|
||||
self.segment_meta.num_docs()
|
||||
}
|
||||
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
@@ -105,12 +109,25 @@ impl SegmentReader {
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
@@ -123,21 +140,32 @@ impl SegmentReader {
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let vals_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
@@ -157,7 +185,7 @@ impl SegmentReader {
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let termdict = TermDictionaryImpl::from_source(termdict_source);
|
||||
let termdict = TermDictionary::from_source(termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
}
|
||||
@@ -171,12 +199,14 @@ impl SegmentReader {
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
|
||||
if let Some(fieldnorm_source) = self.fieldnorms_composite
|
||||
.open_read(field) {
|
||||
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
|
||||
FieldNormReader::open(fieldnorm_source)
|
||||
} else {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg= format!("Field norm not found for field {:?}. Was it market as indexed during indexing.", field_name);
|
||||
let err_msg = format!(
|
||||
"Field norm not found for field {:?}. Was it market as indexed during indexing.",
|
||||
field_name
|
||||
);
|
||||
panic!(err_msg);
|
||||
}
|
||||
}
|
||||
@@ -211,13 +241,12 @@ impl SegmentReader {
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset_opt =
|
||||
if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
@@ -281,7 +310,7 @@ impl SegmentReader {
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionaryImpl::from_source(termdict_source),
|
||||
TermDictionary::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
record_option,
|
||||
@@ -323,6 +352,11 @@ impl SegmentReader {
|
||||
.map(|delete_set| delete_set.is_deleted(doc))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
@@ -330,3 +364,90 @@ impl fmt::Debug for SegmentReader {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements the iterator trait to allow easy iteration
|
||||
/// over non-deleted ("alive") DocIds in a SegmentReader
|
||||
pub struct SegmentReaderAliveDocsIterator<'a> {
|
||||
reader: &'a SegmentReader,
|
||||
max_doc: DocId,
|
||||
current: DocId,
|
||||
}
|
||||
|
||||
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
||||
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
||||
SegmentReaderAliveDocsIterator {
|
||||
reader: reader,
|
||||
max_doc: reader.max_doc(),
|
||||
current: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
||||
type Item = DocId;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
|
||||
// find the next alive doc id
|
||||
while self.reader.is_deleted(self.current) {
|
||||
self.current += 1;
|
||||
|
||||
if self.current >= self.max_doc {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// capture the current alive DocId
|
||||
let result = Some(self.current);
|
||||
|
||||
// move down the chain
|
||||
self.current += 1;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use core::Index;
|
||||
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let name = schema.get_field("name").unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(name => "tantivy"));
|
||||
index_writer.add_document(doc!(name => "horse"));
|
||||
index_writer.add_document(doc!(name => "jockey"));
|
||||
index_writer.add_document(doc!(name => "cap"));
|
||||
|
||||
// we should now have one segment with two docs
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
let mut index_writer2 = index.writer(50_000_000).unwrap();
|
||||
index_writer2.delete_term(Term::from_field_text(name, "horse"));
|
||||
index_writer2.delete_term(Term::from_field_text(name, "cap"));
|
||||
|
||||
// ok, now we should have a deleted doc
|
||||
index_writer2.commit().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
|
||||
assert_eq!(vec![0u32, 2u32], docs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
mod skiplist_builder;
|
||||
mod skiplist;
|
||||
mod skiplist_builder;
|
||||
|
||||
pub use self::skiplist_builder::SkipListBuilder;
|
||||
pub use self::skiplist::SkipList;
|
||||
pub use self::skiplist_builder::SkipListBuilder;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use common::{BinarySerializable, VInt};
|
||||
use std::marker::PhantomData;
|
||||
use std::cmp::max;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
static EMPTY: [u8; 0] = [];
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::io::Write;
|
||||
use common::{BinarySerializable, VInt, is_power_of_2};
|
||||
use std::marker::PhantomData;
|
||||
use common::{is_power_of_2, BinarySerializable, VInt};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
struct LayerBuilder<T: BinarySerializable> {
|
||||
period_mask: usize,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::mem;
|
||||
use super::heap::{Heap, HeapAllocable};
|
||||
use std::mem;
|
||||
|
||||
#[inline]
|
||||
pub fn is_power_of_2(val: u32) -> bool {
|
||||
@@ -99,12 +99,8 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::Heap;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
@@ -124,6 +120,17 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::ExpUnrolledLinkedList;
|
||||
use super::Heap;
|
||||
use test::Bencher;
|
||||
|
||||
const NUM_STACK: usize = 10_000;
|
||||
const STACK_SIZE: u32 = 1000;
|
||||
|
||||
#[bench]
|
||||
fn bench_push_vec(bench: &mut Bencher) {
|
||||
bench.iter(|| {
|
||||
|
||||
@@ -1,54 +1,53 @@
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
use postings::UnorderedTermId;
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use postings::UnorderedTermId;
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
use std::slice;
|
||||
|
||||
mod murmurhash2 {
|
||||
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
const M: u32 = 0x5bd1_e995;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let m: u32 = 0x5bd1_e995;
|
||||
let r = 24;
|
||||
let len = key.len() as u32;
|
||||
|
||||
let mut h: u32 = SEED ^ len;
|
||||
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { *key_ptr };
|
||||
k = k.wrapping_mul(m);
|
||||
k ^= k >> r;
|
||||
k = k.wrapping_mul(m);
|
||||
k = k.wrapping_mul(m);
|
||||
let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition
|
||||
k = k.wrapping_mul(M);
|
||||
k ^= k >> 24;
|
||||
k = k.wrapping_mul(M);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining = len & 3;
|
||||
let key_ptr_u8: *const u8 = key_ptr as *const u8;
|
||||
match remaining {
|
||||
let remaining: &[u8] = &key[key.len() & !3..];
|
||||
match remaining.len() {
|
||||
3 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
h ^= u32::from(remaining[2]) << 16;
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
2 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
h ^= u32::from(remaining[1]) << 8;
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
1 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
h ^= u32::from(remaining[0]);
|
||||
h = h.wrapping_mul(M);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(m);
|
||||
h = h.wrapping_mul(M);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
}
|
||||
@@ -117,11 +116,7 @@ struct QuadraticProbing {
|
||||
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing {
|
||||
hash,
|
||||
i: 0,
|
||||
mask,
|
||||
}
|
||||
QuadraticProbing { hash, i: 0, mask }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -131,6 +126,23 @@ impl QuadraticProbing {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a: 'b, 'b> {
|
||||
hashmap: &'b TermHashMap<'a>,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a, 'b> Iterator for Iter<'a, 'b> {
|
||||
type Item = (&'b [u8], u32, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'b [u8], u32) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermHashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
@@ -161,16 +173,16 @@ impl<'a> TermHashMap<'a> {
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr, hash
|
||||
key_value_addr,
|
||||
hash,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
|
||||
self.occupied.iter().cloned().map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
let (key, offset) = self.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
})
|
||||
pub fn iter<'b: 'a>(&'b self) -> Iter<'a, 'b> {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: &self,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
|
||||
@@ -202,15 +214,32 @@ impl<'a> TermHashMap<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash2(b: &mut Bencher) {
|
||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
let mut s = 0;
|
||||
for &key in &keys {
|
||||
s ^= murmurhash2(key.as_bytes());
|
||||
}
|
||||
s
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::{Heap, HeapAllocable};
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
use std::collections::HashSet;
|
||||
use super::split_memory;
|
||||
use super::*;
|
||||
use std::collections::HashSet;
|
||||
|
||||
struct TestValue {
|
||||
val: u32,
|
||||
@@ -281,6 +310,17 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_against_reference_impl() {
|
||||
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
|
||||
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
|
||||
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
|
||||
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
|
||||
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
|
||||
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
|
||||
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
@@ -292,18 +332,4 @@ mod tests {
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_murmurhash_2(b: &mut Bencher) {
|
||||
let keys: Vec<&'static str> =
|
||||
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
///
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
mod expull;
|
||||
pub(crate) mod hashmap;
|
||||
mod heap;
|
||||
mod expull;
|
||||
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::hashmap::TermHashMap;
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
|
||||
#[test]
|
||||
fn test_unrolled_linked_list() {
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::marker::Send;
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::marker::Send;
|
||||
use std::marker::Sync;
|
||||
use std::path::Path;
|
||||
use std::result;
|
||||
|
||||
/// Write-once read many (WORM) abstraction for where
|
||||
/// tantivy's data should be stored.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::error::Error as StdError;
|
||||
use std::path::PathBuf;
|
||||
use std::io;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// General IO error with an optional path to the offending file.
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
use serde_json;
|
||||
use core::MANAGED_FILEPATH;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::io;
|
||||
use Directory;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::io::Write;
|
||||
use core::MANAGED_FILEPATH;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use error::{ErrorKind, Result, ResultExt};
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use Directory;
|
||||
|
||||
/// Wrapper of directories that keeps track of files created by Tantivy.
|
||||
///
|
||||
@@ -86,7 +86,7 @@ impl ManagedDirectory {
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default(),
|
||||
@@ -94,7 +94,7 @@ impl ManagedDirectory {
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::default(),
|
||||
}),
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
@@ -265,7 +265,7 @@ impl Directory for ManagedDirectory {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
box self.clone()
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -282,10 +282,10 @@ impl Clone for ManagedDirectory {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use std::path::Path;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
|
||||
lazy_static! {
|
||||
@@ -294,7 +294,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
@@ -343,7 +343,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap ")]
|
||||
#[cfg(feature = "mmap ")]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
@@ -373,7 +373,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory_protect() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
use atomicwrites;
|
||||
use common::make_io_err;
|
||||
use directory::Directory;
|
||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use directory::ReadOnlySource;
|
||||
use directory::shared_vec_slice::SharedVecSlice;
|
||||
use directory::Directory;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use std::collections::hash_map::Entry as HashMapEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File};
|
||||
use std::fs::OpenOptions;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -40,9 +40,11 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
// instead.
|
||||
return Ok(None);
|
||||
}
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
unsafe {
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
|
||||
@@ -4,32 +4,29 @@ WORM directory abstraction.
|
||||
|
||||
*/
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_directory;
|
||||
|
||||
mod ram_directory;
|
||||
mod directory;
|
||||
mod managed_directory;
|
||||
mod ram_directory;
|
||||
mod read_only_source;
|
||||
mod shared_vec_slice;
|
||||
mod managed_directory;
|
||||
mod static_directory;
|
||||
|
||||
/// Errors specific to the directory module.
|
||||
pub mod error;
|
||||
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
|
||||
pub use self::static_directory::StaticDirectory;
|
||||
pub use self::static_directory::write_static_from_directory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub use self::directory::Directory;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
|
||||
/// Synonym of Seek + Write
|
||||
pub trait SeekableWrite: Seek + Write {}
|
||||
@@ -45,8 +42,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use std::path::Path;
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::Path;
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||
@@ -59,7 +56,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_mmap_directory() {
|
||||
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
test_directory(&mut mmap_directory);
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::make_io_err;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::WritePtr;
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use common::make_io_err;
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::WritePtr;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
|
||||
/// Writer associated with the `RAMDirectory`
|
||||
///
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
#[cfg(feature="mmap")]
|
||||
use fst::raw::MmapReadOnly;
|
||||
use std::ops::Deref;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::HasLen;
|
||||
use std::slice;
|
||||
use std::io::{self, Read};
|
||||
#[cfg(feature = "mmap")]
|
||||
use fst::raw::MmapReadOnly;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
|
||||
const EMPTY_SLICE: [u8; 0] = [];
|
||||
use std::io::{self, Read};
|
||||
use std::ops::Deref;
|
||||
use std::slice;
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
@@ -17,12 +15,10 @@ const EMPTY_SLICE: [u8; 0] = [];
|
||||
/// hold by this object should never be altered or destroyed.
|
||||
pub enum ReadOnlySource {
|
||||
/// Mmap source of data
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
Mmap(MmapReadOnly),
|
||||
/// Wrapping a `Vec<u8>`
|
||||
Anonymous(SharedVecSlice),
|
||||
/// Wrapping a static slice
|
||||
Static(&'static [u8])
|
||||
}
|
||||
|
||||
unsafe impl StableDeref for ReadOnlySource {}
|
||||
@@ -39,16 +35,15 @@ impl Deref for ReadOnlySource {
|
||||
impl ReadOnlySource {
|
||||
/// Creates an empty ReadOnlySource
|
||||
pub fn empty() -> ReadOnlySource {
|
||||
ReadOnlySource::Static(&EMPTY_SLICE)
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::empty())
|
||||
}
|
||||
|
||||
/// Returns the data underlying the ReadOnlySource object.
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
match *self {
|
||||
#[cfg(feature="mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
ReadOnlySource::Static(data) => data,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,9 +66,14 @@ impl ReadOnlySource {
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// are retained in memory.
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
|
||||
assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
|
||||
assert!(
|
||||
from_offset <= to_offset,
|
||||
"Requested negative slice [{}..{}]",
|
||||
from_offset,
|
||||
to_offset
|
||||
);
|
||||
match *self {
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => {
|
||||
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
|
||||
ReadOnlySource::Mmap(sliced_mmap)
|
||||
@@ -81,9 +81,6 @@ impl ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
|
||||
}
|
||||
ReadOnlySource::Static(data) => {
|
||||
ReadOnlySource::Static(&data[from_offset..to_offset])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,12 +121,6 @@ impl From<Vec<u8>> for ReadOnlySource {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&'static [u8]> for ReadOnlySource {
|
||||
fn from(data: &'static [u8]) -> ReadOnlySource {
|
||||
ReadOnlySource::Static(data)
|
||||
}
|
||||
}
|
||||
|
||||
/// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
|
||||
pub(crate) struct SourceRead {
|
||||
_data_owner: ReadOnlySource,
|
||||
@@ -144,13 +135,11 @@ impl SourceRead {
|
||||
|
||||
pub fn slice_from(&self, start: usize) -> &[u8] {
|
||||
&self.cursor[start..]
|
||||
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: usize) -> u8 {
|
||||
self.cursor[idx]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SourceRead {
|
||||
|
||||
@@ -1,123 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use Directory;
|
||||
use std::path::PathBuf;
|
||||
use directory::ReadOnlySource;
|
||||
use std::io::BufWriter;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use std::path::Path;
|
||||
use std::fmt::{Formatter, Debug, self};
|
||||
use Result as TantivyResult;
|
||||
use directory::SeekableWrite;
|
||||
use std::io;
|
||||
use std::fs;
|
||||
use common::Endianness;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use byteorder::ByteOrder;
|
||||
use std::str;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Write};
|
||||
use std::ffi::OsString;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StaticDirectory {
|
||||
files: HashMap<PathBuf, &'static [u8]>,
|
||||
}
|
||||
|
||||
impl Debug for StaticDirectory {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
write!(f, "StaticDirectory[{} files]", self.files.len())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl StaticDirectory {
|
||||
pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
|
||||
assert!(data.len() > 8);
|
||||
let footer_len_offset = data.len() - 8;
|
||||
let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
|
||||
let mut body = &data[..body_len];
|
||||
let mut footer = &data[body_len..footer_len_offset];
|
||||
let num_files = VInt::deserialize(&mut footer)?.0 as usize;
|
||||
let mut files = HashMap::new();
|
||||
for _ in 0..num_files {
|
||||
let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
|
||||
let filename = &footer[..filename_len];
|
||||
footer = &footer[filename_len..];
|
||||
let data_len = VInt::deserialize(&mut footer)?.0 as usize;
|
||||
let file_data = &body[..data_len];
|
||||
body = &body[data_len..];
|
||||
let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
|
||||
let filename = PathBuf::from(filename_str);
|
||||
println!("{:?} {:?}", filename, data_len);
|
||||
files.insert(filename, file_data);
|
||||
}
|
||||
Ok(StaticDirectory {
|
||||
files
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for StaticDirectory {
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
if let Some(static_data) = self.files.get(path) {
|
||||
Ok(ReadOnlySource::from(*static_data))
|
||||
} else {
|
||||
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
|
||||
unimplemented!("Static directory is read-only !")
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.files.contains_key(path)
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
|
||||
unimplemented!("Static directory is read-only !")
|
||||
}
|
||||
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||
if let Some(static_data) = self.files.get(path) {
|
||||
Ok(static_data.to_vec())
|
||||
} else {
|
||||
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
unimplemented!("Static directory is read-only !")
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
box self.clone()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
|
||||
assert!(directory_path.is_dir());
|
||||
let mut file_data: Vec<(OsString, usize)> = Vec::new();
|
||||
let mut write: Vec<u8> = Vec::new();
|
||||
for entry in fs::read_dir(directory_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_file() {
|
||||
info!("Appending {}", path.to_string_lossy());
|
||||
let mut open_file = File::open(&path)?;
|
||||
let file_len = open_file.read_to_end(&mut write)?;
|
||||
file_data.push((entry.file_name(), file_len));
|
||||
}
|
||||
}
|
||||
// write footer
|
||||
let body_len = write.len();
|
||||
VInt(file_data.len() as u64).serialize(&mut write)?;
|
||||
for (filename, filelen) in file_data {
|
||||
VInt(filename.len() as u64).serialize(&mut write)?;
|
||||
write.write_all(filename.to_string_lossy().as_bytes())?;
|
||||
VInt(filelen as u64).serialize(&mut write)?;
|
||||
}
|
||||
(body_len as u64).serialize(&mut write)?;
|
||||
Ok(write)
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
use DocId;
|
||||
use common::BitSet;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
use common::BitSet;
|
||||
use DocId;
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
|
||||
14
src/error.rs
14
src/error.rs
@@ -2,13 +2,13 @@
|
||||
|
||||
use std::io;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use query;
|
||||
use schema;
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use serde_json;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
|
||||
error_chain!(
|
||||
errors {
|
||||
@@ -48,10 +48,10 @@ error_chain!(
|
||||
description("an error occurred in a thread")
|
||||
display("an error occurred in a thread: '{}'", err)
|
||||
}
|
||||
/// An Error appeared related to the lack of a field.
|
||||
SchemaError(field: String) {
|
||||
description("a schema field is missing")
|
||||
display("a schema field is missing: '{}'", field)
|
||||
/// An Error appeared related to the schema.
|
||||
SchemaError(message: String) {
|
||||
description("the schema is not matching expectations.")
|
||||
display("Schema error: '{}'", message)
|
||||
}
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
FastFieldError(err: FastFieldNotAvailableError) {
|
||||
|
||||
38
src/fastfield/bytes/mod.rs
Normal file
38
src/fastfield/bytes/mod.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use self::reader::BytesFastFieldReader;
|
||||
pub use self::writer::BytesFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use schema::SchemaBuilder;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_bytes() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_bytes_field("bytesfield");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
|
||||
index_writer.add_document(doc!(field=>vec![]));
|
||||
index_writer.add_document(doc!(field=>vec![255u8]));
|
||||
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
|
||||
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
|
||||
|
||||
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_val(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_val(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
|
||||
let long = vec![0u8; 1000];
|
||||
assert_eq!(bytes_reader.get_val(4), long.as_slice());
|
||||
}
|
||||
}
|
||||
37
src/fastfield/bytes/reader.rs
Normal file
37
src/fastfield/bytes/reader.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use owning_ref::OwningRef;
|
||||
|
||||
use directory::ReadOnlySource;
|
||||
use fastfield::FastFieldReader;
|
||||
use DocId;
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
///
|
||||
/// The reader is implemented as a `u64` fast field and a separate collection of bytes.
|
||||
///
|
||||
/// The `vals_reader` will access the concatenated list of all values for all documents.
|
||||
///
|
||||
/// The `idx_reader` associates, for each document, the index of its first value.
|
||||
///
|
||||
/// Reading the value for a document is done by reading the start index for it,
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values: OwningRef<ReadOnlySource, [u8]>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldReader {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values_source: ReadOnlySource,
|
||||
) -> BytesFastFieldReader {
|
||||
let values = OwningRef::new(values_source).map(|source| &source[..]);
|
||||
BytesFastFieldReader { idx_reader, values }
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_val(&self, doc: DocId) -> &[u8] {
|
||||
let start = self.idx_reader.get(doc) as usize;
|
||||
let stop = self.idx_reader.get(doc + 1) as usize;
|
||||
&self.values[start..stop]
|
||||
}
|
||||
}
|
||||
96
src/fastfield/bytes/writer.rs
Normal file
96
src/fastfield/bytes/writer.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use std::io;
|
||||
|
||||
use fastfield::serializer::FastFieldSerializer;
|
||||
use schema::{Document, Field, Value};
|
||||
use DocId;
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
/// This `BytesFastFieldWriter` is only useful for advanced user.
|
||||
/// The normal way to get your associated bytes in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::SingleValue`
|
||||
/// in your schema
|
||||
/// - add your document simply by calling `.add_document(...)` with associating bytes to the field.
|
||||
///
|
||||
/// The `BytesFastFieldWriter` can be acquired from the
|
||||
/// fast field writer by calling
|
||||
/// [`.get_bytes_writer(...)`](./struct.FastFieldsWriter.html#method.get_bytes_writer).
|
||||
///
|
||||
/// Once acquired, writing is done by calling `.add_document_val(&[u8])`
|
||||
/// once per document, even if there are no bytes associated to it.
|
||||
pub struct BytesFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u8>,
|
||||
doc_index: Vec<u64>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldWriter {
|
||||
/// Creates a new `BytesFastFieldWriter`
|
||||
pub fn new(field: Field) -> Self {
|
||||
BytesFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Access the field associated to the `BytesFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Finalize the current document.
|
||||
pub(crate) fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Shift to the next document and add all of the
|
||||
/// matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
if let &Value::Bytes(ref bytes) = field_value.value() {
|
||||
self.vals.extend_from_slice(bytes);
|
||||
} else {
|
||||
panic!(
|
||||
"Bytes field contained non-Bytes Value!. Field {:?} = {:?}",
|
||||
self.field, field_value
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Register the bytes associated to a document.
|
||||
///
|
||||
/// The method returns the `DocId` of the document that was
|
||||
/// just written.
|
||||
pub fn add_document_val(&mut self, val: &[u8]) -> DocId {
|
||||
let doc = self.doc_index.len() as DocId;
|
||||
self.next_doc();
|
||||
self.vals.extend_from_slice(val);
|
||||
doc
|
||||
}
|
||||
|
||||
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
for &offset in &self.doc_index {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
doc_index_serializer.close_field()?;
|
||||
}
|
||||
{
|
||||
// writing the values themselves
|
||||
let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1)?;
|
||||
value_serializer.write_all(&self.vals)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
use bit_set::BitSet;
|
||||
use directory::WritePtr;
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use common::HasLen;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use DocId;
|
||||
|
||||
/// Write a delete `BitSet`
|
||||
///
|
||||
@@ -62,10 +62,8 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
@@ -74,10 +72,10 @@ impl HasLen for DeleteBitSet {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::PathBuf;
|
||||
use super::*;
|
||||
use bit_set::BitSet;
|
||||
use directory::*;
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn test_delete_bitset_helper(bitset: &BitSet) {
|
||||
let test_path = PathBuf::from("test");
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::result;
|
||||
use schema::FieldEntry;
|
||||
use std::result;
|
||||
|
||||
/// `FastFieldNotAvailableError` is returned when the
|
||||
/// user requested for a fast field reader, and the field was not
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use DocId;
|
||||
use termdict::TermOrdinal;
|
||||
use schema::Facet;
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
/// facets associated to a given document in a specific
|
||||
@@ -19,7 +19,7 @@ use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
/// only makes sense for a given segment.
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
term_dict: TermDictionary,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
@@ -28,11 +28,11 @@ impl FacetReader {
|
||||
/// A facet reader just wraps :
|
||||
/// - a `MultiValueIntFastFieldReader` that makes it possible to
|
||||
/// access the list of facet ords for a given document.
|
||||
/// - a `TermDictionaryImpl` that helps associating a facet to
|
||||
/// - a `TermDictionary` that helps associating a facet to
|
||||
/// an ordinal and vice versa.
|
||||
pub fn new(
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
term_dict: TermDictionary,
|
||||
) -> FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
@@ -50,7 +50,7 @@ impl FacetReader {
|
||||
}
|
||||
|
||||
/// Accessor for the facet term dictionary.
|
||||
pub fn facet_dict(&self) -> &TermDictionaryImpl {
|
||||
pub fn facet_dict(&self) -> &TermDictionary {
|
||||
&self.term_dict
|
||||
}
|
||||
|
||||
|
||||
@@ -23,26 +23,28 @@ values stored.
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use common;
|
||||
use schema::Cardinality;
|
||||
use schema::FieldType;
|
||||
use schema::Value;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::MultiValueIntFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
|
||||
mod reader;
|
||||
mod writer;
|
||||
mod serializer;
|
||||
mod error;
|
||||
mod bytes;
|
||||
mod delete;
|
||||
mod error;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64 or i64).
|
||||
pub trait FastValue: Default + Clone + Copy {
|
||||
@@ -121,31 +123,27 @@ fn value_to_u64(value: &Value) -> u64 {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use schema::Document;
|
||||
use schema::FAST;
|
||||
use schema::Field;
|
||||
use schema::FAST;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use super::*;
|
||||
use test;
|
||||
use test::Bencher;
|
||||
|
||||
lazy_static! {
|
||||
static ref SCHEMA: Schema = {
|
||||
pub static ref SCHEMA: Schema = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
};
|
||||
static ref FIELD: Field = {
|
||||
SCHEMA.get_field("field").unwrap()
|
||||
};
|
||||
pub static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() };
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -369,7 +367,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_permutation() -> Vec<u64> {
|
||||
pub fn generate_permutation() -> Vec<u64> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng = XorShiftRng::from_seed(*seed);
|
||||
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
|
||||
@@ -409,13 +407,27 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::tests::FIELD;
|
||||
use super::tests::{generate_permutation, SCHEMA};
|
||||
use super::*;
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use test::{self, Bencher};
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
for i in (0u32..n / 7).map(|v| v * 7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
@@ -461,7 +473,7 @@ mod tests {
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
for i in (0u32..n / 7).map(|val| val * 7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
@@ -502,4 +514,5 @@ mod tests {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
mod writer;
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
pub use self::reader::MultiValueIntFastFieldReader;
|
||||
pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Cardinality;
|
||||
use schema::IntOptions;
|
||||
use schema::SchemaBuilder;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use DocId;
|
||||
use fastfield::{FastFieldReader, FastValue};
|
||||
use DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
///
|
||||
@@ -26,13 +26,20 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let stop = self.idx_reader.get(doc + 1);
|
||||
(start, stop)
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let start = self.idx_reader.get(doc) as u32;
|
||||
let stop = self.idx_reader.get(doc + 1) as u32;
|
||||
let (start, stop) = self.range(doc);
|
||||
let len = (stop - start) as usize;
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range(start, &mut vals[..]);
|
||||
self.vals_reader.get_range(start as u32, &mut vals[..]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,35 @@
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::serializer::FastSingleFieldSerializer;
|
||||
use fastfield::value_to_u64;
|
||||
use std::collections::HashMap;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use itertools::Itertools;
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Document, Field};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use itertools::Itertools;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
///
|
||||
/// This `Writer` is only useful for advanced user.
|
||||
/// The normal way to get your multivalued int in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::MultiValues`
|
||||
/// in your schema
|
||||
/// - add your document simply by calling `.add_document(...)`.
|
||||
///
|
||||
/// The `MultiValueIntFastFieldWriter` can be acquired from the
|
||||
/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
|
||||
///
|
||||
/// Once acquired, writing is done by calling calls to
|
||||
/// `.add_document_vals(&[u64])` once per document.
|
||||
///
|
||||
/// The serializer makes it possible to remap all of the values
|
||||
/// that were pushed to the writer using a mapping.
|
||||
/// This makes it possible to push unordered term ids,
|
||||
/// during indexing and remap them to their respective
|
||||
/// term ids when the segment is getting serialized.
|
||||
pub struct MultiValueIntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u64>,
|
||||
@@ -16,7 +39,7 @@ pub struct MultiValueIntFastFieldWriter {
|
||||
|
||||
impl MultiValueIntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field, is_facet: bool) -> Self {
|
||||
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
|
||||
MultiValueIntFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
@@ -25,24 +48,26 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Access the field associated to the `MultiValueIntFastFieldWriter`
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
pub fn next_doc(&mut self) {
|
||||
/// Finalize the current document.
|
||||
pub(crate) fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: UnorderedTermId) {
|
||||
/// Pushes a new value to the current document.
|
||||
pub(crate) fn add_val(&mut self, val: UnorderedTermId) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
/// Shift to the next document and adds
|
||||
/// all of the matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
@@ -52,6 +77,17 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Register all of the values associated to a document.
|
||||
///
|
||||
/// The method returns the `DocId` of the document that was
|
||||
/// just written.
|
||||
pub fn add_document_vals(&mut self, vals: &[UnorderedTermId]) -> DocId {
|
||||
let doc = self.doc_index.len() as DocId;
|
||||
self.next_doc();
|
||||
self.vals.extend_from_slice(vals);
|
||||
doc
|
||||
}
|
||||
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// HashMap makes it possible to remap them before serializing.
|
||||
@@ -66,7 +102,7 @@ impl MultiValueIntFastFieldWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
@@ -90,13 +126,13 @@ impl MultiValueIntFastFieldWriter {
|
||||
1,
|
||||
)?;
|
||||
for val in &self.vals {
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
|
||||
value_serializer.add_val(remapped_val)?;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
use common::BinarySerializable;
|
||||
use super::FastValue;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::CompositeFile;
|
||||
use common::compute_num_bits;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeFile;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use owning_ref::OwningRef;
|
||||
use schema::FAST;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::FAST;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::path::Path;
|
||||
use super::FastValue;
|
||||
use DocId;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
@@ -67,12 +67,20 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// Regardless of the type of `Item`, this method works
|
||||
/// - transmuting the output array
|
||||
/// - extracting the `Item`s as if they were `u64`
|
||||
/// - possibly converting the `u64` value to the right type.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
///
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64`
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use common::BinarySerializable;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::CountingWriter;
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeWrite;
|
||||
use common::CountingWriter;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
@@ -61,6 +61,16 @@ impl FastFieldSerializer {
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> io::Result<FastBytesFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer::open(field_write)
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
@@ -77,11 +87,20 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
@@ -107,3 +126,21 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write + 'a> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
fn open(write: &'a mut W) -> io::Result<FastBytesFieldSerializer<'a, W>> {
|
||||
Ok(FastBytesFieldSerializer { write })
|
||||
}
|
||||
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
use schema::{Cardinality, Document, Field, Schema};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use std::io;
|
||||
use schema::FieldType;
|
||||
use common;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Cardinality, Document, Field, FieldType, Schema};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
@@ -20,6 +21,7 @@ impl FastFieldsWriter {
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
@@ -47,12 +49,17 @@ impl FastFieldsWriter {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
let fast_field_writer = BytesFastFieldWriter::new(field);
|
||||
bytes_value_writers.push(fast_field_writer);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,24 +75,36 @@ impl FastFieldsWriter {
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub(crate) fn get_multivalue_writer(
|
||||
pub fn get_multivalue_writer(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValueIntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
// TODO expose for users
|
||||
self.multi_values_writers
|
||||
.iter_mut()
|
||||
.find(|multivalue_writer| multivalue_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the bytes fast field writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a bytes fastfield in the schema.
|
||||
pub fn get_bytes_writer(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.bytes_value_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.multi_values_writers {
|
||||
field_writer.next_doc();
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.bytes_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
@@ -95,7 +114,7 @@ impl FastFieldsWriter {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
@@ -104,6 +123,9 @@ impl FastFieldsWriter {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
}
|
||||
for field_writer in &self.bytes_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
|
||||
#[inline(always)]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
FIELD_NORMS_TABLE[id as usize]
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
FIELD_NORMS_TABLE
|
||||
@@ -12,45 +10,34 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
.unwrap_or_else(|idx| idx - 1) as u8
|
||||
}
|
||||
|
||||
|
||||
pub const FIELD_NORMS_TABLE: [u32; 256] = [
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
|
||||
56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
|
||||
152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
|
||||
536, 600, 664, 728, 792, 856, 920, 984,
|
||||
1048, 1176, 1304, 1432, 1560, 1688, 1816, 1944,
|
||||
2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240,
|
||||
10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744,
|
||||
32792, 36888, 40984, 45080, 49176, 53272, 57368, 61464,
|
||||
65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168,
|
||||
294936, 327704, 360472, 393240, 426008, 458776,
|
||||
491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528,
|
||||
983064, 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032,
|
||||
1966104, 2097176, 2359320, 2621464, 2883608, 3145752, 3407896, 3670040, 3932184,
|
||||
4194328, 4718616, 5242904, 5767192, 6291480, 6815768, 7340056, 7864344, 8388632, 9437208,
|
||||
10485784, 11534360, 12582936, 13631512, 14680088, 15728664, 16777240, 18874392, 20971544,
|
||||
23068696, 25165848, 27263000, 29360152, 31457304, 33554456, 37748760, 41943064,
|
||||
46137368, 50331672, 54525976, 58720280, 62914584, 67108888, 75497496, 83886104,
|
||||
92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, 167772184,
|
||||
184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, 335544344,
|
||||
369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, 671088664,
|
||||
738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, 1342177304,
|
||||
1476395032, 1610612760, 1744830488, 1879048216, 2013265944
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
|
||||
64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
|
||||
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048,
|
||||
1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176,
|
||||
53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008,
|
||||
458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600,
|
||||
1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464,
|
||||
2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480,
|
||||
6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088,
|
||||
15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304,
|
||||
33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888,
|
||||
75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968,
|
||||
167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912,
|
||||
335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800,
|
||||
671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576,
|
||||
1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
|
||||
];
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm, FIELD_NORMS_TABLE};
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_decode_code() {
|
||||
assert_eq!(fieldnorm_to_id(0), 0);
|
||||
@@ -103,4 +90,4 @@ mod tests {
|
||||
assert_eq!(FIELD_NORMS_TABLE[i], decode_fieldnorm_byte(i as u8));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,13 +17,12 @@
|
||||
//!
|
||||
//! This trick is used by the [BM25 similarity]().
|
||||
mod code;
|
||||
mod reader;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
mod reader;
|
||||
|
||||
pub use self::reader::FieldNormReader;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use super::{id_to_fieldnorm, fieldnorm_to_id};
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
/// The fieldnorm represents the length associated to
|
||||
/// a given Field of a given document.
|
||||
@@ -21,16 +20,13 @@ use DocId;
|
||||
/// precompute computationally expensive functions of the fieldnorm
|
||||
/// in a very short array.
|
||||
pub struct FieldNormReader {
|
||||
data: ReadOnlySource
|
||||
data: ReadOnlySource,
|
||||
}
|
||||
|
||||
impl FieldNormReader {
|
||||
|
||||
/// Opens a field norm reader given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> Self {
|
||||
FieldNormReader {
|
||||
data
|
||||
}
|
||||
FieldNormReader { data }
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm` associated to a doc id.
|
||||
@@ -71,12 +67,13 @@ impl FieldNormReader {
|
||||
#[cfg(test)]
|
||||
impl From<Vec<u32>> for FieldNormReader {
|
||||
fn from(field_norms: Vec<u32>) -> FieldNormReader {
|
||||
let field_norms_id = field_norms.into_iter()
|
||||
let field_norms_id = field_norms
|
||||
.into_iter()
|
||||
.map(FieldNormReader::fieldnorm_to_id)
|
||||
.collect::<Vec<u8>>();
|
||||
let field_norms_data = ReadOnlySource::from(field_norms_id);
|
||||
FieldNormReader {
|
||||
data: field_norms_data
|
||||
data: field_norms_data,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +1,24 @@
|
||||
use directory::WritePtr;
|
||||
use std::io;
|
||||
use common::CompositeWrite;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
|
||||
/// The fieldnorms serializer is in charge of
|
||||
/// the serialization of field norms for all fields.
|
||||
pub struct FieldNormsSerializer {
|
||||
composite_write: CompositeWrite,
|
||||
}
|
||||
|
||||
impl FieldNormsSerializer {
|
||||
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FieldNormsSerializer {
|
||||
composite_write
|
||||
})
|
||||
Ok(FieldNormsSerializer { composite_write })
|
||||
}
|
||||
|
||||
|
||||
/// Serialize the given field
|
||||
pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
|
||||
let write = self.composite_write.for_field(field);
|
||||
write.write_all(fieldnorms_data)?;
|
||||
@@ -28,10 +26,9 @@ impl FieldNormsSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clean up / flush / close
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,30 +1,36 @@
|
||||
use DocId;
|
||||
|
||||
use schema::Field;
|
||||
use super::FieldNormsSerializer;
|
||||
use std::io;
|
||||
use schema::Schema;
|
||||
use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
use schema::Field;
|
||||
use schema::Schema;
|
||||
use std::io;
|
||||
|
||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||
/// of each document for each field with field norms.
|
||||
///
|
||||
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
|
||||
/// byte per document per field.
|
||||
pub struct FieldNormsWriter {
|
||||
fields: Vec<Field>,
|
||||
fieldnorms_buffer: Vec<Vec<u8>>
|
||||
fieldnorms_buffer: Vec<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl FieldNormsWriter {
|
||||
|
||||
pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
|
||||
/// Returns the fields that should have field norms computed
|
||||
/// according to the given schema.
|
||||
pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| {
|
||||
field_entry.is_indexed()
|
||||
})
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field, _)| Field(field as u32))
|
||||
.collect::<Vec<Field>>()
|
||||
}
|
||||
|
||||
/// Initialize with state for tracking the field norm fields
|
||||
/// specified in the schema.
|
||||
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
|
||||
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
|
||||
let max_field = fields
|
||||
@@ -35,26 +41,40 @@ impl FieldNormsWriter {
|
||||
.unwrap_or(0);
|
||||
FieldNormsWriter {
|
||||
fields,
|
||||
fieldnorms_buffer: (0..max_field)
|
||||
.map(|_| Vec::new())
|
||||
.collect::<Vec<_>>()
|
||||
fieldnorms_buffer: (0..max_field).map(|_| Vec::new()).collect::<Vec<_>>(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensure that all documents in 0..max_doc have a byte associated with them
|
||||
/// in each of the fieldnorm vectors.
|
||||
///
|
||||
/// Will extend with 0-bytes for documents that have not been seen.
|
||||
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
|
||||
for &field in self.fields.iter() {
|
||||
self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the fieldnorm byte for the given document for the given field.
|
||||
///
|
||||
/// Will internally convert the u32 `fieldnorm` value to the appropriate byte
|
||||
/// to approximate the field norm in less space.
|
||||
///
|
||||
/// * doc - the document id
|
||||
/// * field - the field being set
|
||||
/// * fieldnorm - the number of terms present in document `doc` in field `field`
|
||||
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
|
||||
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
|
||||
assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
|
||||
assert!(
|
||||
fieldnorm_buffer.len() <= doc as usize,
|
||||
"Cannot register a given fieldnorm twice"
|
||||
);
|
||||
// we fill intermediary `DocId` as having a fieldnorm of 0.
|
||||
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
|
||||
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
|
||||
}
|
||||
|
||||
/// Serialize the seen fieldnorm values to the serializer for all fields.
|
||||
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
|
||||
for &field in self.fields.iter() {
|
||||
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
|
||||
@@ -62,4 +82,4 @@ impl FieldNormsWriter {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::collections::HashSet;
|
||||
use rand::thread_rng;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use schema::*;
|
||||
use Index;
|
||||
use Searcher;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
@@ -13,7 +13,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::operation::DeleteOperation;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
// The DeleteQueue is similar in conceptually to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use Directory;
|
||||
use directory::error::OpenWriteError;
|
||||
use core::LOCKFILE_FILEPATH;
|
||||
use directory::error::OpenWriteError;
|
||||
use Directory;
|
||||
|
||||
/// The directory lock is a mechanism used to
|
||||
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use bit_set::BitSet;
|
||||
use chan;
|
||||
use core::Index;
|
||||
@@ -6,32 +9,28 @@ use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use indexer::stamper::Stamper;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use docset::DocSet;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use fastfield::write_delete_bitset;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use futures::Canceled;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use futures::Future;
|
||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::DirectoryLock;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use docset::DocSet;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::thread::JoinHandle;
|
||||
use indexer::DirectoryLock;
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
@@ -81,10 +80,6 @@ pub struct IndexWriter {
|
||||
committed_opstamp: u64,
|
||||
}
|
||||
|
||||
// IndexWriter cannot be sent to another thread.
|
||||
impl !Send for IndexWriter {}
|
||||
impl !Sync for IndexWriter {}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
@@ -200,7 +195,6 @@ pub fn advance_deletes(
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
{
|
||||
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
|
||||
// We are already up-to-date here.
|
||||
@@ -241,7 +235,6 @@ pub fn advance_deletes(
|
||||
}
|
||||
}
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
@@ -448,10 +441,7 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -647,12 +637,12 @@ impl IndexWriter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use env_logger;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
use schema::{self, Document};
|
||||
use Index;
|
||||
use Term;
|
||||
use error::*;
|
||||
use env_logger;
|
||||
|
||||
#[test]
|
||||
fn test_lockfile_stops_duplicates() {
|
||||
@@ -675,7 +665,7 @@ mod tests {
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
|
||||
@@ -82,7 +82,7 @@ impl MergePolicy for LogMergePolicy {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box self.clone()
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,8 +99,8 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
use core::{SegmentId, SegmentMeta};
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use std::marker;
|
||||
use std::fmt::Debug;
|
||||
use std::marker;
|
||||
|
||||
/// Set of segment suggested for a merge.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -37,7 +37,7 @@ impl MergePolicy for NoMergePolicy {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box NoMergePolicy
|
||||
Box::new(NoMergePolicy)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box MergeWheneverPossible
|
||||
Box::new(MergeWheneverPossible)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,29 +1,29 @@
|
||||
pub mod index_writer;
|
||||
pub mod segment_serializer;
|
||||
pub mod merger;
|
||||
pub mod merge_policy;
|
||||
mod log_merge_policy;
|
||||
mod segment_register;
|
||||
mod segment_writer;
|
||||
mod segment_manager;
|
||||
pub mod delete_queue;
|
||||
pub mod segment_updater;
|
||||
mod directory_lock;
|
||||
mod segment_entry;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod index_writer;
|
||||
mod log_merge_policy;
|
||||
pub mod merge_policy;
|
||||
pub mod merger;
|
||||
pub mod operation;
|
||||
mod stamper;
|
||||
mod prepared_commit;
|
||||
mod segment_entry;
|
||||
mod segment_manager;
|
||||
mod segment_register;
|
||||
pub mod segment_serializer;
|
||||
pub mod segment_updater;
|
||||
mod segment_writer;
|
||||
mod stamper;
|
||||
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use Result;
|
||||
use super::IndexWriter;
|
||||
use Result;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
@@ -13,7 +13,7 @@ impl<'a> PreparedCommit<'a> {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
opstamp
|
||||
opstamp,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentMeta;
|
||||
use bit_set::BitSet;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use std::sync::RwLock;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
|
||||
use core::SegmentId;
|
||||
use indexer::SegmentEntry;
|
||||
use std::path::PathBuf;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::SegmentEntry;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
#[derive(Default)]
|
||||
struct SegmentRegisters {
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use core::SegmentId;
|
||||
use std::collections::HashMap;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
|
||||
/// The segment register keeps track
|
||||
/// of the list of segment, their size as well
|
||||
@@ -113,11 +113,11 @@ impl SegmentRegister {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use indexer::SegmentState;
|
||||
use super::*;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::*;
|
||||
use super::*;
|
||||
use indexer::SegmentState;
|
||||
|
||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||
segment_register
|
||||
|
||||
@@ -3,9 +3,9 @@ use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use store::StoreWriter;
|
||||
use fieldnorm::FieldNormsSerializer;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use store::StoreWriter;
|
||||
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
@@ -47,7 +47,7 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Accessor to the field norm serializer.
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
|
||||
&mut self.fieldnorms_serializer
|
||||
}
|
||||
|
||||
|
||||
@@ -1,40 +1,40 @@
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
use core::Index;
|
||||
use core::IndexMeta;
|
||||
use core::META_FILEPATH;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SerializableSegment;
|
||||
use core::META_FILEPATH;
|
||||
use directory::Directory;
|
||||
use indexer::stamper::Stamper;
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use futures_cpupool::CpuPool;
|
||||
use futures::Future;
|
||||
use futures::Canceled;
|
||||
use futures::oneshot;
|
||||
use directory::FileProtection;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use error::{Error, ErrorKind, Result};
|
||||
use futures::oneshot;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use futures::Future;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use futures_cpupool::CpuPool;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::index_writer::advance_deletes;
|
||||
use indexer::MergeCandidate;
|
||||
use indexer::merger::IndexMerger;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::MergeCandidate;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentSerializer;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use serde_json;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic :
|
||||
@@ -171,7 +171,7 @@ impl SegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index,
|
||||
segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
@@ -283,10 +283,7 @@ impl SegmentUpdater {
|
||||
}).wait()
|
||||
}
|
||||
|
||||
pub fn start_merge(
|
||||
&self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
let segment_updater_clone = self.clone();
|
||||
|
||||
@@ -361,7 +358,9 @@ impl SegmentUpdater {
|
||||
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
|
||||
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
|
||||
for MergeCandidate(segment_metas) in merge_candidates {
|
||||
self.start_merge(&segment_metas);
|
||||
if let Err(e) = self.start_merge(&segment_metas).fuse().poll() {
|
||||
error!("The merge task failed quickly after starting: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -480,9 +479,9 @@ impl SegmentUpdater {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use Index;
|
||||
use schema::*;
|
||||
use indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use schema::*;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_delete_during_merge() {
|
||||
@@ -494,7 +493,7 @@ mod tests {
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.set_merge_policy(box MergeWheneverPossible);
|
||||
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
use Result;
|
||||
use DocId;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use super::operation::AddOperation;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use schema::FieldType;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use datastruct::stacker::Heap;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use super::operation::AddOperation;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use schema::Value;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use schema::Value;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
/// documents.
|
||||
@@ -35,7 +35,6 @@ pub struct SegmentWriter<'a> {
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> SegmentWriter<'a> {
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
@@ -139,8 +138,7 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let mut term = Term::for_field(field); // we set the Term
|
||||
for facet_bytes in facets {
|
||||
let mut unordered_term_id_opt = None;
|
||||
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
|
||||
@@ -179,8 +177,7 @@ impl<'a> SegmentWriter<'a> {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, num_tokens);
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
@@ -204,6 +201,9 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// Do nothing. Bytes only supports fast fields.
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
|
||||
|
||||
@@ -1,15 +1,66 @@
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
// AtomicU64 have not landed in stable.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64 bit platform, and a mutex on other platform.
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicUsize>);
|
||||
#[cfg(target = "x86_64")]
|
||||
mod archicture_impl {
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
|
||||
}
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1, Ordering::SeqCst) as u64
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<Mutex<u64>>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(Mutex::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
let mut guard = self.0.lock().expect("Failed to lock the stamper");
|
||||
let previous_val = *guard;
|
||||
*guard = previous_val + 1;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub use self::archicture_impl::Stamper;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::Stamper;
|
||||
|
||||
#[test]
|
||||
fn test_stamper() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
assert_eq!(stamper.stamp(), 7u64);
|
||||
assert_eq!(stamper.stamp(), 8u64);
|
||||
|
||||
let stamper_clone = stamper.clone();
|
||||
assert_eq!(stamper.stamp(), 9u64);
|
||||
|
||||
assert_eq!(stamper.stamp(), 10u64);
|
||||
assert_eq!(stamper_clone.stamp(), 11u64);
|
||||
}
|
||||
}
|
||||
|
||||
97
src/lib.rs
97
src/lib.rs
@@ -1,14 +1,7 @@
|
||||
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(module_inception))]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(inline_always))]
|
||||
#![feature(box_syntax)]
|
||||
#![feature(optin_builtin_traits)]
|
||||
#![feature(conservative_impl_trait)]
|
||||
#![feature(collections_range)]
|
||||
#![feature(integer_atomics)]
|
||||
#![feature(drain_filter)]
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(test, feature(iterator_step_by))]
|
||||
#![cfg_attr(all(feature = "unstable", test), feature(test))]
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
#![allow(unknown_lints)]
|
||||
#![allow(new_without_default)]
|
||||
@@ -123,36 +116,40 @@ extern crate lazy_static;
|
||||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
|
||||
#[cfg_attr(test, macro_use)]
|
||||
extern crate serde_json;
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
#[macro_use]
|
||||
extern crate error_chain;
|
||||
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
extern crate atomicwrites;
|
||||
extern crate base64;
|
||||
extern crate bit_set;
|
||||
extern crate bitpacking;
|
||||
extern crate byteorder;
|
||||
extern crate chan;
|
||||
extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate snap;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate lz4;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
extern crate rust_stemmers;
|
||||
extern crate serde;
|
||||
extern crate serde_json;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tempdir;
|
||||
#[cfg(test)]
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
extern crate bitpacking;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
@@ -166,7 +163,8 @@ extern crate winapi;
|
||||
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
#[cfg(test)]
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
extern crate test;
|
||||
|
||||
extern crate tinysegmenter;
|
||||
@@ -185,36 +183,36 @@ pub use error::{Error, ErrorKind, ResultExt};
|
||||
/// Tantivy result.
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
mod core;
|
||||
mod compression;
|
||||
mod indexer;
|
||||
mod common;
|
||||
mod compression;
|
||||
mod core;
|
||||
mod indexer;
|
||||
|
||||
mod datastruct;
|
||||
#[allow(unused_doc_comment)]
|
||||
mod error;
|
||||
pub mod tokenizer;
|
||||
mod datastruct;
|
||||
|
||||
pub mod termdict;
|
||||
pub mod store;
|
||||
pub mod query;
|
||||
pub mod directory;
|
||||
pub mod collector;
|
||||
pub mod postings;
|
||||
pub mod schema;
|
||||
pub mod directory;
|
||||
pub mod fastfield;
|
||||
pub mod fieldnorm;
|
||||
pub mod postings;
|
||||
pub mod query;
|
||||
pub mod schema;
|
||||
pub mod store;
|
||||
pub mod termdict;
|
||||
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use directory::Directory;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use indexer::IndexWriter;
|
||||
pub use schema::{Document, Term};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use postings::Postings;
|
||||
pub use core::SegmentComponent;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use directory::Directory;
|
||||
pub use indexer::IndexWriter;
|
||||
pub use postings::Postings;
|
||||
pub use schema::{Document, Term};
|
||||
|
||||
pub use common::{i64_to_u64, u64_to_i64};
|
||||
|
||||
@@ -230,10 +228,10 @@ pub fn version() -> &'static str {
|
||||
|
||||
/// Defines tantivy's merging strategy
|
||||
pub mod merge_policy {
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::NoMergePolicy;
|
||||
pub use indexer::DefaultMergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::NoMergePolicy;
|
||||
}
|
||||
|
||||
/// A `u32` identifying a document within a segment.
|
||||
@@ -282,33 +280,29 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
|
||||
mod tests {
|
||||
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use core::SegmentReader;
|
||||
use query::BooleanQuery;
|
||||
use schema::*;
|
||||
use docset::DocSet;
|
||||
use query::BooleanQuery;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use schema::*;
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use Postings;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
pub fn assert_nearly_equals(expected: f32, val: f32) {
|
||||
assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
|
||||
assert!(
|
||||
nearly_equals(val, expected),
|
||||
"Got {}, expected {}.",
|
||||
val,
|
||||
expected
|
||||
);
|
||||
}
|
||||
|
||||
pub fn nearly_equals(a: f32, b: f32) -> bool {
|
||||
(a - b).abs() < 0.0005 * (a + b).abs()
|
||||
}
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
@@ -318,10 +312,6 @@ mod tests {
|
||||
.collect::<Vec<u32>>()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
@@ -333,7 +323,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
@@ -459,7 +449,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
|
||||
while docset.advance() {
|
||||
if !reader.is_deleted(docset.doc()) {
|
||||
|
||||
@@ -6,20 +6,18 @@ Postings module (also called inverted index)
|
||||
///
|
||||
/// Postings, also called inverted lists, is the key datastructure
|
||||
/// to full-text search.
|
||||
|
||||
mod postings;
|
||||
mod recorder;
|
||||
mod serializer;
|
||||
mod postings_writer;
|
||||
mod term_info;
|
||||
mod recorder;
|
||||
mod segment_postings;
|
||||
mod serializer;
|
||||
mod term_info;
|
||||
|
||||
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
@@ -38,25 +36,22 @@ pub(crate) enum FreqReadingOption {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::Index;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentReader;
|
||||
use datastruct::stacker::Heap;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use query::Scorer;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use std::iter;
|
||||
use DocId;
|
||||
use Score;
|
||||
use query::Intersection;
|
||||
use query::Scorer;
|
||||
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
|
||||
use core::SegmentComponent;
|
||||
use indexer::SegmentWriter;
|
||||
use core::SegmentReader;
|
||||
use core::Index;
|
||||
use schema::IndexRecordOption;
|
||||
use std::iter;
|
||||
use datastruct::stacker::Heap;
|
||||
use schema::Field;
|
||||
use test::{self, Bencher};
|
||||
use indexer::operation::AddOperation;
|
||||
use tests;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use fieldnorm::FieldNormReader;
|
||||
|
||||
#[test]
|
||||
pub fn test_position_write() {
|
||||
@@ -127,7 +122,6 @@ pub mod tests {
|
||||
assert_eq!(&[0, 5], &positions[..]);
|
||||
}
|
||||
{
|
||||
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
@@ -206,13 +200,14 @@ pub mod tests {
|
||||
{
|
||||
let segment_reader = SegmentReader::open(&segment).unwrap();
|
||||
{
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field) ;
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
|
||||
for i in 2..1000 {
|
||||
assert_eq!(
|
||||
fieldnorm_reader.fieldnorm_id(i),
|
||||
FieldNormReader::fieldnorm_to_id(i + 1) );
|
||||
FieldNormReader::fieldnorm_to_id(i + 1)
|
||||
);
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -449,7 +444,7 @@ pub mod tests {
|
||||
// delete everything else
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.delete_term(term_1);
|
||||
index_writer.delete_term(term_1);
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
@@ -479,23 +474,23 @@ pub mod tests {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref TERM_A: Term = {
|
||||
pub static ref TERM_A: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "a")
|
||||
};
|
||||
static ref TERM_B: Term = {
|
||||
pub static ref TERM_B: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "b")
|
||||
};
|
||||
static ref TERM_C: Term = {
|
||||
pub static ref TERM_C: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "c")
|
||||
};
|
||||
static ref TERM_D: Term = {
|
||||
pub static ref TERM_D: Term = {
|
||||
let field = Field(0);
|
||||
Term::from_field_text(field, "d")
|
||||
};
|
||||
static ref INDEX: Index = {
|
||||
pub static ref INDEX: Index = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
@@ -507,7 +502,7 @@ pub mod tests {
|
||||
let posting_list_size = 1_000_000;
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
for _ in 0 .. posting_list_size {
|
||||
for _ in 0..posting_list_size {
|
||||
let mut doc = Document::default();
|
||||
if rng.gen_weighted_bool(15) {
|
||||
doc.add_text(text_field, "a");
|
||||
@@ -530,6 +525,85 @@ pub mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forward alls call but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::tests::*;
|
||||
use docset::SkipResult;
|
||||
use query::Intersection;
|
||||
use schema::IndexRecordOption;
|
||||
use test::{self, Bencher};
|
||||
use tests;
|
||||
use DocSet;
|
||||
|
||||
#[bench]
|
||||
fn bench_segment_postings(b: &mut Bencher) {
|
||||
let searcher = INDEX.searcher();
|
||||
@@ -646,71 +720,4 @@ pub mod tests {
|
||||
s
|
||||
});
|
||||
}
|
||||
|
||||
/// Wraps a given docset, and forward alls call but the
|
||||
/// `.skip_next(...)`. This is useful to test that a specialized
|
||||
/// implementation of `.skip_next(...)` is consistent
|
||||
/// with the default implementation.
|
||||
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
|
||||
|
||||
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
|
||||
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
|
||||
UnoptimizedDocSet(docset)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.0.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.0.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
|
||||
fn score(&mut self) -> Score {
|
||||
self.0.score()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
|
||||
postings_factory: F,
|
||||
targets: Vec<u32>,
|
||||
) {
|
||||
for target in targets {
|
||||
let mut postings_opt = postings_factory();
|
||||
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
|
||||
let skip_result_opt = postings_opt.skip_next(target);
|
||||
let skip_result_unopt = postings_unopt.skip_next(target);
|
||||
assert_eq!(
|
||||
skip_result_unopt, skip_result_opt,
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
match skip_result_opt {
|
||||
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
|
||||
SkipResult::OverStep => assert!(postings_opt.doc() > target),
|
||||
SkipResult::End => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while postings_opt.advance() {
|
||||
assert!(postings_unopt.advance());
|
||||
assert_eq!(
|
||||
postings_opt.doc(),
|
||||
postings_unopt.doc(),
|
||||
"Failed while skipping to {}",
|
||||
target
|
||||
);
|
||||
}
|
||||
assert!(!postings_unopt.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,21 +1,18 @@
|
||||
use DocId;
|
||||
use schema::Term;
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use postings::UnorderedTermId;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use std::io;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, FieldEntry, FieldType, Schema, Term};
|
||||
use std::collections::HashMap;
|
||||
use postings::Recorder;
|
||||
use Result;
|
||||
use schema::{Field, Schema};
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use termdict::TermOrdinal;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use schema::IndexRecordOption;
|
||||
use postings::UnorderedTermId;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
@@ -39,11 +36,17 @@ fn posting_from_field_entry<'a>(
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// FieldType::Bytes cannot actually be indexed.
|
||||
// TODO fix during the indexer refactoring described in #276
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
}
|
||||
@@ -58,8 +61,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
.iter()
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.collect();
|
||||
|
||||
MultiFieldPostingsWriter {
|
||||
schema: schema.clone(),
|
||||
heap,
|
||||
term_index,
|
||||
per_field_postings_writers,
|
||||
@@ -83,7 +86,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _, _)| k);
|
||||
|
||||
@@ -94,8 +97,10 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
|
||||
HashMap::new();
|
||||
let mut unordered_term_mappings: HashMap<
|
||||
Field,
|
||||
HashMap<UnorderedTermId, TermOrdinal>,
|
||||
> = HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
@@ -110,20 +115,30 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
|
||||
// populating the unordered term ord -> ordered term ord mapping
|
||||
// for the field.
|
||||
let mut mapping = HashMap::new();
|
||||
for (term_ord, term_unord_id) in term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket)
|
||||
.enumerate()
|
||||
{
|
||||
mapping.insert(term_unord_id, term_ord);
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Str(_) | &FieldType::HierarchicalFacet => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let mut unordered_term_ids = term_offsets[start..stop]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
|
||||
})
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
&FieldType::U64(_) | &FieldType::I64(_) => {}
|
||||
&FieldType::Bytes => {}
|
||||
}
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
let mut field_serializer = serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
let mut field_serializer =
|
||||
serializer.new_field(field, postings_writer.total_num_tokens())?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
@@ -179,8 +194,7 @@ pub trait PostingsWriter {
|
||||
token_stream: &mut TokenStream,
|
||||
heap: &Heap,
|
||||
) -> u32 {
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let mut term = Term::for_field(field);
|
||||
let num_tokens = {
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use DocId;
|
||||
use std::{self, io};
|
||||
use postings::FieldSerializer;
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
use postings::FieldSerializer;
|
||||
use std::{self, io};
|
||||
use DocId;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = std::u32::MAX;
|
||||
|
||||
@@ -2,15 +2,15 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BL
|
||||
use DocId;
|
||||
|
||||
use common::BitSet;
|
||||
use common::CountingWriter;
|
||||
use common::HasLen;
|
||||
use postings::Postings;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use postings::FreqReadingOption;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use common::CountingWriter;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
@@ -84,9 +84,13 @@ impl SegmentPostings {
|
||||
for &doc in docs {
|
||||
postings_serializer.write_doc(doc, 1u32).unwrap();
|
||||
}
|
||||
postings_serializer.close_term().expect("In memory Serialization should never fail.");
|
||||
postings_serializer
|
||||
.close_term()
|
||||
.expect("In memory Serialization should never fail.");
|
||||
}
|
||||
let (buffer , _) = counting_writer.finish().expect("Serializing in a buffer should never fail.");
|
||||
let (buffer, _) = counting_writer
|
||||
.finish()
|
||||
.expect("Serializing in a buffer should never fail.");
|
||||
let data = ReadOnlySource::from(buffer);
|
||||
let block_segment_postings = BlockSegmentPostings::from_data(
|
||||
docs.len(),
|
||||
@@ -98,7 +102,6 @@ impl SegmentPostings {
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of document in the posting lists.
|
||||
@@ -125,7 +128,7 @@ fn exponential_search(target: u32, mut start: usize, arr: &[u32]) -> (usize, usi
|
||||
loop {
|
||||
let new = start + jump;
|
||||
if new >= end {
|
||||
return (start, end)
|
||||
return (start, end);
|
||||
}
|
||||
if arr[new] > target {
|
||||
return (start, new);
|
||||
@@ -163,7 +166,8 @@ impl DocSet for SegmentPostings {
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
|
||||
let sum_freq: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer.as_mut()
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freq as usize);
|
||||
}
|
||||
@@ -198,7 +202,8 @@ impl DocSet for SegmentPostings {
|
||||
if self.position_computer.is_some() {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
|
||||
let sum_freqs: u32 = freqs_skipped.iter().sum();
|
||||
self.position_computer.as_mut()
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freqs as usize);
|
||||
}
|
||||
@@ -211,7 +216,6 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
@@ -262,7 +266,6 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl HasLen for SegmentPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq()
|
||||
@@ -276,16 +279,11 @@ impl Postings for SegmentPostings {
|
||||
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
if self.position_computer.is_some() {
|
||||
let prev_capacity = output.capacity();
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if term_freq > prev_capacity {
|
||||
let additional_len = term_freq - output.len();
|
||||
output.reserve(additional_len);
|
||||
}
|
||||
unsafe {
|
||||
output.set_len(term_freq);
|
||||
self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
|
||||
}
|
||||
output.resize(self.term_freq() as usize, 0u32);
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.positions_with_offset(offset, &mut output[..])
|
||||
} else {
|
||||
output.clear();
|
||||
}
|
||||
@@ -473,16 +471,16 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use docset::DocSet;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::SegmentPostings;
|
||||
use schema::SchemaBuilder;
|
||||
use common::HasLen;
|
||||
use core::Index;
|
||||
use schema::INT_INDEXED;
|
||||
use schema::Term;
|
||||
use docset::DocSet;
|
||||
use fst::Streamer;
|
||||
use schema::IndexRecordOption;
|
||||
use common::HasLen;
|
||||
use super::BlockSegmentPostings;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -570,4 +568,3 @@ mod tests {
|
||||
assert_eq!(block_segments.docs(), &[1, 3, 5]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,20 +1,16 @@
|
||||
use Result;
|
||||
use termdict::TermDictionaryBuilderImpl;
|
||||
use super::TermInfo;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use directory::WritePtr;
|
||||
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use DocId;
|
||||
use core::Segment;
|
||||
use std::io::{self, Write};
|
||||
use compression::VIntEncoder;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use termdict::TermDictionaryBuilder;
|
||||
use common::{CompositeWrite, CountingWriter};
|
||||
use compression::VIntEncoder;
|
||||
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
|
||||
use core::Segment;
|
||||
use directory::WritePtr;
|
||||
use schema::Schema;
|
||||
use schema::{Field, FieldEntry, FieldType};
|
||||
use std::io::{self, Write};
|
||||
use termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
@@ -85,7 +81,11 @@ impl InvertedIndexSerializer {
|
||||
/// a given field.
|
||||
///
|
||||
/// Loads the indexing options for the given field.
|
||||
pub fn new_field(&mut self, field: Field, total_num_tokens: u64) -> io::Result<FieldSerializer> {
|
||||
pub fn new_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -111,11 +111,12 @@ impl InvertedIndexSerializer {
|
||||
/// The field serializer is in charge of
|
||||
/// the serialization of a specific field.
|
||||
pub struct FieldSerializer<'a> {
|
||||
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
|
||||
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
|
||||
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
|
||||
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
||||
current_term_info: TermInfo,
|
||||
term_open: bool,
|
||||
num_terms: TermOrdinal,
|
||||
}
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
@@ -125,7 +126,6 @@ impl<'a> FieldSerializer<'a> {
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
if let Some(text_indexing_options) = text_options.get_indexing_options() {
|
||||
@@ -141,7 +141,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
_ => (false, false),
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
|
||||
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
@@ -155,6 +155,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
positions_serializer_opt,
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
num_terms: TermOrdinal::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -175,7 +176,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
|
||||
assert!(
|
||||
!self.term_open,
|
||||
"Called new_term, while the previous term was not closed."
|
||||
@@ -183,7 +184,10 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.term_open = true;
|
||||
self.postings_serializer.clear();
|
||||
self.current_term_info = self.current_term_info();
|
||||
self.term_dictionary_builder.insert_key(term)
|
||||
self.term_dictionary_builder.insert_key(term)?;
|
||||
let term_ordinal = self.num_terms;
|
||||
self.num_terms += 1;
|
||||
Ok(term_ordinal)
|
||||
}
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
|
||||
@@ -1,34 +1,25 @@
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use std::io;
|
||||
|
||||
/// `TermInfo` contains all of the information
|
||||
/// associated to terms in the `.term` file.
|
||||
///
|
||||
/// It consists of
|
||||
/// * `doc_freq` : the number of document in the segment
|
||||
/// containing this term. It is also the length of the
|
||||
/// posting list associated to this term
|
||||
/// * `postings_offset` : an offset in the `.idx` file
|
||||
/// addressing the start of the posting list associated
|
||||
/// to this term.
|
||||
/// `TermInfo` wraps the metadata associated to a Term.
|
||||
/// It is segment-local.
|
||||
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
|
||||
pub struct TermInfo {
|
||||
/// Number of documents in the segment containing the term
|
||||
pub doc_freq: u32,
|
||||
/// Offset within the postings (`.idx`) file.
|
||||
/// Start offset within the postings (`.idx`) file.
|
||||
pub postings_offset: u64,
|
||||
/// Offset within the position (`.pos`) file.
|
||||
/// Start offset of the first block within the position (`.pos`) file.
|
||||
pub positions_offset: u64,
|
||||
/// Offset within the position block.
|
||||
/// Start offset within this position block.
|
||||
pub positions_inner_offset: u8,
|
||||
}
|
||||
|
||||
impl FixedSize for TermInfo {
|
||||
/// Size required for the binary serialization of `TermInfo`.
|
||||
/// This is large, but in practise, all `TermInfo` but the first one
|
||||
/// of the block are bitpacked.
|
||||
///
|
||||
/// See `TermInfoStore`.
|
||||
/// Size required for the binary serialization of a `TermInfo` object.
|
||||
/// This is large, but in practise, `TermInfo` are encoded in blocks and
|
||||
/// only the first `TermInfo` of a block is serialized uncompressed.
|
||||
/// The subsequent `TermInfo` are delta encoded and bitpacked.
|
||||
const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,22 +1,20 @@
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use query::Scorer;
|
||||
use core::Searcher;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use query::{Query, Scorer, Weight};
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use DocId;
|
||||
use core::Searcher;
|
||||
|
||||
/// Query that matches all of the documents.
|
||||
///
|
||||
/// All of the document get the score 1f32.
|
||||
#[derive(Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AllQuery;
|
||||
|
||||
impl Query for AllQuery {
|
||||
fn weight(&self, _: &Searcher, _: bool) -> Result<Box<Weight>> {
|
||||
Ok(box AllWeight)
|
||||
Ok(Box::new(AllWeight))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,29 +23,47 @@ pub struct AllWeight;
|
||||
|
||||
impl Weight for AllWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
Ok(box AllScorer {
|
||||
started: false,
|
||||
Ok(Box::new(AllScorer {
|
||||
state: State::NotStarted,
|
||||
doc: 0u32,
|
||||
max_doc: reader.max_doc(),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
enum State {
|
||||
NotStarted,
|
||||
Started,
|
||||
Finished,
|
||||
}
|
||||
|
||||
/// Scorer associated to the `AllQuery` query.
|
||||
pub struct AllScorer {
|
||||
started: bool,
|
||||
state: State,
|
||||
doc: DocId,
|
||||
max_doc: DocId,
|
||||
}
|
||||
|
||||
impl DocSet for AllScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.started {
|
||||
self.doc += 1u32;
|
||||
} else {
|
||||
self.started = true;
|
||||
match self.state {
|
||||
State::NotStarted => {
|
||||
self.state = State::Started;
|
||||
self.doc = 0;
|
||||
}
|
||||
State::Started => {
|
||||
self.doc += 1u32;
|
||||
}
|
||||
State::Finished => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if self.doc < self.max_doc {
|
||||
return true;
|
||||
} else {
|
||||
self.state = State::Finished;
|
||||
return false;
|
||||
}
|
||||
self.doc < self.max_doc
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
@@ -64,3 +80,46 @@ impl Scorer for AllScorer {
|
||||
1f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::AllQuery;
|
||||
use query::Query;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_all_query() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>"aaa"));
|
||||
index_writer.add_document(doc!(field=>"bbb"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(field=>"ccc"));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let weight = AllQuery.weight(&searcher, false).unwrap();
|
||||
{
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut scorer = weight.scorer(reader).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 1u32);
|
||||
assert!(!scorer.advance());
|
||||
}
|
||||
{
|
||||
let reader = searcher.segment_reader(1);
|
||||
let mut scorer = weight.scorer(reader).unwrap();
|
||||
assert!(scorer.advance());
|
||||
assert_eq!(scorer.doc(), 0u32);
|
||||
assert!(!scorer.advance());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use common::{BitSet, TinySet};
|
||||
use DocId;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
|
||||
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it was a `DocSet`.
|
||||
///
|
||||
@@ -120,12 +120,10 @@ impl DocSet for BitSetDocSet {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use DocId;
|
||||
use super::BitSetDocSet;
|
||||
use common::BitSet;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use super::BitSetDocSet;
|
||||
extern crate test;
|
||||
use tests;
|
||||
use DocId;
|
||||
|
||||
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
|
||||
let mut docset = BitSet::with_max_value(max_doc);
|
||||
@@ -219,6 +217,17 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::BitSetDocSet;
|
||||
use test;
|
||||
use tests;
|
||||
use DocSet;
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
|
||||
use tests;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use fieldnorm::FieldNormReader;
|
||||
use Term;
|
||||
use Searcher;
|
||||
use Score;
|
||||
use Searcher;
|
||||
use Term;
|
||||
|
||||
const K1: f32 = 1.2;
|
||||
const B: f32 = 0.75;
|
||||
@@ -11,7 +11,6 @@ fn idf(doc_freq: u64, doc_count: u64) -> f32 {
|
||||
(1f32 + x).ln()
|
||||
}
|
||||
|
||||
|
||||
fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 {
|
||||
K1 * (1f32 - B + B * fieldnorm as f32 / average_fieldnorm)
|
||||
}
|
||||
@@ -32,11 +31,10 @@ pub struct BM25Weight {
|
||||
}
|
||||
|
||||
impl BM25Weight {
|
||||
|
||||
pub fn null() -> BM25Weight {
|
||||
BM25Weight {
|
||||
weight: 0f32,
|
||||
cache: [1f32; 256]
|
||||
cache: [1f32; 256],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +42,11 @@ impl BM25Weight {
|
||||
assert!(!terms.is_empty(), "BM25 requires at least one term");
|
||||
let field = terms[0].field();
|
||||
for term in &terms[1..] {
|
||||
assert_eq!(term.field(), field, "All terms must belong to the same field.");
|
||||
assert_eq!(
|
||||
term.field(),
|
||||
field,
|
||||
"All terms must belong to the same field."
|
||||
);
|
||||
}
|
||||
|
||||
let mut total_num_tokens = 0u64;
|
||||
@@ -56,7 +58,8 @@ impl BM25Weight {
|
||||
}
|
||||
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
|
||||
|
||||
let idf = terms.iter()
|
||||
let idf = terms
|
||||
.iter()
|
||||
.map(|term| {
|
||||
let term_doc_freq = searcher.doc_freq(term);
|
||||
idf(term_doc_freq, total_num_docs)
|
||||
@@ -83,12 +86,12 @@ impl BM25Weight {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use tests::assert_nearly_equals;
|
||||
use super::idf;
|
||||
use tests::assert_nearly_equals;
|
||||
|
||||
#[test]
|
||||
fn test_idf() {
|
||||
assert_nearly_equals(idf(1, 2), 0.6931472);
|
||||
assert_nearly_equals(idf(1, 2), 0.6931472);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use Result;
|
||||
use super::boolean_weight::BooleanWeight;
|
||||
use query::Weight;
|
||||
use Searcher;
|
||||
use query::Query;
|
||||
use schema::Term;
|
||||
use query::TermQuery;
|
||||
use schema::IndexRecordOption;
|
||||
use query::Occur;
|
||||
use query::Query;
|
||||
use query::TermQuery;
|
||||
use query::Weight;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use Result;
|
||||
use Searcher;
|
||||
|
||||
/// The boolean query combines a set of queries
|
||||
///
|
||||
@@ -23,6 +23,16 @@ pub struct BooleanQuery {
|
||||
subqueries: Vec<(Occur, Box<Query>)>,
|
||||
}
|
||||
|
||||
impl Clone for BooleanQuery {
|
||||
fn clone(&self) -> Self {
|
||||
self.subqueries
|
||||
.iter()
|
||||
.map(|(x, y)| (x.clone(), y.box_clone()))
|
||||
.collect::<Vec<_>>()
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
|
||||
fn from(subqueries: Vec<(Occur, Box<Query>)>) -> BooleanQuery {
|
||||
BooleanQuery { subqueries }
|
||||
@@ -37,7 +47,7 @@ impl Query for BooleanQuery {
|
||||
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
Ok(box BooleanWeight::new(sub_weights, scoring_enabled))
|
||||
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,10 +58,16 @@ impl BooleanQuery {
|
||||
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
|
||||
.into_iter()
|
||||
.map(|term| {
|
||||
let term_query: Box<Query> = box TermQuery::new(term, IndexRecordOption::WithFreqs);
|
||||
let term_query: Box<Query> =
|
||||
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
.collect();
|
||||
BooleanQuery::from(occur_term_queries)
|
||||
}
|
||||
|
||||
/// Deconstructed view of the clauses making up this query.
|
||||
pub fn clauses(&self) -> &[(Occur, Box<Query>)] {
|
||||
&self.subqueries[..]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,18 @@
|
||||
use query::Weight;
|
||||
use core::SegmentReader;
|
||||
use query::Union;
|
||||
use std::collections::HashMap;
|
||||
use query::EmptyScorer;
|
||||
use query::Scorer;
|
||||
use downcast::Downcast;
|
||||
use std::borrow::Borrow;
|
||||
use query::intersect_scorers;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use query::term_query::TermScorer;
|
||||
use query::EmptyScorer;
|
||||
use query::Exclude;
|
||||
use query::Occur;
|
||||
use query::RequiredOptionalScorer;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use query::Scorer;
|
||||
use query::Union;
|
||||
use query::Weight;
|
||||
use std::borrow::Borrow;
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
use query::intersect_scorers;
|
||||
use query::term_query::TermScorer;
|
||||
|
||||
|
||||
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
|
||||
where
|
||||
@@ -34,14 +33,13 @@ where
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = box Union::<TermScorer, TScoreCombiner>::from(scorers);
|
||||
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
|
||||
return scorer;
|
||||
}
|
||||
}
|
||||
|
||||
let scorer: Box<Scorer> = box Union::<_, TScoreCombiner>::from(scorers);
|
||||
let scorer: Box<Scorer> = Box::new(Union::<_, TScoreCombiner>::from(scorers));
|
||||
return scorer;
|
||||
|
||||
}
|
||||
|
||||
pub struct BooleanWeight {
|
||||
@@ -78,17 +76,17 @@ impl BooleanWeight {
|
||||
.remove(&Occur::MustNot)
|
||||
.map(scorer_union::<TScoreCombiner>);
|
||||
|
||||
let must_scorer_opt: Option<Box<Scorer>> =
|
||||
per_occur_scorers.remove(&Occur::Must)
|
||||
.map(intersect_scorers);
|
||||
let must_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Must)
|
||||
.map(intersect_scorers);
|
||||
|
||||
let positive_scorer: Box<Scorer> = match (should_scorer_opt, must_scorer_opt) {
|
||||
(Some(should_scorer), Some(must_scorer)) => {
|
||||
if self.scoring_enabled {
|
||||
box RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||
Box::new(RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
|
||||
must_scorer,
|
||||
should_scorer,
|
||||
)
|
||||
))
|
||||
} else {
|
||||
must_scorer
|
||||
}
|
||||
@@ -96,12 +94,12 @@ impl BooleanWeight {
|
||||
(None, Some(must_scorer)) => must_scorer,
|
||||
(Some(should_scorer), None) => should_scorer,
|
||||
(None, None) => {
|
||||
return Ok(box EmptyScorer);
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(exclude_scorer) = exclude_scorer_opt {
|
||||
Ok(box Exclude::new(positive_scorer, exclude_scorer))
|
||||
Ok(Box::new(Exclude::new(positive_scorer, exclude_scorer)))
|
||||
} else {
|
||||
Ok(positive_scorer)
|
||||
}
|
||||
@@ -111,11 +109,11 @@ impl BooleanWeight {
|
||||
impl Weight for BooleanWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
if self.weights.is_empty() {
|
||||
Ok(box EmptyScorer)
|
||||
Ok(Box::new(EmptyScorer))
|
||||
} else if self.weights.len() == 1 {
|
||||
let &(occur, ref weight) = &self.weights[0];
|
||||
if occur == Occur::MustNot {
|
||||
Ok(box EmptyScorer)
|
||||
Ok(Box::new(EmptyScorer))
|
||||
} else {
|
||||
weight.scorer(reader)
|
||||
}
|
||||
|
||||
@@ -7,19 +7,19 @@ pub use self::boolean_query::BooleanQuery;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use query::Occur;
|
||||
use query::Query;
|
||||
use query::TermQuery;
|
||||
use query::Intersection;
|
||||
use query::Scorer;
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use downcast::Downcast;
|
||||
use schema::*;
|
||||
use query::QueryParser;
|
||||
use query::RequiredOptionalScorer;
|
||||
use query::score_combiner::SumWithCoordsCombiner;
|
||||
use query::term_query::TermScorer;
|
||||
use query::Intersection;
|
||||
use query::Occur;
|
||||
use query::Query;
|
||||
use query::QueryParser;
|
||||
use query::RequiredOptionalScorer;
|
||||
use query::Scorer;
|
||||
use query::TermQuery;
|
||||
use schema::*;
|
||||
use Index;
|
||||
|
||||
fn aux_test_helper() -> (Index, Field) {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -110,6 +110,7 @@ mod tests {
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&*searcher, false).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
println!("{:?}", scorer.type_name());
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
}
|
||||
}
|
||||
@@ -123,7 +124,7 @@ mod tests {
|
||||
Term::from_field_text(text_field, text),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let query: Box<Query> = box term_query;
|
||||
let query: Box<Query> = Box::new(term_query);
|
||||
query
|
||||
};
|
||||
|
||||
@@ -170,7 +171,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_intersection_score() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
@@ -180,7 +180,7 @@ mod tests {
|
||||
Term::from_field_text(text_field, text),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
let query: Box<Query> = box term_query;
|
||||
let query: Box<Query> = Box::new(term_query);
|
||||
query
|
||||
};
|
||||
|
||||
@@ -192,7 +192,10 @@ mod tests {
|
||||
};
|
||||
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Must, make_term_query("b"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Must, make_term_query("a")),
|
||||
(Occur::Must, make_term_query("b")),
|
||||
]);
|
||||
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use query::Scorer;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use Score;
|
||||
use query::Scorer;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
enum State {
|
||||
@@ -129,10 +129,10 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use tests::sample_with_seed;
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
use super::*;
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
use query::VecDocSet;
|
||||
use tests::sample_with_seed;
|
||||
|
||||
#[test]
|
||||
fn test_exclude() {
|
||||
@@ -151,10 +151,10 @@ mod tests {
|
||||
fn test_exclude_skip() {
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box Exclude::new(
|
||||
Box::new(Exclude::new(
|
||||
VecDocSet::from(vec![1, 2, 5, 8, 10, 15, 24]),
|
||||
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
|
||||
)
|
||||
))
|
||||
},
|
||||
vec![1, 2, 5, 8, 10, 15, 24],
|
||||
);
|
||||
@@ -167,10 +167,10 @@ mod tests {
|
||||
let sample_skip = sample_with_seed(10_000, 0.005, 3);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box Exclude::new(
|
||||
Box::new(Exclude::new(
|
||||
VecDocSet::from(sample_include.clone()),
|
||||
VecDocSet::from(sample_exclude.clone()),
|
||||
)
|
||||
))
|
||||
},
|
||||
sample_skip,
|
||||
);
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use docset::{DocSet, SkipResult};
|
||||
use query::Scorer;
|
||||
use query::EmptyScorer;
|
||||
use DocId;
|
||||
use downcast::Downcast;
|
||||
use std::borrow::Borrow;
|
||||
use Score;
|
||||
use query::term_query::TermScorer;
|
||||
use query::EmptyScorer;
|
||||
use query::Scorer;
|
||||
use std::borrow::Borrow;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
/// Returns the intersection scorer.
|
||||
///
|
||||
@@ -22,7 +22,7 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
|
||||
let second_rarest_opt = scorers.pop();
|
||||
scorers.reverse();
|
||||
match (rarest_opt, second_rarest_opt) {
|
||||
(None, None) => box EmptyScorer,
|
||||
(None, None) => Box::new(EmptyScorer),
|
||||
(Some(single_docset), None) => single_docset,
|
||||
(Some(left), Some(right)) => {
|
||||
{
|
||||
@@ -32,31 +32,33 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
|
||||
}) {
|
||||
let left = *Downcast::<TermScorer>::downcast(left).unwrap();
|
||||
let right = *Downcast::<TermScorer>::downcast(right).unwrap();
|
||||
return box Intersection {
|
||||
return Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets
|
||||
}
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
}
|
||||
return box Intersection {
|
||||
return Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets
|
||||
}
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
_ => {
|
||||
unreachable!();
|
||||
}
|
||||
_ => { unreachable!(); }
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
|
||||
pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet=Box<Scorer>> {
|
||||
pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<Scorer>> {
|
||||
left: TDocSet,
|
||||
right: TDocSet,
|
||||
others: Vec<TOtherDocSet>,
|
||||
num_docsets: usize
|
||||
num_docsets: usize,
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
@@ -71,18 +73,17 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
left,
|
||||
right,
|
||||
others: docsets,
|
||||
num_docsets
|
||||
num_docsets,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
pub(crate) fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
|
||||
match ord {
|
||||
0 => &mut self.left,
|
||||
1 => &mut self.right,
|
||||
n => &mut self.others[n - 2]
|
||||
n => &mut self.others[n - 2],
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -92,7 +93,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet>
|
||||
match ord {
|
||||
0 => &mut self.left,
|
||||
1 => &mut self.right,
|
||||
n => &mut self.others[n - 2]
|
||||
n => &mut self.others[n - 2],
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -114,23 +115,30 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
// of the two rarest `DocSet` in the intersection.
|
||||
loop {
|
||||
match right.skip_next(candidate) {
|
||||
SkipResult::Reached => { break; }
|
||||
SkipResult::Reached => {
|
||||
break;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = right.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => { return false; }
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => { break; }
|
||||
SkipResult::Reached => {
|
||||
break;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => { return false; }
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// test the remaining scorers;
|
||||
for (ord, docset) in self.others.iter_mut().enumerate() {
|
||||
@@ -147,16 +155,22 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
// let's update our candidate.
|
||||
candidate = docset.doc();
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => { other_candidate_ord = ord; }
|
||||
SkipResult::Reached => {
|
||||
other_candidate_ord = ord;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => { return false; }
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => { return false; }
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -164,9 +178,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
|
||||
// We optimize skipping by skipping every single member
|
||||
// of the intersection to target.
|
||||
let mut current_target: DocId = target;
|
||||
@@ -211,18 +223,22 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
}
|
||||
|
||||
impl<TScorer, TOtherScorer> Scorer for Intersection<TScorer, TOtherScorer>
|
||||
where TScorer: Scorer, TOtherScorer: Scorer {
|
||||
where
|
||||
TScorer: Scorer,
|
||||
TOtherScorer: Scorer,
|
||||
{
|
||||
fn score(&mut self) -> Score {
|
||||
self.left.score() + self.right.score() + self.others.iter_mut().map(Scorer::score).sum::<Score>()
|
||||
self.left.score() + self.right.score()
|
||||
+ self.others.iter_mut().map(Scorer::score).sum::<Score>()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use docset::{DocSet, SkipResult};
|
||||
use super::Intersection;
|
||||
use query::VecDocSet;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use postings::tests::test_skip_against_unoptimized;
|
||||
use query::VecDocSet;
|
||||
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
@@ -271,7 +287,7 @@ mod tests {
|
||||
|| {
|
||||
let left = VecDocSet::from(vec![4]);
|
||||
let right = VecDocSet::from(vec![2, 5]);
|
||||
box Intersection::new(vec![left, right])
|
||||
Box::new(Intersection::new(vec![left, right]))
|
||||
},
|
||||
vec![0, 2, 4, 5, 6],
|
||||
);
|
||||
@@ -281,19 +297,19 @@ mod tests {
|
||||
let mut right = VecDocSet::from(vec![2, 5, 10]);
|
||||
left.advance();
|
||||
right.advance();
|
||||
box Intersection::new(vec![left, right])
|
||||
Box::new(Intersection::new(vec![left, right]))
|
||||
},
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
|
||||
);
|
||||
test_skip_against_unoptimized(
|
||||
|| {
|
||||
box Intersection::new(vec![
|
||||
Box::new(Intersection::new(vec![
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 2, 5, 6]),
|
||||
VecDocSet::from(vec![1, 4, 5, 6]),
|
||||
VecDocSet::from(vec![1, 5, 6]),
|
||||
VecDocSet::from(vec![2, 4, 5, 7, 8]),
|
||||
])
|
||||
]))
|
||||
},
|
||||
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
|
||||
);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user