Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 01:02:55 +00:00)

Compare commits: issue/weba ... 0.8.2 (33 commits)
Commits in this comparison (SHA1):

5f07dc35d8
176f67a266
19babff849
bf2576adf9
b8241c5603
a4745151c0
e2ce326a8c
bb21d12a70
4565aba62a
545a7ec8dd
e68775d71c
dcc92d287e
b48f81c051
a3042e956b
1fa10f0a0b
279a9eb5e3
21a24672d8
a3f1fbaae6
a6e767c877
6af0488dbe
07d87e154b
8b0b0133dd
7b9752f897
c92f41aea8
dea16f1d9d
236cfbec08
edcafb69bb
14908479d5
ab4593eeb7
e75bb1d6a1
63b9d62237
0098e3d428
69d5e4b9b1
CHANGELOG.md (12 changed lines)

@@ -1,3 +1,15 @@
+Tantivy 0.8.0
+=====================
+*No change in the index format*
+- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
+- Multithreaded search (@jwolfe, @fulmicoton)
+
+Tantivy 0.7.1
+=====================
+*No change in the index format*
+- Bugfix: NGramTokenizer panics on non ascii chars
+- Added a space usage API
+
 Tantivy 0.7
 =====================
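The 0.8.0 entry above ("API Breaking change in the collector API", "Multithreaded search") is what drives most of the example and collector changes in this comparison. As a rough before/after sketch of the break on the caller side, assembled from the example diffs below rather than from the tantivy documentation:

    // tantivy 0.7: the collector is mutated in place, then queried afterwards.
    let mut top_collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut top_collector)?;
    let doc_addresses = top_collector.docs();

    // tantivy 0.8: the collector is passed by shared reference and
    // `search` returns the collector's result ("fruit") directly.
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }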
Cargo.toml (14 changed lines)

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.7.0"
+version = "0.8.2"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -12,7 +12,7 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]

 [dependencies]
-base64 = "0.9.1"
+base64 = "0.10.0"
 byteorder = "1.0"
 lazy_static = "1"
 regex = "1.0"
@@ -29,12 +29,11 @@ serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
 num_cpus = "1.2"
-itertools = "0.7"
+itertools = "0.8"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 bit-set = "0.5"
 uuid = { version = "0.7", features = ["v4", "serde"] }
-crossbeam = "0.4"
-crossbeam-channel = "0.2"
+crossbeam = "0.5"
 futures = "0.1"
 futures-cpupool = "0.1"
 owning_ref = "0.4"
@@ -49,12 +48,14 @@ owned-read = "0.4"
 failure = "0.1"
 htmlescape = "0.3.1"
 fail = "0.2"
+scoped-pool = "1.0"
+murmurhash32 = "0.2"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"

 [dev-dependencies]
-rand = "0.5"
+rand = "0.6"
 maplit = "1"

 [profile.release]
@@ -72,6 +73,7 @@ default = ["mmap", "no_fail"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
 no_fail = ["fail/no_fail"]
+unstable = [] # useful for benches.

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
README (project description)

@@ -21,7 +21,7 @@

 **Tantivy** is a **full text search engine library** written in rust.

-It is closer to Lucene than to Elastic Search and Solr in the sense it is not
+It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.
Basic search example

@@ -16,10 +16,11 @@ extern crate tempdir;
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopCollector;
+use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
+use tempdir::TempDir;

 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
@@ -34,7 +35,7 @@ fn main() -> tantivy::Result<()> {
     // be indexed".

     // first we need to define a schema ...
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();

     // Our first field is title.
     // We want full-text search for it, and we also want
@@ -105,37 +106,37 @@ fn main() -> tantivy::Result<()> {
 (no textual change visible in this hunk; likely indentation only)
     // For convenience, tantivy also comes with a macro to
     // reduce the boilerplate above.
     index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
     ));

     index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
     ));

     // Multivalued field just need to be repeated.
     index_writer.add_document(doc!(
        title => "Frankenstein",
        title => "The Modern Prometheus",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
     ));

     // This is an example, so we will only index 3 documents
@@ -212,15 +213,10 @@ fn main() -> tantivy::Result<()> {
     //
     // We are not interested in all of the documents but
     // only in the top 10. Keeping track of our top 10 best documents
-    // is the role of the TopCollector.
-    let mut top_collector = TopCollector::with_limit(10);
+    // is the role of the TopDocs.

     // We can now perform our query.
-    searcher.search(&*query, &mut top_collector)?;
-
-    // Our top collector now contains the 10
-    // most relevant doc ids...
-    let doc_addresses = top_collector.docs();
+    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

     // The actual documents still need to be
     // retrieved from Tantivy's store.
@@ -229,12 +225,10 @@ fn main() -> tantivy::Result<()> {
     // the document returned will only contain
     // a title.

-    for doc_address in doc_addresses {
+    for (_score, doc_address) in top_docs {
         let retrieved_doc = searcher.doc(doc_address)?;
         println!("{}", schema.to_json(&retrieved_doc));
     }

     Ok(())
 }

-use tempdir::TempDir;
examples/custom_collector.rs (new file, 187 lines)

// # Custom collector example
//
// This example shows how you can implement your own
// collector. As an example, we will compute a collector
// that computes the standard deviation of a given fast field.
//
// Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples.

extern crate tempdir;

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INT_INDEXED, TEXT};
use tantivy::Index;
use tantivy::SegmentReader;

#[derive(Default)]
struct Stats {
    count: usize,
    sum: f64,
    squared_sum: f64,
}

impl Stats {
    pub fn count(&self) -> usize {
        self.count
    }

    pub fn mean(&self) -> f64 {
        self.sum / (self.count as f64)
    }

    fn square_mean(&self) -> f64 {
        self.squared_sum / (self.count as f64)
    }

    pub fn standard_deviation(&self) -> f64 {
        let mean = self.mean();
        (self.square_mean() - mean * mean).sqrt()
    }

    fn non_zero_count(self) -> Option<Stats> {
        if self.count == 0 {
            None
        } else {
            Some(self)
        }
    }
}

struct StatsCollector {
    field: Field,
}

impl StatsCollector {
    fn with_field(field: Field) -> StatsCollector {
        StatsCollector { field }
    }
}

impl Collector for StatsCollector {
    // That's the type of our result.
    // Our standard deviation will be a float.
    type Fruit = Option<Stats>;

    type Child = StatsSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment: &SegmentReader,
    ) -> tantivy::Result<StatsSegmentCollector> {
        let fast_field_reader = segment.fast_field_reader(self.field)?;
        Ok(StatsSegmentCollector {
            fast_field_reader,
            stats: Stats::default(),
        })
    }

    fn requires_scoring(&self) -> bool {
        // this collector does not care about score.
        false
    }

    fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
        let mut stats = Stats::default();
        for segment_stats_opt in segment_stats {
            if let Some(segment_stats) = segment_stats_opt {
                stats.count += segment_stats.count;
                stats.sum += segment_stats.sum;
                stats.squared_sum += segment_stats.squared_sum;
            }
        }
        Ok(stats.non_zero_count())
    }
}

struct StatsSegmentCollector {
    fast_field_reader: FastFieldReader<u64>,
    stats: Stats,
}

impl SegmentCollector for StatsSegmentCollector {
    type Fruit = Option<Stats>;

    fn collect(&mut self, doc: u32, _score: f32) {
        let value = self.fast_field_reader.get(doc) as f64;
        self.stats.count += 1;
        self.stats.sum += value;
        self.stats.squared_sum += value * value;
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.stats.non_zero_count()
    }
}

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // first we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // We'll assume a fictional index containing
    // products, and with a name, a description, and a price.
    let product_name = schema_builder.add_text_field("name", TEXT);
    let product_description = schema_builder.add_text_field("description", TEXT);
    let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Lets index a bunch of fake documents for the sake of
    // this example.
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        product_name => "Super Broom 2000",
        product_description => "While it is ok for short distance travel, this broom \
        was designed quiditch. It will up your game.",
        price => 30_200u64
    ));
    index_writer.add_document(doc!(
        product_name => "Turbulobroom",
        product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
        You'll enjoy its sharp turns, and rapid acceleration",
        price => 29_240u64
    ));
    index_writer.add_document(doc!(
        product_name => "Broomio",
        product_description => "Great value for the price. This broom is a market favorite",
        price => 21_240u64
    ));
    index_writer.add_document(doc!(
        product_name => "Whack a Mole",
        product_description => "Prime quality bat.",
        price => 5_200u64
    ));
    index_writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);

    // here we want to get a hit on the 'ken' in Frankenstein
    let query = query_parser.parse_query("broom")?;
    if let Some(stats) = searcher.search(&query, &StatsCollector::with_field(price))? {
        println!("count: {}", stats.count());
        println!("mean: {}", stats.mean());
        println!("standard deviation: {}", stats.standard_deviation());
    }

    Ok(())
}
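The new example above is the clearest picture of the 0.8 collector design: the `Collector` describes the aggregation and merges per-segment results (`for_segment`, `merge_fruits`), while the `SegmentCollector` does the per-segment accumulation (`collect`, `harvest`), which is what lets a search fan out over segments in parallel. As a cross-check of those trait shapes, here is a stripped-down variant that only tracks the maximum value of a u64 fast field. It is a sketch written against the signatures exactly as they appear in the example above; `MaxValue` and `SegmentMaxValue` are made-up names for illustration, not types from the repository.

    use tantivy::collector::{Collector, SegmentCollector};
    use tantivy::fastfield::FastFieldReader;
    use tantivy::schema::Field;
    use tantivy::SegmentReader;

    /// Keeps only the maximum value of a `u64` fast field over the matching documents.
    struct MaxValue {
        field: Field,
    }

    impl Collector for MaxValue {
        // `None` when no document matched.
        type Fruit = Option<u64>;
        type Child = SegmentMaxValue;

        fn for_segment(
            &self,
            _segment_local_id: u32,
            segment: &SegmentReader,
        ) -> tantivy::Result<SegmentMaxValue> {
            // Same fast field access as in the stats example above.
            let reader = segment.fast_field_reader(self.field)?;
            Ok(SegmentMaxValue { reader, max: None })
        }

        fn requires_scoring(&self) -> bool {
            false
        }

        fn merge_fruits(&self, segment_maxes: Vec<Option<u64>>) -> tantivy::Result<Option<u64>> {
            // The global maximum is the maximum of the per-segment maxima.
            Ok(segment_maxes.into_iter().filter_map(|max| max).max())
        }
    }

    struct SegmentMaxValue {
        reader: FastFieldReader<u64>,
        max: Option<u64>,
    }

    impl SegmentCollector for SegmentMaxValue {
        type Fruit = Option<u64>;

        fn collect(&mut self, doc: u32, _score: f32) {
            let value = self.reader.get(doc);
            self.max = Some(self.max.map_or(value, |current| current.max(value)));
        }

        fn harvest(self) -> Option<u64> {
            self.max
        }
    }

It would be invoked like the stats collector in the example: `let max_price = searcher.search(&query, &MaxValue { field: price })?;`.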
NGram search example

@@ -5,7 +5,7 @@

 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopCollector;
+use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::NgramTokenizer;
@@ -20,7 +20,7 @@ fn main() -> tantivy::Result<()> {
     // be indexed".

     // first we need to define a schema ...
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();

     // Our first field is title.
     // In this example we want to use NGram searching
@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
 (no textual change visible in this hunk; likely indentation only)
     // heap for the indexer can increase its throughput.
     let mut index_writer = index.writer(50_000_000)?;
     index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
                 he had gone eighty-four days now without taking a fish."
     ));
     index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
                bank and runs deep and green. The water is warm too, for it has slipped twinkling
                over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
 (no textual change visible in this hunk; likely indentation only)
                limbs and branches that arch over the pool"#
     ));
     index_writer.add_document(doc!(
        title => "Frankenstein",
        body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
                enterprise which you have regarded with such evil forebodings. I arrived here
                yesterday, and my first task is to assure my dear sister of my welfare and
@@ -104,11 +104,9 @@ fn main() -> tantivy::Result<()> {
     // here we want to get a hit on the 'ken' in Frankenstein
     let query = query_parser.parse_query("ken")?;

-    let mut top_collector = TopCollector::with_limit(10);
-    searcher.search(&*query, &mut top_collector)?;
-
-    let doc_addresses = top_collector.docs();
-    for doc_address in doc_addresses {
+    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+
+    for (_, doc_address) in top_docs {
         let retrieved_doc = searcher.doc(doc_address)?;
         println!("{}", schema.to_json(&retrieved_doc));
     }
Example: fetching a document by ISBN with a TermQuery

@@ -10,7 +10,7 @@
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopCollector;
+use tantivy::collector::TopDocs;
 use tantivy::query::TermQuery;
 use tantivy::schema::*;
 use tantivy::Index;
@@ -27,10 +27,9 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
     // The second argument is here to tell we don't care about decoding positions,
     // or term frequencies.
     let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
-    let mut top_collector = TopCollector::with_limit(1);
-    searcher.search(&term_query, &mut top_collector)?;
+    let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;

-    if let Some(doc_address) = top_collector.docs().first() {
+    if let Some((_score, doc_address)) = top_docs.first() {
         let doc = searcher.doc(*doc_address)?;
         Ok(Some(doc))
     } else {
@@ -44,7 +43,7 @@ fn main() -> tantivy::Result<()> {
     //
     // Check out the *basic_search* example if this makes
     // small sense to you.
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();

     // Tantivy does not really have a notion of primary id.
     // This may change in the future.
Faceted search example

@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
     // sake of this example
     let index_path = TempDir::new("tantivy_facet_example_dir")?;
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();

     schema_builder.add_text_field("name", TEXT | STORED);

@@ -62,11 +62,10 @@ fn main() -> tantivy::Result<()> {
     let mut facet_collector = FacetCollector::for_field(tags);
     facet_collector.add_facet("/pools");

-    searcher.search(&AllQuery, &mut facet_collector).unwrap();
-
-    let counts = facet_collector.harvest();
+    let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
+
     // This lists all of the facet counts
-    let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
+    let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect();
     assert_eq!(
         facets,
         vec![
Example using DocSet and Postings

@@ -18,7 +18,7 @@ use tantivy::{DocId, DocSet, Postings};
 fn main() -> tantivy::Result<()> {
     // We first create a schema for the sake of the
     // example. Check the `basic_search` example for more information.
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();

     // For this example, we need to make sure to index positions for our title
     // field. `TEXT` precisely does this.
Snippet example

@@ -10,11 +10,11 @@ extern crate tempdir;
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopCollector;
+use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
-use tantivy::SnippetGenerator;
+use tantivy::{Snippet, SnippetGenerator};
 use tempdir::TempDir;

 fn main() -> tantivy::Result<()> {
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
     let index_path = TempDir::new("tantivy_example_dir")?;

     // # Defining the schema
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let title = schema_builder.add_text_field("title", TEXT | STORED);
     let body = schema_builder.add_text_field("body", TEXT | STORED);
     let schema = schema_builder.build();
@@ -35,15 +35,15 @@ fn main() -> tantivy::Result<()> {
 (no textual change visible in this hunk; likely indentation only)

     // we'll only need one doc for this example.
     index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
     ));
     // ...
     index_writer.commit()?;
@@ -54,18 +54,34 @@ fn main() -> tantivy::Result<()> {
     let query_parser = QueryParser::for_index(&index, vec![title, body]);
     let query = query_parser.parse_query("sycamore spring")?;

-    let mut top_collector = TopCollector::with_limit(10);
-    searcher.search(&*query, &mut top_collector)?;
+    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

-    let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
+    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

-    let doc_addresses = top_collector.docs();
-    for doc_address in doc_addresses {
+    for (score, doc_address) in top_docs {
         let doc = searcher.doc(doc_address)?;
         let snippet = snippet_generator.snippet_from_doc(&doc);
+        println!("Document score {}:", score);
         println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
         println!("snippet: {}", snippet.to_html());
+        println!("custom highlighting: {}", highlight(snippet));
     }

     Ok(())
 }

+fn highlight(snippet: Snippet) -> String {
+    let mut result = String::new();
+    let mut start_from = 0;
+
+    for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) {
+        result.push_str(&snippet.fragments()[start_from..start]);
+        result.push_str(" --> ");
+        result.push_str(&snippet.fragments()[start..end]);
+        result.push_str(" <-- ");
+        start_from = end;
+    }
+
+    result.push_str(&snippet.fragments()[start_from..]);
+    result
+}
Stop words example

@@ -15,7 +15,7 @@ extern crate tempdir;
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopCollector;
+use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::*;
@@ -23,7 +23,7 @@ use tantivy::Index;

 fn main() -> tantivy::Result<()> {
     // this example assumes you understand the content in `basic_search`
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();

     // This configures your custom options for how tantivy will
     // store and process your content in the index; The key
@@ -72,26 +72,26 @@ fn main() -> tantivy::Result<()> {
 (no textual change visible in this hunk; likely indentation only)
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
                 he had gone eighty-four days now without taking a fish."
     ));

     index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
     ));

     index_writer.add_document(doc!(
        title => "Frankenstein",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
     ));

     index_writer.commit()?;
@@ -105,15 +105,11 @@ fn main() -> tantivy::Result<()> {
     // stop words are applied on the query as well.
     // The following will be equivalent to `title:frankenstein`
     let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
+    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

-    let mut top_collector = TopCollector::with_limit(10);
-    searcher.search(&*query, &mut top_collector)?;
-
-    let doc_addresses = top_collector.docs();
-
-    for doc_address in doc_addresses {
+    for (score, doc_address) in top_docs {
         let retrieved_doc = searcher.doc(doc_address)?;
+        println!("\n==\nDocument score {}:", score);
         println!("{}", schema.to_json(&retrieved_doc));
     }
Example with a u64 field

@@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> {
     // Check out the basic example if this is confusing to you.
     //
     // first we need to define a schema ...
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     schema_builder.add_text_field("title", TEXT | STORED);
     schema_builder.add_text_field("body", TEXT);
     schema_builder.add_u64_field("year", INT_INDEXED);
Collector chaining module (deleted file)

@@ -1,142 +0,0 @@
The entire file is removed. Its former content:

use collector::Collector;
use DocId;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;

/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
    #[inline]
    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
        Ok(())
    }
    #[inline]
    fn collect(&mut self, _doc: DocId, _score: Score) {}
    #[inline]
    fn requires_scoring(&self) -> bool {
        false
    }
}

/// Zero-cost abstraction used to collect on multiple collectors.
/// This contraption is only usable if the type of your collectors
/// are known at compile time.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
///     let mut schema_builder = SchemaBuilder::new();
///     let title = schema_builder.add_text_field("title", TEXT);
///     let schema = schema_builder.build();
///     let index = Index::create_in_ram(schema);
///     {
///         let mut index_writer = index.writer(3_000_000)?;
///         index_writer.add_document(doc!(
///             title => "The Name of the Wind",
///         ));
///         index_writer.add_document(doc!(
///             title => "The Diary of Muadib",
///         ));
///         index_writer.add_document(doc!(
///             title => "A Dairy Cow",
///         ));
///         index_writer.add_document(doc!(
///             title => "The Diary of a Young Girl",
///         ));
///         index_writer.commit().unwrap();
///     }
///
///     index.load_searchers()?;
///     let searcher = index.searcher();
///
///     {
///         let mut top_collector = TopCollector::with_limit(2);
///         let mut count_collector = CountCollector::default();
///         {
///             let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
///             let query_parser = QueryParser::for_index(&index, vec![title]);
///             let query = query_parser.parse_query("diary")?;
///             searcher.search(&*query, &mut collectors).unwrap();
///         }
///         assert_eq!(count_collector.count(), 2);
///         assert!(top_collector.at_capacity());
///     }
///
///     Ok(())
/// }
/// ```
pub struct ChainedCollector<Left: Collector, Right: Collector> {
    left: Left,
    right: Right,
}

impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
    /// Adds a collector
    pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
        ChainedCollector {
            left: self,
            right: new_collector,
        }
    }
}

impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
    fn set_segment(
        &mut self,
        segment_local_id: SegmentLocalId,
        segment: &SegmentReader,
    ) -> Result<()> {
        self.left.set_segment(segment_local_id, segment)?;
        self.right.set_segment(segment_local_id, segment)?;
        Ok(())
    }

    fn collect(&mut self, doc: DocId, score: Score) {
        self.left.collect(doc, score);
        self.right.collect(doc, score);
    }

    fn requires_scoring(&self) -> bool {
        self.left.requires_scoring() || self.right.requires_scoring()
    }
}

/// Creates a `ChainedCollector`
pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
    ChainedCollector {
        left: DoNothingCollector,
        right: DoNothingCollector,
    }
}

#[cfg(test)]
mod tests {

    use super::*;
    use collector::{Collector, CountCollector, TopCollector};

    #[test]
    fn test_chained_collector() {
        let mut top_collector = TopCollector::with_limit(2);
        let mut count_collector = CountCollector::default();
        {
            let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
            collectors.collect(1, 0.2);
            collectors.collect(2, 0.1);
            collectors.collect(3, 0.5);
        }
        assert_eq!(count_collector.count(), 3);
        assert!(top_collector.at_capacity());
    }
}
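The chaining machinery is dropped rather than ported: in the 0.8 API, `search` returns each collector's fruit directly, so the scenario the deleted doc-test demonstrated (counting matches while also keeping the top hits) no longer needs a dedicated combinator. A sketch of that same scenario using only the collectors that appear elsewhere in this comparison (whether 0.8 also offers a combined multi-collector is not visible in this diff):

    let query_parser = QueryParser::for_index(&index, vec![title]);
    let query = query_parser.parse_query("diary")?;

    // Two plain searches instead of one chained collector.
    let count = searcher.search(&query, &Count)?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;

    assert_eq!(count, 2);
    assert_eq!(top_docs.len(), 2);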
Count collector module

@@ -1,4 +1,5 @@
 use super::Collector;
+use collector::SegmentCollector;
 use DocId;
 use Result;
 use Score;
@@ -11,14 +12,14 @@ use SegmentReader;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{SchemaBuilder, TEXT};
+/// use tantivy::schema::{Schema, TEXT};
 /// use tantivy::{Index, Result};
-/// use tantivy::collector::CountCollector;
+/// use tantivy::collector::Count;
 /// use tantivy::query::QueryParser;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-///     let mut schema_builder = SchemaBuilder::new();
+///     let mut schema_builder = Schema::builder();
 ///     let title = schema_builder.add_text_field("title", TEXT);
 ///     let schema = schema_builder.build();
 ///     let index = Index::create_in_ram(schema);
@@ -43,59 +44,86 @@ use SegmentReader;
 ///     let searcher = index.searcher();
 ///
 ///     {
-///         let mut count_collector = CountCollector::default();
 ///         let query_parser = QueryParser::for_index(&index, vec![title]);
 ///         let query = query_parser.parse_query("diary")?;
-///         searcher.search(&*query, &mut count_collector).unwrap();
+///         let count = searcher.search(&query, &Count).unwrap();
 ///
-///         assert_eq!(count_collector.count(), 2);
+///         assert_eq!(count, 2);
 ///     }
 ///
 ///     Ok(())
 /// }
 /// ```
-#[derive(Default)]
-pub struct CountCollector {
-    count: usize,
-}
-
-impl CountCollector {
-    /// Returns the count of documents that were
-    /// collected.
-    pub fn count(&self) -> usize {
-        self.count
-    }
-}
-
-impl Collector for CountCollector {
-    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
-        Ok(())
-    }
-
-    fn collect(&mut self, _: DocId, _: Score) {
-        self.count += 1;
-    }
-
-    fn requires_scoring(&self) -> bool {
-        false
-    }
-}
+pub struct Count;
+
+impl Collector for Count {
+    type Fruit = usize;
+
+    type Child = SegmentCountCollector;
+
+    fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result<SegmentCountCollector> {
+        Ok(SegmentCountCollector::default())
+    }
+
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+
+    fn merge_fruits(&self, segment_counts: Vec<usize>) -> Result<usize> {
+        Ok(segment_counts.into_iter().sum())
+    }
+}
+
+#[derive(Default)]
+pub struct SegmentCountCollector {
+    count: usize,
+}
+
+impl SegmentCollector for SegmentCountCollector {
+    type Fruit = usize;
+
+    fn collect(&mut self, _: DocId, _: Score) {
+        self.count += 1;
+    }
+
+    fn harvest(self) -> usize {
+        self.count
+    }
+}

 #[cfg(test)]
 mod tests {
-    use collector::{Collector, CountCollector};
+    use super::{Count, SegmentCountCollector};
+    use collector::Collector;
+    use collector::SegmentCollector;

     #[test]
-    fn test_count_collector() {
-        let mut count_collector = CountCollector::default();
-        assert_eq!(count_collector.count(), 0);
-        count_collector.collect(0u32, 1f32);
-        assert_eq!(count_collector.count(), 1);
-        assert_eq!(count_collector.count(), 1);
-        count_collector.collect(1u32, 1f32);
-        assert_eq!(count_collector.count(), 2);
-        assert!(!count_collector.requires_scoring());
-    }
+    fn test_count_collect_does_not_requires_scoring() {
+        assert!(!Count.requires_scoring());
+    }
+
+    #[test]
+    fn test_segment_count_collector() {
+        {
+            let count_collector = SegmentCountCollector::default();
+            assert_eq!(count_collector.harvest(), 0);
+        }
+        {
+            let mut count_collector = SegmentCountCollector::default();
+            count_collector.collect(0u32, 1f32);
+            assert_eq!(count_collector.harvest(), 1);
+        }
+        {
+            let mut count_collector = SegmentCountCollector::default();
+            count_collector.collect(0u32, 1f32);
+            assert_eq!(count_collector.harvest(), 1);
+        }
+        {
+            let mut count_collector = SegmentCountCollector::default();
+            count_collector.collect(0u32, 1f32);
+            count_collector.collect(1u32, 1f32);
+            assert_eq!(count_collector.harvest(), 2);
+        }
+    }
 }
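Condensed from the doc-test change above, counting matches becomes a single call whose return value is the count itself, rather than a collector that has to be created, mutated and then read back:

    // tantivy 0.7
    let mut count_collector = CountCollector::default();
    searcher.search(&*query, &mut count_collector)?;
    let count = count_collector.count();

    // tantivy 0.8
    let count: usize = searcher.search(&query, &Count)?;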
@@ -1,20 +1,17 @@
|
|||||||
use collector::Collector;
|
use collector::Collector;
|
||||||
|
use collector::SegmentCollector;
|
||||||
use docset::SkipResult;
|
use docset::SkipResult;
|
||||||
use fastfield::FacetReader;
|
use fastfield::FacetReader;
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use std::cell::UnsafeCell;
|
use std::cmp::Ordering;
|
||||||
use std::collections::btree_map;
|
use std::collections::btree_map;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use std::collections::Bound;
|
use std::collections::Bound;
|
||||||
use std::iter::Peekable;
|
use std::iter::Peekable;
|
||||||
use std::mem;
|
|
||||||
use std::{u64, usize};
|
use std::{u64, usize};
|
||||||
use termdict::TermMerger;
|
|
||||||
|
|
||||||
use std::cmp::Ordering;
|
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
@@ -46,12 +43,6 @@ impl<'a> Ord for Hit<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct SegmentFacetCounter {
|
|
||||||
pub facet_reader: FacetReader,
|
|
||||||
pub facet_ords: Vec<u64>,
|
|
||||||
pub facet_counts: Vec<u64>,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||||
if facet_bytes.is_empty() {
|
if facet_bytes.is_empty() {
|
||||||
0
|
0
|
||||||
@@ -91,14 +82,14 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
/// extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
|
/// use tantivy::schema::{Facet, Schema, TEXT};
|
||||||
/// use tantivy::{Index, Result};
|
/// use tantivy::{Index, Result};
|
||||||
/// use tantivy::collector::FacetCollector;
|
/// use tantivy::collector::FacetCollector;
|
||||||
/// use tantivy::query::AllQuery;
|
/// use tantivy::query::AllQuery;
|
||||||
///
|
///
|
||||||
/// # fn main() { example().unwrap(); }
|
/// # fn main() { example().unwrap(); }
|
||||||
/// fn example() -> Result<()> {
|
/// fn example() -> Result<()> {
|
||||||
/// let mut schema_builder = SchemaBuilder::new();
|
/// let mut schema_builder = Schema::builder();
|
||||||
///
|
///
|
||||||
/// // Facet have their own specific type.
|
/// // Facet have their own specific type.
|
||||||
/// // It is not a bad practise to put all of your
|
/// // It is not a bad practise to put all of your
|
||||||
@@ -141,13 +132,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/lang");
|
/// facet_collector.add_facet("/lang");
|
||||||
/// facet_collector.add_facet("/category");
|
/// facet_collector.add_facet("/category");
|
||||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
///
|
|
||||||
/// // this object contains count aggregate for all of the facets.
|
|
||||||
/// let counts = facet_collector.harvest();
|
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = counts
|
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||||
/// .get("/category")
|
/// .get("/category")
|
||||||
/// .collect();
|
/// .collect();
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
@@ -159,13 +147,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/category/fiction");
|
/// facet_collector.add_facet("/category/fiction");
|
||||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
///
|
|
||||||
/// // this object contains count aggregate for all of the facets.
|
|
||||||
/// let counts = facet_collector.harvest();
|
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = counts
|
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||||
/// .get("/category/fiction")
|
/// .get("/category/fiction")
|
||||||
/// .collect();
|
/// .collect();
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
@@ -178,13 +163,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/category/fiction");
|
/// facet_collector.add_facet("/category/fiction");
|
||||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
///
|
|
||||||
/// // this object contains count aggregate for all of the facets.
|
|
||||||
/// let counts = facet_collector.harvest();
|
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
|
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
||||||
/// ]);
|
/// ]);
|
||||||
@@ -194,28 +176,28 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
pub struct FacetCollector {
|
pub struct FacetCollector {
|
||||||
facet_ords: Vec<u64>,
|
|
||||||
field: Field,
|
field: Field,
|
||||||
ff_reader: Option<UnsafeCell<FacetReader>>,
|
|
||||||
segment_counters: Vec<SegmentFacetCounter>,
|
|
||||||
|
|
||||||
// facet_ord -> collapse facet_id
|
|
||||||
current_segment_collapse_mapping: Vec<usize>,
|
|
||||||
// collapse facet_id -> count
|
|
||||||
current_segment_counts: Vec<u64>,
|
|
||||||
// collapse facet_id -> facet_ord
|
|
||||||
current_collapse_facet_ords: Vec<u64>,
|
|
||||||
|
|
||||||
facets: BTreeSet<Facet>,
|
facets: BTreeSet<Facet>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct FacetSegmentCollector {
|
||||||
|
reader: FacetReader,
|
||||||
|
facet_ords_buf: Vec<u64>,
|
||||||
|
// facet_ord -> collapse facet_id
|
||||||
|
collapse_mapping: Vec<usize>,
|
||||||
|
// collapse facet_id -> count
|
||||||
|
counts: Vec<u64>,
|
||||||
|
// collapse facet_id -> facet_ord
|
||||||
|
collapse_facet_ords: Vec<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||||
target: &[u8],
|
target: &[u8],
|
||||||
collapse_it: &mut Peekable<I>,
|
collapse_it: &mut Peekable<I>,
|
||||||
) -> SkipResult {
|
) -> SkipResult {
|
||||||
loop {
|
loop {
|
||||||
match collapse_it.peek() {
|
match collapse_it.peek() {
|
||||||
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
|
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
|
||||||
Ordering::Less => {}
|
Ordering::Less => {}
|
||||||
Ordering::Greater => {
|
Ordering::Greater => {
|
||||||
return SkipResult::OverStep;
|
return SkipResult::OverStep;
|
||||||
@@ -240,15 +222,8 @@ impl FacetCollector {
|
|||||||
/// is of the proper type.
|
/// is of the proper type.
|
||||||
pub fn for_field(field: Field) -> FacetCollector {
|
pub fn for_field(field: Field) -> FacetCollector {
|
||||||
FacetCollector {
|
FacetCollector {
|
||||||
facet_ords: Vec::with_capacity(255),
|
|
||||||
segment_counters: Vec::new(),
|
|
||||||
field,
|
field,
|
||||||
ff_reader: None,
|
facets: BTreeSet::default(),
|
||||||
facets: BTreeSet::new(),
|
|
||||||
|
|
||||||
current_segment_collapse_mapping: Vec::new(),
|
|
||||||
current_collapse_facet_ords: Vec::new(),
|
|
||||||
current_segment_counts: Vec::new(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -278,143 +253,100 @@ impl FacetCollector {
|
|||||||
}
|
}
|
||||||
self.facets.insert(facet);
|
self.facets.insert(facet);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
|
||||||
self.current_segment_collapse_mapping.clear();
|
|
||||||
self.current_collapse_facet_ords.clear();
|
|
||||||
self.current_segment_counts.clear();
|
|
||||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
|
||||||
self.current_collapse_facet_ords.push(0);
|
|
||||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
|
||||||
if !facet_streamer.advance() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
'outer: loop {
|
|
||||||
                // at the beginning of this loop
|
|
||||||
                // is positioned on a term that has not been processed yet.
|
|
||||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
|
||||||
match skip_result {
|
|
||||||
SkipResult::Reached => {
|
|
||||||
// we reach a facet we decided to collapse.
|
|
||||||
let collapse_depth = facet_depth(facet_streamer.key());
|
|
||||||
let mut collapsed_id = 0;
|
|
||||||
self.current_segment_collapse_mapping.push(0);
|
|
||||||
while facet_streamer.advance() {
|
|
||||||
let depth = facet_depth(facet_streamer.key());
|
|
||||||
if depth <= collapse_depth {
|
|
||||||
continue 'outer;
|
|
||||||
}
|
|
||||||
if depth == collapse_depth + 1 {
|
|
||||||
collapsed_id = self.current_collapse_facet_ords.len();
|
|
||||||
self.current_collapse_facet_ords
|
|
||||||
.push(facet_streamer.term_ord());
|
|
||||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
|
||||||
} else {
|
|
||||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
SkipResult::End | SkipResult::OverStep => {
|
|
||||||
self.current_segment_collapse_mapping.push(0);
|
|
||||||
if !facet_streamer.advance() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn finalize_segment(&mut self) {
|
|
||||||
if self.ff_reader.is_some() {
|
|
||||||
self.segment_counters.push(SegmentFacetCounter {
|
|
||||||
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
|
||||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
|
||||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the results of the collection.
|
|
||||||
///
|
|
||||||
/// This method does not just return the counters,
|
|
||||||
/// it also translates the facet ordinals of the last segment.
|
|
||||||
pub fn harvest(mut self) -> FacetCounts {
|
|
||||||
self.finalize_segment();
|
|
||||||
|
|
||||||
let collapsed_facet_ords: Vec<&[u64]> = self
|
|
||||||
.segment_counters
|
|
||||||
.iter()
|
|
||||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
|
||||||
.collect();
|
|
||||||
let collapsed_facet_counts: Vec<&[u64]> = self
|
|
||||||
.segment_counters
|
|
||||||
.iter()
|
|
||||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let facet_streams = self
|
|
||||||
.segment_counters
|
|
||||||
.iter()
|
|
||||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
let mut facet_merger = TermMerger::new(facet_streams);
|
|
||||||
let mut facet_counts = BTreeMap::new();
|
|
||||||
|
|
||||||
while facet_merger.advance() {
|
|
||||||
let count = facet_merger
|
|
||||||
.current_kvs()
|
|
||||||
.iter()
|
|
||||||
.map(|it| {
|
|
||||||
let seg_ord = it.segment_ord;
|
|
||||||
let term_ord = it.streamer.term_ord();
|
|
||||||
collapsed_facet_ords[seg_ord]
|
|
||||||
.binary_search(&term_ord)
|
|
||||||
.map(|collapsed_term_id| {
|
|
||||||
if collapsed_term_id == 0 {
|
|
||||||
0
|
|
||||||
} else {
|
|
||||||
collapsed_facet_counts[seg_ord][collapsed_term_id]
|
|
||||||
}
|
|
||||||
}).unwrap_or(0)
|
|
||||||
}).sum();
|
|
||||||
if count > 0u64 {
|
|
||||||
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
|
||||||
                // may create a corrupted facet if the term dictionary is corrupted
|
|
||||||
let facet = unsafe { Facet::from_encoded(bytes) };
|
|
||||||
facet_counts.insert(facet, count);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
FacetCounts { facet_counts }
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Collector for FacetCollector {
|
impl Collector for FacetCollector {
|
||||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
type Fruit = FacetCounts;
|
||||||
self.finalize_segment();
|
|
||||||
|
type Child = FacetSegmentCollector;
|
||||||
|
|
||||||
|
fn for_segment(
|
||||||
|
&self,
|
||||||
|
_: SegmentLocalId,
|
||||||
|
reader: &SegmentReader,
|
||||||
|
) -> Result<FacetSegmentCollector> {
|
||||||
let facet_reader = reader.facet_reader(self.field)?;
|
let facet_reader = reader.facet_reader(self.field)?;
|
||||||
self.set_collapse_mapping(&facet_reader);
|
|
||||||
self.current_segment_counts
|
let mut collapse_mapping = Vec::new();
|
||||||
.resize(self.current_collapse_facet_ords.len(), 0);
|
let mut counts = Vec::new();
|
||||||
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
let mut collapse_facet_ords = Vec::new();
|
||||||
Ok(())
|
|
||||||
|
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||||
|
collapse_facet_ords.push(0);
|
||||||
|
{
|
||||||
|
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||||
|
if facet_streamer.advance() {
|
||||||
|
'outer: loop {
|
||||||
|
                    // at the beginning of this loop
|
||||||
|
                    // is positioned on a term that has not been processed yet.
|
||||||
|
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||||
|
match skip_result {
|
||||||
|
SkipResult::Reached => {
|
||||||
|
// we reach a facet we decided to collapse.
|
||||||
|
let collapse_depth = facet_depth(facet_streamer.key());
|
||||||
|
let mut collapsed_id = 0;
|
||||||
|
collapse_mapping.push(0);
|
||||||
|
while facet_streamer.advance() {
|
||||||
|
let depth = facet_depth(facet_streamer.key());
|
||||||
|
if depth <= collapse_depth {
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
if depth == collapse_depth + 1 {
|
||||||
|
collapsed_id = collapse_facet_ords.len();
|
||||||
|
collapse_facet_ords.push(facet_streamer.term_ord());
|
||||||
|
collapse_mapping.push(collapsed_id);
|
||||||
|
} else {
|
||||||
|
collapse_mapping.push(collapsed_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
SkipResult::End | SkipResult::OverStep => {
|
||||||
|
collapse_mapping.push(0);
|
||||||
|
if !facet_streamer.advance() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
counts.resize(collapse_facet_ords.len(), 0);
|
||||||
|
|
||||||
|
Ok(FacetSegmentCollector {
|
||||||
|
reader: facet_reader,
|
||||||
|
facet_ords_buf: Vec::with_capacity(255),
|
||||||
|
collapse_mapping,
|
||||||
|
counts,
|
||||||
|
collapse_facet_ords,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, segments_facet_counts: Vec<FacetCounts>) -> Result<FacetCounts> {
|
||||||
|
let mut facet_counts: BTreeMap<Facet, u64> = BTreeMap::new();
|
||||||
|
for segment_facet_counts in segments_facet_counts {
|
||||||
|
for (facet, count) in segment_facet_counts.facet_counts {
|
||||||
|
*(facet_counts.entry(facet).or_insert(0)) += count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(FacetCounts { facet_counts })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentCollector for FacetSegmentCollector {
|
||||||
|
type Fruit = FacetCounts;
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, _: Score) {
|
fn collect(&mut self, doc: DocId, _: Score) {
|
||||||
let facet_reader: &mut FacetReader = unsafe {
|
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
|
||||||
&mut *self
|
|
||||||
.ff_reader
|
|
||||||
.as_ref()
|
|
||||||
.expect("collect() was called before set_segment. This should never happen.")
|
|
||||||
.get()
|
|
||||||
};
|
|
||||||
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
|
||||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||||
for &facet_ord in &self.facet_ords {
|
for &facet_ord in &self.facet_ords_buf {
|
||||||
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
|
||||||
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
|
||||||
{
|
|
||||||
0
|
0
|
||||||
} else {
|
} else {
|
||||||
1
|
1
|
||||||
@@ -423,8 +355,24 @@ impl Collector for FacetCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
/// Returns the results of the collection.
|
||||||
false
|
///
|
||||||
|
/// This method does not just return the counters,
|
||||||
|
/// it also translates the facet ordinals of the last segment.
|
||||||
|
fn harvest(self) -> FacetCounts {
|
||||||
|
let mut facet_counts = BTreeMap::new();
|
||||||
|
let facet_dict = self.reader.facet_dict();
|
||||||
|
for (collapsed_facet_ord, count) in self.counts.iter().cloned().enumerate() {
|
||||||
|
if count == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let mut facet = vec![];
|
||||||
|
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
|
||||||
|
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
|
||||||
|
// TODO
|
||||||
|
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
|
||||||
|
}
|
||||||
|
FacetCounts { facet_counts }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -456,9 +404,9 @@ impl FacetCounts {
|
|||||||
let right_bound = if facet.is_root() {
|
let right_bound = if facet.is_root() {
|
||||||
Bound::Unbounded
|
Bound::Unbounded
|
||||||
} else {
|
} else {
|
||||||
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
|
||||||
facet_after_bytes.push(1u8);
|
facet_after_bytes.push('\u{1}');
|
||||||
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
let facet_after = Facet::from_encoded_string(facet_after_bytes);
|
||||||
Bound::Excluded(facet_after)
|
Bound::Excluded(facet_after)
|
||||||
};
|
};
|
||||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||||
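The right bound built just above leans on the facet encoding: a child facet extends its parent's encoded string with a `'\u{0}'` separator, so pushing `'\u{1}'` produces the smallest string that sorts after every descendant. Below is a standalone sketch of the same prefix-range trick over a plain `BTreeMap`; the keys and counts are made up for illustration.

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

fn main() {
    // Encoded facets: path segments joined with the '\u{0}' separator.
    let mut counts: BTreeMap<String, u64> = BTreeMap::new();
    counts.insert("category\u{0}fiction".to_string(), 3);
    counts.insert("category\u{0}fiction\u{0}fantasy".to_string(), 2);
    counts.insert("category\u{0}fiction\u{0}horror".to_string(), 1);
    counts.insert("category\u{0}nonfiction".to_string(), 4);

    // All descendants of "category/fiction": since '\u{0}' < '\u{1}',
    // appending '\u{1}' to the prefix gives an exclusive upper bound
    // that covers every child but nothing beyond the subtree.
    let prefix = "category\u{0}fiction".to_string();
    let mut upper = prefix.clone();
    upper.push('\u{1}');

    let children: Vec<&str> = counts
        .range((Bound::Excluded(&prefix), Bound::Excluded(&upper)))
        .map(|(facet, _count)| facet.as_str())
        .collect();
    assert_eq!(children.len(), 2); // fantasy and horror, not nonfiction
}
```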
@@ -505,14 +453,14 @@ mod tests {
|
|||||||
use core::Index;
|
use core::Index;
|
||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
use rand::distributions::Uniform;
|
use rand::distributions::Uniform;
|
||||||
|
use rand::prelude::SliceRandom;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use schema::Field;
|
use schema::{Document, Facet, Field, Schema};
|
||||||
use schema::{Document, Facet, SchemaBuilder};
|
|
||||||
use std::iter;
|
use std::iter;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_collector_drilldown() {
|
fn test_facet_collector_drilldown() {
|
||||||
let mut schema_builder = SchemaBuilder::new();
|
let mut schema_builder = Schema::builder();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -527,7 +475,8 @@ mod tests {
|
|||||||
n /= 4;
|
n /= 4;
|
||||||
let leaf = n % 5;
|
let leaf = n % 5;
|
||||||
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
||||||
}).collect();
|
})
|
||||||
|
.collect();
|
||||||
for i in 0..num_facets * 10 {
|
for i in 0..num_facets * 10 {
|
||||||
let mut doc = Document::new();
|
let mut doc = Document::new();
|
||||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||||
@@ -536,12 +485,10 @@ mod tests {
|
|||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
|
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet(Facet::from("/top1"));
|
facet_collector.add_facet(Facet::from("/top1"));
|
||||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
|
|
||||||
let counts: FacetCounts = facet_collector.harvest();
|
|
||||||
{
|
{
|
||||||
let facets: Vec<(String, u64)> = counts
|
let facets: Vec<(String, u64)> = counts
|
||||||
.get("/top1")
|
.get("/top1")
|
||||||
@@ -555,18 +502,16 @@ mod tests {
|
|||||||
("/top1/mid2", 50),
|
("/top1/mid2", 50),
|
||||||
("/top1/mid3", 50),
|
("/top1/mid3", 50),
|
||||||
]
|
]
|
||||||
.iter()
|
.iter()
|
||||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[should_panic(
|
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||||
expected = "Tried to add a facet which is a descendant of \
|
an already added facet.")]
|
||||||
an already added facet."
|
|
||||||
)]
|
|
||||||
fn test_misused_facet_collector() {
|
fn test_misused_facet_collector() {
|
||||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||||
facet_collector.add_facet(Facet::from("/country"));
|
facet_collector.add_facet(Facet::from("/country"));
|
||||||
@@ -575,7 +520,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc_unsorted_multifacet() {
|
fn test_doc_unsorted_multifacet() {
|
||||||
let mut schema_builder = SchemaBuilder::new();
|
let mut schema_builder = Schema::builder();
|
||||||
let facet_field = schema_builder.add_facet_field("facets");
|
let facet_field = schema_builder.add_facet_field("facets");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -592,8 +537,7 @@ mod tests {
|
|||||||
assert_eq!(searcher.num_docs(), 1);
|
assert_eq!(searcher.num_docs(), 1);
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet("/subjects");
|
facet_collector.add_facet("/subjects");
|
||||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
let counts = facet_collector.harvest();
|
|
||||||
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
||||||
assert_eq!(facets[0].1, 1);
|
assert_eq!(facets[0].1, 1);
|
||||||
}
|
}
|
||||||
@@ -607,7 +551,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_collector_topk() {
|
fn test_facet_collector_topk() {
|
||||||
let mut schema_builder = SchemaBuilder::new();
|
let mut schema_builder = Schema::builder();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -619,14 +563,16 @@ mod tests {
|
|||||||
let facet = Facet::from(&format!("/facet/{}", c));
|
let facet = Facet::from(&format!("/facet/{}", c));
|
||||||
let doc = doc!(facet_field => facet);
|
let doc = doc!(facet_field => facet);
|
||||||
iter::repeat(doc).take(count)
|
iter::repeat(doc).take(count)
|
||||||
}).map(|mut doc| {
|
})
|
||||||
|
.map(|mut doc| {
|
||||||
doc.add_facet(
|
doc.add_facet(
|
||||||
facet_field,
|
facet_field,
|
||||||
&format!("/facet/{}", thread_rng().sample(&uniform)),
|
&format!("/facet/{}", thread_rng().sample(&uniform)),
|
||||||
);
|
);
|
||||||
doc
|
doc
|
||||||
}).collect();
|
})
|
||||||
thread_rng().shuffle(&mut docs[..]);
|
.collect();
|
||||||
|
docs[..].shuffle(&mut thread_rng());
|
||||||
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
for doc in docs {
|
for doc in docs {
|
||||||
@@ -639,9 +585,8 @@ mod tests {
|
|||||||
|
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet("/facet");
|
facet_collector.add_facet("/facet");
|
||||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
|
|
||||||
let counts: FacetCounts = facet_collector.harvest();
|
|
||||||
{
|
{
|
||||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -664,13 +609,13 @@ mod bench {
|
|||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use schema::SchemaBuilder;
|
use schema::Schema;
|
||||||
use test::Bencher;
|
use test::Bencher;
|
||||||
use Index;
|
use Index;
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_facet_collector(b: &mut Bencher) {
|
fn bench_facet_collector(b: &mut Bencher) {
|
||||||
let mut schema_builder = SchemaBuilder::new();
|
let mut schema_builder = Schema::builder();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -694,8 +639,8 @@ mod bench {
|
|||||||
|
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let facet_collector = FacetCollector::for_field(facet_field);
|
||||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ mod tests {
|
|||||||
// make sure we have facet counters correctly filled
|
// make sure we have facet counters correctly filled
|
||||||
fn test_facet_collector_results() {
|
fn test_facet_collector_results() {
|
||||||
|
|
||||||
let mut schema_builder = schema::SchemaBuilder::new();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||||
let text_field = schema_builder.add_text_field("text", STRING);
|
let text_field = schema_builder.add_text_field("text", STRING);
|
||||||
|
|||||||
@@ -1,7 +1,91 @@
|
|||||||
/*!
|
/*!
|
||||||
Defines how the documents matching a search query should be processed.
|
|
||||||
|
# Collectors
|
||||||
|
|
||||||
|
Collectors define the information you want to extract from the documents matching the queries.
|
||||||
|
In tantivy jargon, we call this information your search "fruit".
|
||||||
|
|
||||||
|
Your fruit could, for instance, be:
|
||||||
|
- [the count of matching documents](./struct.Count.html)
|
||||||
|
- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
|
||||||
|
- [facet counts](./struct.FacetCollector.html)
|
||||||
|
|
||||||
|
At one point in your code, you will trigger the actual search operation by calling
|
||||||
|
[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
|
||||||
|
This call will look like this:
|
||||||
|
|
||||||
|
```verbatim
|
||||||
|
let fruit = searcher.search(&query, &collector)?;
|
||||||
|
```
|
||||||
|
|
||||||
|
Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
|
||||||
|
|
||||||
|
|
||||||
|
# Combining several collectors
|
||||||
|
|
||||||
|
A rich search experience often requires running several collectors on your search query.
|
||||||
|
For instance,
|
||||||
|
- selecting the top-K products matching your query
|
||||||
|
- counting the matching documents
|
||||||
|
- computing several facets
|
||||||
|
- computing statistics about the matching product prices
|
||||||
|
|
||||||
|
A simple and efficient way to do that is to pass your collectors as one tuple.
|
||||||
|
The resulting `Fruit` will then be a typed tuple with each collector's original fruits
|
||||||
|
in their respective positions.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate tantivy;
|
||||||
|
# use tantivy::schema::*;
|
||||||
|
# use tantivy::*;
|
||||||
|
# use tantivy::query::*;
|
||||||
|
use tantivy::collector::{Count, TopDocs};
|
||||||
|
#
|
||||||
|
# fn main() -> tantivy::Result<()> {
|
||||||
|
# let mut schema_builder = Schema::builder();
|
||||||
|
# let title = schema_builder.add_text_field("title", TEXT);
|
||||||
|
# let schema = schema_builder.build();
|
||||||
|
# let index = Index::create_in_ram(schema);
|
||||||
|
# let mut index_writer = index.writer(3_000_000)?;
|
||||||
|
# index_writer.add_document(doc!(
|
||||||
|
# title => "The Name of the Wind",
|
||||||
|
# ));
|
||||||
|
# index_writer.add_document(doc!(
|
||||||
|
# title => "The Diary of Muadib",
|
||||||
|
# ));
|
||||||
|
# index_writer.commit().unwrap();
|
||||||
|
# index.load_searchers()?;
|
||||||
|
# let searcher = index.searcher();
|
||||||
|
# let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
|
# let query = query_parser.parse_query("diary")?;
|
||||||
|
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||||
|
searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||||
|
# Ok(())
|
||||||
|
# }
|
||||||
|
```
|
||||||
|
|
||||||
|
The `Collector` trait is implemented for up to 4 collectors.
|
||||||
|
If you have more than 4 collectors, you can either group them into
|
||||||
|
tuples of tuples `(a,(b,(c,d)))`, or rely on a `MultiCollector`.
|
||||||
|
|
||||||
|
# Combining several collectors dynamically
|
||||||
|
|
||||||
|
Combining collectors into a tuple is a zero-cost abstraction: everything
|
||||||
|
happens as if you had manually implemented a single collector
|
||||||
|
combining all of your collectors' features.
|
||||||
|
|
||||||
|
Unfortunately, it requires you to know your collector types at compile time.
|
||||||
|
If, on the other hand, the collectors depend on some query parameter,
|
||||||
|
you can rely on a `MultiCollector`.
|
||||||
|
|
||||||
|
|
||||||
|
# Implementing your own collectors.
|
||||||
|
|
||||||
|
See the `custom_collector` example, and the short sketch that follows this block.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
use downcast;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
@@ -9,7 +93,7 @@ use SegmentLocalId;
|
|||||||
use SegmentReader;
|
use SegmentReader;
|
||||||
|
|
||||||
mod count_collector;
|
mod count_collector;
|
||||||
pub use self::count_collector::CountCollector;
|
pub use self::count_collector::Count;
|
||||||
|
|
||||||
mod multi_collector;
|
mod multi_collector;
|
||||||
pub use self::multi_collector::MultiCollector;
|
pub use self::multi_collector::MultiCollector;
|
||||||
@@ -17,237 +101,267 @@ pub use self::multi_collector::MultiCollector;
|
|||||||
mod top_collector;
|
mod top_collector;
|
||||||
|
|
||||||
mod top_score_collector;
|
mod top_score_collector;
|
||||||
pub use self::top_score_collector::TopScoreCollector;
|
pub use self::top_score_collector::TopDocs;
|
||||||
#[deprecated]
|
|
||||||
pub use self::top_score_collector::TopScoreCollector as TopCollector;
|
|
||||||
|
|
||||||
mod top_field_collector;
|
mod top_field_collector;
|
||||||
pub use self::top_field_collector::TopFieldCollector;
|
pub use self::top_field_collector::TopDocsByField;
|
||||||
|
|
||||||
mod facet_collector;
|
mod facet_collector;
|
||||||
pub use self::facet_collector::FacetCollector;
|
pub use self::facet_collector::FacetCollector;
|
||||||
|
|
||||||
mod chained_collector;
|
/// `Fruit` is the type for the result of our collection.
|
||||||
pub use self::chained_collector::{chain, ChainedCollector};
|
/// e.g. `usize` for the `Count` collector.
|
||||||
|
pub trait Fruit: Send + downcast::Any {}
|
||||||
|
|
||||||
|
impl<T> Fruit for T where T: Send + downcast::Any {}
|
||||||
|
|
||||||
/// Collectors are in charge of collecting and retaining relevant
|
/// Collectors are in charge of collecting and retaining relevant
|
||||||
/// information from the documents found and scored by the query.
|
/// information from the documents found and scored by the query.
|
||||||
///
|
///
|
||||||
///
|
|
||||||
/// For instance,
|
/// For instance,
|
||||||
///
|
///
|
||||||
/// - keeping track of the top 10 best documents
|
/// - keeping track of the top 10 best documents
|
||||||
/// - computing a breakdown over a fast field
|
/// - computing a breakdown over a fast field
|
||||||
/// - computing the number of documents matching the query
|
/// - computing the number of documents matching the query
|
||||||
///
|
///
|
||||||
/// Queries are in charge of pushing the `DocSet` to the collector.
|
/// Our search index is in fact a collection of segments, so
|
||||||
|
/// the `Collector` trait is actually more of a factory that instantiates
|
||||||
|
/// a `SegmentCollector` for each segment.
|
||||||
///
|
///
|
||||||
/// As they work on multiple segments, they first inform
|
/// The collection logic itself is in the `SegmentCollector`.
|
||||||
/// the collector of a change in a segment and then
|
|
||||||
/// call the `collect` method to push the document to the collector.
|
|
||||||
///
|
|
||||||
/// Temporally, our collector will receive calls
|
|
||||||
/// - `.set_segment(0, segment_reader_0)`
|
|
||||||
/// - `.collect(doc0_of_segment_0)`
|
|
||||||
/// - `.collect(...)`
|
|
||||||
/// - `.collect(last_doc_of_segment_0)`
|
|
||||||
/// - `.set_segment(1, segment_reader_1)`
|
|
||||||
/// - `.collect(doc0_of_segment_1)`
|
|
||||||
/// - `.collect(...)`
|
|
||||||
/// - `.collect(last_doc_of_segment_1)`
|
|
||||||
/// - `...`
|
|
||||||
/// - `.collect(last_doc_of_last_segment)`
|
|
||||||
///
|
///
|
||||||
/// Segments are not guaranteed to be visited in any specific order.
|
/// Segments are not guaranteed to be visited in any specific order.
|
||||||
pub trait Collector {
|
pub trait Collector: Sync {
|
||||||
|
/// `Fruit` is the type for the result of our collection.
|
||||||
|
/// e.g. `usize` for the `Count` collector.
|
||||||
|
type Fruit: Fruit;
|
||||||
|
|
||||||
|
/// Type of the `SegmentCollector` associated to this collector.
|
||||||
|
type Child: SegmentCollector<Fruit = Self::Fruit>;
|
||||||
|
|
||||||
/// `set_segment` is called before beginning to enumerate
|
/// `for_segment` is called before beginning to enumerate
|
||||||
/// on this segment.
|
/// on this segment.
|
||||||
fn set_segment(
|
fn for_segment(
|
||||||
&mut self,
|
&self,
|
||||||
segment_local_id: SegmentLocalId,
|
segment_local_id: SegmentLocalId,
|
||||||
segment: &SegmentReader,
|
segment: &SegmentReader,
|
||||||
) -> Result<()>;
|
) -> Result<Self::Child>;
|
||||||
/// The query pushes the scored document to the collector via this method.
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score);
|
|
||||||
|
|
||||||
/// Returns true iff the collector requires to compute scores for documents.
|
/// Returns true iff the collector requires to compute scores for documents.
|
||||||
fn requires_scoring(&self) -> bool;
|
fn requires_scoring(&self) -> bool;
|
||||||
|
|
||||||
|
/// Combines the fruit associated to the collection of each segments
|
||||||
|
/// into one fruit.
|
||||||
|
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, C: Collector> Collector for &'a mut C {
|
/// The `SegmentCollector` is the trait in charge of defining the
|
||||||
fn set_segment(
|
/// collect operation at the scale of the segment.
|
||||||
&mut self,
|
///
|
||||||
segment_local_id: SegmentLocalId,
|
/// `.collect(doc, score)` will be called for every documents
|
||||||
segment: &SegmentReader,
|
/// matching the query.
|
||||||
) -> Result<()> {
|
pub trait SegmentCollector: 'static {
|
||||||
(*self).set_segment(segment_local_id, segment)
|
/// `Fruit` is the type for the result of our collection.
|
||||||
}
|
/// e.g. `usize` for the `Count` collector.
|
||||||
|
type Fruit: Fruit;
|
||||||
|
|
||||||
/// The query pushes the scored document to the collector via this method.
|
/// The query pushes the scored document to the collector via this method.
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
fn collect(&mut self, doc: DocId, score: Score);
|
||||||
C::collect(self, doc, score)
|
|
||||||
|
/// Extract the fruit of the collection from the `SegmentCollector`.
|
||||||
|
fn harvest(self) -> Self::Fruit;
|
||||||
|
}
|
||||||
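Put together, the two traits give the searcher a simple per-segment protocol: build a child, feed it documents, harvest it, and merge the harvests. Below is a schematic, single-threaded sketch of that control flow; the real `searcher.search` in 0.8 distributes segments over a thread pool and is not implemented this way literally.

```rust
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::{Result, SegmentReader};

// Schematic only: the per-segment lifecycle of a Collector.
fn run_collector<C: Collector>(collector: &C, segments: &[SegmentReader]) -> Result<C::Fruit> {
    let mut fruits = Vec::with_capacity(segments.len());
    for (ord, segment) in segments.iter().enumerate() {
        // One SegmentCollector per segment (SegmentLocalId is a u32).
        let child = collector.for_segment(ord as u32, segment)?;
        // In a real search, the query scorer would now push every matching
        // document into the child via `child.collect(doc, score)`.
        fruits.push(child.harvest());
    }
    // Per-segment fruits are merged back into a single result.
    collector.merge_fruits(fruits)
}
```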
|
|
||||||
|
// -----------------------------------------------
|
||||||
|
// Tuple implementations.
|
||||||
|
|
||||||
|
impl<Left, Right> Collector for (Left, Right)
|
||||||
|
where
|
||||||
|
Left: Collector,
|
||||||
|
Right: Collector,
|
||||||
|
{
|
||||||
|
type Fruit = (Left::Fruit, Right::Fruit);
|
||||||
|
type Child = (Left::Child, Right::Child);
|
||||||
|
|
||||||
|
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||||
|
let left = self.0.for_segment(segment_local_id, segment)?;
|
||||||
|
let right = self.1.for_segment(segment_local_id, segment)?;
|
||||||
|
Ok((left, right))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn requires_scoring(&self) -> bool {
|
||||||
C::requires_scoring(self)
|
self.0.requires_scoring() || self.1.requires_scoring()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(
|
||||||
|
&self,
|
||||||
|
children: Vec<(Left::Fruit, Right::Fruit)>,
|
||||||
|
) -> Result<(Left::Fruit, Right::Fruit)> {
|
||||||
|
let mut left_fruits = vec![];
|
||||||
|
let mut right_fruits = vec![];
|
||||||
|
for (left_fruit, right_fruit) in children {
|
||||||
|
left_fruits.push(left_fruit);
|
||||||
|
right_fruits.push(right_fruit);
|
||||||
|
}
|
||||||
|
Ok((
|
||||||
|
self.0.merge_fruits(left_fruits)?,
|
||||||
|
self.1.merge_fruits(right_fruits)?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<Left, Right> SegmentCollector for (Left, Right)
|
||||||
|
where
|
||||||
|
Left: SegmentCollector,
|
||||||
|
Right: SegmentCollector,
|
||||||
|
{
|
||||||
|
type Fruit = (Left::Fruit, Right::Fruit);
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
self.0.collect(doc, score);
|
||||||
|
self.1.collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||||
|
(self.0.harvest(), self.1.harvest())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3-Tuple
|
||||||
|
|
||||||
|
impl<One, Two, Three> Collector for (One, Two, Three)
|
||||||
|
where
|
||||||
|
One: Collector,
|
||||||
|
Two: Collector,
|
||||||
|
Three: Collector,
|
||||||
|
{
|
||||||
|
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
||||||
|
type Child = (One::Child, Two::Child, Three::Child);
|
||||||
|
|
||||||
|
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||||
|
let one = self.0.for_segment(segment_local_id, segment)?;
|
||||||
|
let two = self.1.for_segment(segment_local_id, segment)?;
|
||||||
|
let three = self.2.for_segment(segment_local_id, segment)?;
|
||||||
|
Ok((one, two, three))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||||
|
let mut one_fruits = vec![];
|
||||||
|
let mut two_fruits = vec![];
|
||||||
|
let mut three_fruits = vec![];
|
||||||
|
for (one_fruit, two_fruit, three_fruit) in children {
|
||||||
|
one_fruits.push(one_fruit);
|
||||||
|
two_fruits.push(two_fruit);
|
||||||
|
three_fruits.push(three_fruit);
|
||||||
|
}
|
||||||
|
Ok((
|
||||||
|
self.0.merge_fruits(one_fruits)?,
|
||||||
|
self.1.merge_fruits(two_fruits)?,
|
||||||
|
self.2.merge_fruits(three_fruits)?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<One, Two, Three> SegmentCollector for (One, Two, Three)
|
||||||
|
where
|
||||||
|
One: SegmentCollector,
|
||||||
|
Two: SegmentCollector,
|
||||||
|
Three: SegmentCollector,
|
||||||
|
{
|
||||||
|
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
self.0.collect(doc, score);
|
||||||
|
self.1.collect(doc, score);
|
||||||
|
self.2.collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||||
|
(self.0.harvest(), self.1.harvest(), self.2.harvest())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4-Tuple
|
||||||
|
|
||||||
|
impl<One, Two, Three, Four> Collector for (One, Two, Three, Four)
|
||||||
|
where
|
||||||
|
One: Collector,
|
||||||
|
Two: Collector,
|
||||||
|
Three: Collector,
|
||||||
|
Four: Collector,
|
||||||
|
{
|
||||||
|
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
||||||
|
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
|
||||||
|
|
||||||
|
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||||
|
let one = self.0.for_segment(segment_local_id, segment)?;
|
||||||
|
let two = self.1.for_segment(segment_local_id, segment)?;
|
||||||
|
let three = self.2.for_segment(segment_local_id, segment)?;
|
||||||
|
let four = self.3.for_segment(segment_local_id, segment)?;
|
||||||
|
Ok((one, two, three, four))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
self.0.requires_scoring()
|
||||||
|
|| self.1.requires_scoring()
|
||||||
|
|| self.2.requires_scoring()
|
||||||
|
|| self.3.requires_scoring()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||||
|
let mut one_fruits = vec![];
|
||||||
|
let mut two_fruits = vec![];
|
||||||
|
let mut three_fruits = vec![];
|
||||||
|
let mut four_fruits = vec![];
|
||||||
|
for (one_fruit, two_fruit, three_fruit, four_fruit) in children {
|
||||||
|
one_fruits.push(one_fruit);
|
||||||
|
two_fruits.push(two_fruit);
|
||||||
|
three_fruits.push(three_fruit);
|
||||||
|
four_fruits.push(four_fruit);
|
||||||
|
}
|
||||||
|
Ok((
|
||||||
|
self.0.merge_fruits(one_fruits)?,
|
||||||
|
self.1.merge_fruits(two_fruits)?,
|
||||||
|
self.2.merge_fruits(three_fruits)?,
|
||||||
|
self.3.merge_fruits(four_fruits)?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<One, Two, Three, Four> SegmentCollector for (One, Two, Three, Four)
|
||||||
|
where
|
||||||
|
One: SegmentCollector,
|
||||||
|
Two: SegmentCollector,
|
||||||
|
Three: SegmentCollector,
|
||||||
|
Four: SegmentCollector,
|
||||||
|
{
|
||||||
|
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
self.0.collect(doc, score);
|
||||||
|
self.1.collect(doc, score);
|
||||||
|
self.2.collect(doc, score);
|
||||||
|
self.3.collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||||
|
(
|
||||||
|
self.0.harvest(),
|
||||||
|
self.1.harvest(),
|
||||||
|
self.2.harvest(),
|
||||||
|
self.3.harvest(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(missing_docs)]
|
||||||
|
mod downcast_impl {
|
||||||
|
downcast!(super::Fruit);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests {
|
pub mod tests;
|
||||||
|
|
||||||
use super::*;
|
|
||||||
use core::SegmentReader;
|
|
||||||
use fastfield::BytesFastFieldReader;
|
|
||||||
use fastfield::FastFieldReader;
|
|
||||||
use schema::Field;
|
|
||||||
use DocId;
|
|
||||||
use Score;
|
|
||||||
use SegmentLocalId;
|
|
||||||
|
|
||||||
/// Stores all of the doc ids.
|
|
||||||
/// This collector is only used for tests.
|
|
||||||
    /// It is unusable in practice, as it does not store
|
|
||||||
/// the segment ordinals
|
|
||||||
pub struct TestCollector {
|
|
||||||
offset: DocId,
|
|
||||||
segment_max_doc: DocId,
|
|
||||||
docs: Vec<DocId>,
|
|
||||||
scores: Vec<Score>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TestCollector {
|
|
||||||
        /// Return the exhaustive list of documents.
|
|
||||||
pub fn docs(self) -> Vec<DocId> {
|
|
||||||
self.docs
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn scores(self) -> Vec<Score> {
|
|
||||||
self.scores
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for TestCollector {
|
|
||||||
fn default() -> TestCollector {
|
|
||||||
TestCollector {
|
|
||||||
offset: 0,
|
|
||||||
segment_max_doc: 0,
|
|
||||||
docs: Vec::new(),
|
|
||||||
scores: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for TestCollector {
|
|
||||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
|
||||||
self.offset += self.segment_max_doc;
|
|
||||||
self.segment_max_doc = reader.max_doc();
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.docs.push(doc + self.offset);
|
|
||||||
self.scores.push(score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Collects in order all of the fast fields for all of the
|
|
||||||
/// doc in the `DocSet`
|
|
||||||
///
|
|
||||||
/// This collector is mainly useful for tests.
|
|
||||||
pub struct FastFieldTestCollector {
|
|
||||||
vals: Vec<u64>,
|
|
||||||
field: Field,
|
|
||||||
ff_reader: Option<FastFieldReader<u64>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FastFieldTestCollector {
|
|
||||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
|
||||||
FastFieldTestCollector {
|
|
||||||
vals: Vec::new(),
|
|
||||||
field,
|
|
||||||
ff_reader: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn vals(self) -> Vec<u64> {
|
|
||||||
self.vals
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for FastFieldTestCollector {
|
|
||||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
|
||||||
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
|
||||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
|
||||||
self.vals.push(val);
|
|
||||||
}
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Collects in order all of the fast field bytes for all of the
|
|
||||||
/// docs in the `DocSet`
|
|
||||||
///
|
|
||||||
/// This collector is mainly useful for tests.
|
|
||||||
pub struct BytesFastFieldTestCollector {
|
|
||||||
vals: Vec<u8>,
|
|
||||||
field: Field,
|
|
||||||
ff_reader: Option<BytesFastFieldReader>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BytesFastFieldTestCollector {
|
|
||||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
|
||||||
BytesFastFieldTestCollector {
|
|
||||||
vals: Vec::new(),
|
|
||||||
field,
|
|
||||||
ff_reader: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn vals(self) -> Vec<u8> {
|
|
||||||
self.vals
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for BytesFastFieldTestCollector {
|
|
||||||
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
|
|
||||||
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: u32, _score: f32) {
|
|
||||||
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
|
|
||||||
self.vals.extend(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
mod bench {
|
|
||||||
use collector::{Collector, CountCollector};
|
|
||||||
use test::Bencher;
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn build_collector(b: &mut Bencher) {
|
|
||||||
b.iter(|| {
|
|
||||||
let mut count_collector = CountCollector::default();
|
|
||||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
|
||||||
for doc in docs {
|
|
||||||
count_collector.collect(doc, 1f32);
|
|
||||||
}
|
|
||||||
count_collector.count()
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,9 +1,97 @@
|
|||||||
use super::Collector;
|
use super::Collector;
|
||||||
|
use super::SegmentCollector;
|
||||||
|
use collector::Fruit;
|
||||||
|
use downcast::Downcast;
|
||||||
|
use std::marker::PhantomData;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
use SegmentLocalId;
|
use SegmentLocalId;
|
||||||
use SegmentReader;
|
use SegmentReader;
|
||||||
|
use TantivyError;
|
||||||
|
|
||||||
|
pub struct MultiFruit {
|
||||||
|
sub_fruits: Vec<Option<Box<Fruit>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct CollectorWrapper<TCollector: Collector>(TCollector);
|
||||||
|
|
||||||
|
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||||
|
type Fruit = Box<Fruit>;
|
||||||
|
type Child = Box<BoxableSegmentCollector>;
|
||||||
|
|
||||||
|
fn for_segment(
|
||||||
|
&self,
|
||||||
|
segment_local_id: u32,
|
||||||
|
reader: &SegmentReader,
|
||||||
|
) -> Result<Box<BoxableSegmentCollector>> {
|
||||||
|
let child = self.0.for_segment(segment_local_id, reader)?;
|
||||||
|
Ok(Box::new(SegmentCollectorWrapper(child)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
self.0.requires_scoring()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
|
||||||
|
let typed_fruit: Vec<TCollector::Fruit> = children
|
||||||
|
.into_iter()
|
||||||
|
.map(|untyped_fruit| {
|
||||||
|
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
|
||||||
|
.map(|boxed_but_typed| *boxed_but_typed)
|
||||||
|
.map_err(|e| {
|
||||||
|
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
|
||||||
|
TantivyError::InvalidArgument(err_msg)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.collect::<Result<_>>()?;
|
||||||
|
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
|
||||||
|
Ok(Box::new(merged_fruit))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentCollector for Box<BoxableSegmentCollector> {
|
||||||
|
type Fruit = Box<Fruit>;
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: u32, score: f32) {
|
||||||
|
self.as_mut().collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest(self) -> Box<Fruit> {
|
||||||
|
BoxableSegmentCollector::harvest_from_box(self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait BoxableSegmentCollector {
|
||||||
|
fn collect(&mut self, doc: u32, score: f32);
|
||||||
|
fn harvest_from_box(self: Box<Self>) -> Box<Fruit>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegmentCollector);
|
||||||
|
|
||||||
|
impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
|
||||||
|
for SegmentCollectorWrapper<TSegmentCollector>
|
||||||
|
{
|
||||||
|
fn collect(&mut self, doc: u32, score: f32) {
|
||||||
|
self.0.collect(doc, score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn harvest_from_box(self: Box<Self>) -> Box<Fruit> {
|
||||||
|
Box::new(self.0.harvest())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FruitHandle<TFruit: Fruit> {
|
||||||
|
pos: usize,
|
||||||
|
_phantom: PhantomData<TFruit>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||||
|
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
||||||
|
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
||||||
|
*Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// `MultiCollector` makes it possible to collect on more than one collector.
|
/// `MultiCollector` makes it possible to collect on more than one collector.
|
||||||
/// It should only be used for use cases where the Collector types are unknown
|
/// It should only be used for use cases where the Collector types are unknown
|
||||||
@@ -13,14 +101,14 @@ use SegmentReader;
|
|||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
/// extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
/// use tantivy::schema::{Schema, TEXT};
|
||||||
/// use tantivy::{Index, Result};
|
/// use tantivy::{Index, Result};
|
||||||
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
|
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
|
||||||
/// use tantivy::query::QueryParser;
|
/// use tantivy::query::QueryParser;
|
||||||
///
|
///
|
||||||
/// # fn main() { example().unwrap(); }
|
/// # fn main() { example().unwrap(); }
|
||||||
/// fn example() -> Result<()> {
|
/// fn example() -> Result<()> {
|
||||||
/// let mut schema_builder = SchemaBuilder::new();
|
/// let mut schema_builder = Schema::builder();
|
||||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||||
/// let schema = schema_builder.build();
|
/// let schema = schema_builder.build();
|
||||||
/// let index = Index::create_in_ram(schema);
|
/// let index = Index::create_in_ram(schema);
|
||||||
@@ -44,55 +132,115 @@ use SegmentReader;
|
|||||||
/// index.load_searchers()?;
|
/// index.load_searchers()?;
|
||||||
/// let searcher = index.searcher();
|
/// let searcher = index.searcher();
|
||||||
///
|
///
|
||||||
/// {
|
/// let mut collectors = MultiCollector::new();
|
||||||
/// let mut top_collector = TopCollector::with_limit(2);
|
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
|
||||||
/// let mut count_collector = CountCollector::default();
|
/// let count_handle = collectors.add_collector(Count);
|
||||||
/// {
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let mut collectors =
|
/// let query = query_parser.parse_query("diary")?;
|
||||||
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
/// let mut multi_fruit = searcher.search(&query, &collectors)?;
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
///
|
||||||
/// let query = query_parser.parse_query("diary")?;
|
/// let count = count_handle.extract(&mut multi_fruit);
|
||||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
/// let top_docs = top_docs_handle.extract(&mut multi_fruit);
|
||||||
/// }
|
///
|
||||||
/// assert_eq!(count_collector.count(), 2);
|
/// # assert_eq!(count, 2);
|
||||||
/// assert!(top_collector.at_capacity());
|
/// # assert_eq!(top_docs.len(), 2);
|
||||||
/// }
|
|
||||||
///
|
///
|
||||||
/// Ok(())
|
/// Ok(())
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
|
#[allow(clippy::type_complexity)]
|
||||||
|
#[derive(Default)]
|
||||||
pub struct MultiCollector<'a> {
|
pub struct MultiCollector<'a> {
|
||||||
collectors: Vec<&'a mut Collector>,
|
collector_wrappers:
|
||||||
|
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> MultiCollector<'a> {
|
impl<'a> MultiCollector<'a> {
|
||||||
/// Constructor
|
/// Create a new `MultiCollector`
|
||||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
pub fn new() -> Self {
|
||||||
MultiCollector { collectors }
|
Default::default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a new collector to our `MultiCollector`.
|
||||||
|
pub fn add_collector<'b: 'a, TCollector: Collector + 'b>(
|
||||||
|
&mut self,
|
||||||
|
collector: TCollector,
|
||||||
|
) -> FruitHandle<TCollector::Fruit> {
|
||||||
|
let pos = self.collector_wrappers.len();
|
||||||
|
self.collector_wrappers
|
||||||
|
.push(Box::new(CollectorWrapper(collector)));
|
||||||
|
FruitHandle {
|
||||||
|
pos,
|
||||||
|
_phantom: PhantomData,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Collector for MultiCollector<'a> {
|
impl<'a> Collector for MultiCollector<'a> {
|
||||||
fn set_segment(
|
type Fruit = MultiFruit;
|
||||||
&mut self,
|
type Child = MultiCollectorChild;
|
||||||
|
|
||||||
|
fn for_segment(
|
||||||
|
&self,
|
||||||
segment_local_id: SegmentLocalId,
|
segment_local_id: SegmentLocalId,
|
||||||
segment: &SegmentReader,
|
segment: &SegmentReader,
|
||||||
) -> Result<()> {
|
) -> Result<MultiCollectorChild> {
|
||||||
for collector in &mut self.collectors {
|
let children = self
|
||||||
collector.set_segment(segment_local_id, segment)?;
|
.collector_wrappers
|
||||||
}
|
.iter()
|
||||||
Ok(())
|
.map(|collector_wrapper| collector_wrapper.for_segment(segment_local_id, segment))
|
||||||
|
.collect::<Result<Vec<_>>>()?;
|
||||||
|
Ok(MultiCollectorChild { children })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
||||||
|
let mut segment_fruits_list: Vec<Vec<Box<Fruit>>> = (0..self.collector_wrappers.len())
|
||||||
|
.map(|_| Vec::with_capacity(segments_multifruits.len()))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
for segment_multifruit in segments_multifruits {
|
||||||
|
for (idx, segment_fruit_opt) in segment_multifruit.sub_fruits.into_iter().enumerate() {
|
||||||
|
if let Some(segment_fruit) = segment_fruit_opt {
|
||||||
|
segment_fruits_list[idx].push(segment_fruit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let sub_fruits = self
|
||||||
|
.collector_wrappers
|
||||||
|
.iter()
|
||||||
|
.zip(segment_fruits_list)
|
||||||
|
.map(|(child_collector, segment_fruits)| {
|
||||||
|
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
|
||||||
|
})
|
||||||
|
.collect::<Result<_>>()?;
|
||||||
|
Ok(MultiFruit { sub_fruits })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct MultiCollectorChild {
|
||||||
|
children: Vec<Box<BoxableSegmentCollector>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentCollector for MultiCollectorChild {
|
||||||
|
type Fruit = MultiFruit;
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
for collector in &mut self.collectors {
|
for child in &mut self.children {
|
||||||
collector.collect(doc, score);
|
child.collect(doc, score);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.collectors
|
fn harvest(self) -> MultiFruit {
|
||||||
.iter()
|
MultiFruit {
|
||||||
.any(|collector| collector.requires_scoring())
|
sub_fruits: self
|
||||||
|
.children
|
||||||
|
.into_iter()
|
||||||
|
.map(|child| Some(child.harvest()))
|
||||||
|
.collect(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,20 +248,42 @@ impl<'a> Collector for MultiCollector<'a> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use collector::{Collector, CountCollector, TopScoreCollector};
|
use collector::{Count, TopDocs};
|
||||||
|
use query::TermQuery;
|
||||||
|
use schema::IndexRecordOption;
|
||||||
|
use schema::{Schema, TEXT};
|
||||||
|
use Index;
|
||||||
|
use Term;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multi_collector() {
|
fn test_multi_collector() {
|
||||||
let mut top_collector = TopScoreCollector::with_limit(2);
|
let mut schema_builder = Schema::builder();
|
||||||
let mut count_collector = CountCollector::default();
|
let text = schema_builder.add_text_field("text", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut collectors =
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
index_writer.add_document(doc!(text=>"abc"));
|
||||||
collectors.collect(1, 0.2);
|
index_writer.add_document(doc!(text=>"abc abc abc"));
|
||||||
collectors.collect(2, 0.1);
|
index_writer.add_document(doc!(text=>"abc abc"));
|
||||||
collectors.collect(3, 0.5);
|
index_writer.commit().unwrap();
|
||||||
|
index_writer.add_document(doc!(text=>""));
|
||||||
|
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
||||||
|
index_writer.add_document(doc!(text=>"abc"));
|
||||||
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
assert_eq!(count_collector.count(), 3);
|
index.load_searchers().unwrap();
|
||||||
assert!(top_collector.at_capacity());
|
let searcher = index.searcher();
|
||||||
|
let term = Term::from_field_text(text, "abc");
|
||||||
|
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||||
|
|
||||||
|
let mut collectors = MultiCollector::new();
|
||||||
|
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
|
||||||
|
let count_handler = collectors.add_collector(Count);
|
||||||
|
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(count_handler.extract(&mut multifruits), 5);
|
||||||
|
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
src/collector/tests.rs (new file, 201 lines)
@@ -0,0 +1,201 @@
use super::*;
use core::SegmentReader;
use fastfield::BytesFastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
use DocAddress;
use DocId;
use Score;
use SegmentLocalId;

/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in practise, as it does not store
/// the segment ordinals
pub struct TestCollector;

pub struct TestSegmentCollector {
    segment_id: SegmentLocalId,
    fruit: TestFruit,
}

#[derive(Default)]
pub struct TestFruit {
    docs: Vec<DocAddress>,
    scores: Vec<Score>,
}

impl TestFruit {
    /// Return the list of matching documents exhaustively.
    pub fn docs(&self) -> &[DocAddress] {
        &self.docs[..]
    }

    pub fn scores(&self) -> &[Score] {
        &self.scores[..]
    }
}

impl Collector for TestCollector {
    type Fruit = TestFruit;
    type Child = TestSegmentCollector;

    fn for_segment(
        &self,
        segment_id: SegmentLocalId,
        _reader: &SegmentReader,
    ) -> Result<TestSegmentCollector> {
        Ok(TestSegmentCollector {
            segment_id,
            fruit: TestFruit::default(),
        })
    }

    fn requires_scoring(&self) -> bool {
        true
    }

    fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
        children.sort_by_key(|fruit| {
            if fruit.docs().is_empty() {
                0
            } else {
                fruit.docs()[0].segment_ord()
            }
        });
        let mut docs = vec![];
        let mut scores = vec![];
        for child in children {
            docs.extend(child.docs());
            scores.extend(child.scores);
        }
        Ok(TestFruit { docs, scores })
    }
}

impl SegmentCollector for TestSegmentCollector {
    type Fruit = TestFruit;

    fn collect(&mut self, doc: DocId, score: Score) {
        self.fruit.docs.push(DocAddress(self.segment_id, doc));
        self.fruit.scores.push(score);
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.fruit
    }
}

/// Collects in order all of the fast fields for all of the
/// doc in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
    field: Field,
}

pub struct FastFieldSegmentCollector {
    vals: Vec<u64>,
    reader: FastFieldReader<u64>,
}

impl FastFieldTestCollector {
    pub fn for_field(field: Field) -> FastFieldTestCollector {
        FastFieldTestCollector { field }
    }
}

impl Collector for FastFieldTestCollector {
    type Fruit = Vec<u64>;
    type Child = FastFieldSegmentCollector;

    fn for_segment(
        &self,
        _: SegmentLocalId,
        reader: &SegmentReader,
    ) -> Result<FastFieldSegmentCollector> {
        Ok(FastFieldSegmentCollector {
            vals: Vec::new(),
            reader: reader.fast_field_reader(self.field)?,
        })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(&self, children: Vec<Vec<u64>>) -> Result<Vec<u64>> {
        Ok(children.into_iter().flat_map(|v| v.into_iter()).collect())
    }
}

impl SegmentCollector for FastFieldSegmentCollector {
    type Fruit = Vec<u64>;

    fn collect(&mut self, doc: DocId, _score: Score) {
        let val = self.reader.get(doc);
        self.vals.push(val);
    }

    fn harvest(self) -> Vec<u64> {
        self.vals
    }
}

/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
    field: Field,
}

pub struct BytesFastFieldSegmentCollector {
    vals: Vec<u8>,
    reader: BytesFastFieldReader,
}

impl BytesFastFieldTestCollector {
    pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
        BytesFastFieldTestCollector { field }
    }
}

impl Collector for BytesFastFieldTestCollector {
    type Fruit = Vec<u8>;
    type Child = BytesFastFieldSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment: &SegmentReader,
    ) -> Result<BytesFastFieldSegmentCollector> {
        Ok(BytesFastFieldSegmentCollector {
            vals: Vec::new(),
            reader: segment.bytes_fast_field_reader(self.field)?,
        })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(&self, children: Vec<Vec<u8>>) -> Result<Vec<u8>> {
        Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
    }
}

impl SegmentCollector for BytesFastFieldSegmentCollector {
    type Fruit = Vec<u8>;

    fn collect(&mut self, doc: u32, _score: f32) {
        let data = self.reader.get_val(doc);
        self.vals.extend(data);
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.vals
    }
}
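One detail worth noting in the fixtures above: a hit is identified by `DocAddress(segment_ordinal, doc_id)`, which is why `TestCollector::merge_fruits` orders the per-segment fruits by the segment ordinal of their first hit before concatenating them. A tiny illustrative fragment (assuming the 0.8 `DocAddress` tuple struct used throughout these diffs):

    use tantivy::DocAddress;

    fn main() {
        // A document is addressed by (segment ordinal, doc id local to that segment).
        let addr = DocAddress(0u32, 5u32);
        assert_eq!(addr.segment_ord(), 0u32);
        assert_eq!(addr.doc(), 5u32);
    }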
@@ -1,56 +1,59 @@
+use serde::export::PhantomData;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use DocAddress;
 use DocId;
+use Result;
 use SegmentLocalId;
+use SegmentReader;

 /// Contains a feature (field, score, etc.) of a document along with the document address.
 ///
 /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
 /// default Rust heap is a max heap, whereas a min heap is needed.
-#[derive(Clone, Copy)]
-pub struct ComparableDoc<T> {
+///
+/// WARNING: equality is not what you would expect here.
+/// Two elements are equal if their feature is equal, and regardless of whether `doc`
+/// is equal. This should be perfectly fine for this usage, but let's make sure this
+/// struct is never public.
+struct ComparableDoc<T, D> {
     feature: T,
-    doc_address: DocAddress,
+    doc: D,
 }

-impl<T: PartialOrd> PartialOrd for ComparableDoc<T> {
+impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }

-impl<T: PartialOrd> Ord for ComparableDoc<T> {
+impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
     #[inline]
     fn cmp(&self, other: &Self) -> Ordering {
         other
             .feature
             .partial_cmp(&self.feature)
-            .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
+            .unwrap_or_else(|| Ordering::Equal)
     }
 }

-impl<T: PartialOrd> PartialEq for ComparableDoc<T> {
+impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
     fn eq(&self, other: &Self) -> bool {
         self.cmp(other) == Ordering::Equal
     }
 }

-impl<T: PartialOrd> Eq for ComparableDoc<T> {}
+impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}

-/// The Top Collector keeps track of the K documents
-/// sorted by type `T`.
-///
-/// The implementation is based on a `BinaryHeap`.
-/// The theorical complexity for collecting the top `K` out of `n` documents
-/// is `O(n log K)`.
-pub struct TopCollector<T> {
+pub(crate) struct TopCollector<T> {
     limit: usize,
-    heap: BinaryHeap<ComparableDoc<T>>,
-    segment_id: u32,
+    _marker: PhantomData<T>,
 }

-impl<T: PartialOrd + Clone> TopCollector<T> {
+impl<T> TopCollector<T>
+where
+    T: PartialOrd + Clone,
+{
     /// Creates a top collector, with a number of documents equal to "limit".
     ///
     /// # Panics
@@ -61,127 +64,156 @@ impl<T: PartialOrd + Clone> TopCollector<T> {
         }
         TopCollector {
             limit,
-            heap: BinaryHeap::with_capacity(limit),
-            segment_id: 0,
+            _marker: PhantomData,
         }
     }

-    /// Returns K best documents sorted in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    pub fn docs(&self) -> Vec<DocAddress> {
-        self.top_docs()
-            .into_iter()
-            .map(|(_feature, doc)| doc)
-            .collect()
+    pub fn limit(&self) -> usize {
+        self.limit
     }

-    /// Returns K best FeatureDocuments sorted in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    pub fn top_docs(&self) -> Vec<(T, DocAddress)> {
-        let mut feature_docs: Vec<ComparableDoc<T>> = self.heap.iter().cloned().collect();
-        feature_docs.sort();
-        feature_docs
-            .into_iter()
-            .map(
-                |ComparableDoc {
-                     feature,
-                     doc_address,
-                 }| (feature, doc_address),
-            ).collect()
+    pub fn merge_fruits(
+        &self,
+        children: Vec<Vec<(T, DocAddress)>>,
+    ) -> Result<Vec<(T, DocAddress)>> {
+        if self.limit == 0 {
+            return Ok(Vec::new());
+        }
+        let mut top_collector = BinaryHeap::new();
+        for child_fruit in children {
+            for (feature, doc) in child_fruit {
+                if top_collector.len() < self.limit {
+                    top_collector.push(ComparableDoc { feature, doc });
+                } else if let Some(mut head) = top_collector.peek_mut() {
+                    if head.feature < feature {
+                        *head = ComparableDoc { feature, doc };
+                    }
+                }
+            }
+        }
+        Ok(top_collector
+            .into_sorted_vec()
+            .into_iter()
+            .map(|cdoc| (cdoc.feature, cdoc.doc))
+            .collect())
+    }
+
+    pub(crate) fn for_segment(
+        &self,
+        segment_id: SegmentLocalId,
+        _: &SegmentReader,
+    ) -> Result<TopSegmentCollector<T>> {
+        Ok(TopSegmentCollector::new(segment_id, self.limit))
+    }
+}
+
+/// The Top Collector keeps track of the K documents
+/// sorted by type `T`.
+///
+/// The implementation is based on a `BinaryHeap`.
+/// The theorical complexity for collecting the top `K` out of `n` documents
+/// is `O(n log K)`.
+pub(crate) struct TopSegmentCollector<T> {
+    limit: usize,
+    heap: BinaryHeap<ComparableDoc<T, DocId>>,
+    segment_id: u32,
+}
+
+impl<T: PartialOrd> TopSegmentCollector<T> {
+    fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
+        TopSegmentCollector {
+            limit,
+            heap: BinaryHeap::with_capacity(limit),
+            segment_id,
+        }
+    }
+}
+
+impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
+    pub fn harvest(self) -> Vec<(T, DocAddress)> {
+        let segment_id = self.segment_id;
+        self.heap
+            .into_sorted_vec()
+            .into_iter()
+            .map(|comparable_doc| {
+                (
+                    comparable_doc.feature,
+                    DocAddress(segment_id, comparable_doc.doc),
+                )
+            })
+            .collect()
     }

     /// Return true iff at least K documents have gone through
     /// the collector.
-    #[inline]
-    pub fn at_capacity(&self) -> bool {
+    #[inline(always)]
+    pub(crate) fn at_capacity(&self) -> bool {
         self.heap.len() >= self.limit
     }

-    /// Sets the segment local ID for the collector
-    pub fn set_segment_id(&mut self, segment_id: SegmentLocalId) {
-        self.segment_id = segment_id;
-    }
-
     /// Collects a document scored by the given feature
     ///
     /// It collects documents until it has reached the max capacity. Once it reaches capacity, it
     /// will compare the lowest scoring item with the given one and keep whichever is greater.
+    #[inline(always)]
     pub fn collect(&mut self, doc: DocId, feature: T) {
         if self.at_capacity() {
             // It's ok to unwrap as long as a limit of 0 is forbidden.
-            let limit_doc: ComparableDoc<T> = self
-                .heap
-                .peek()
-                .expect("Top collector with size 0 is forbidden")
-                .clone();
-            if limit_doc.feature < feature {
-                let mut mut_head = self
-                    .heap
-                    .peek_mut()
-                    .expect("Top collector with size 0 is forbidden");
-                mut_head.feature = feature;
-                mut_head.doc_address = DocAddress(self.segment_id, doc);
+            if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
+                if limit_feature < feature {
+                    if let Some(mut head) = self.heap.peek_mut() {
+                        head.feature = feature;
+                        head.doc = doc;
+                    }
+                }
             }
         } else {
-            let wrapped_doc = ComparableDoc {
-                feature,
-                doc_address: DocAddress(self.segment_id, doc),
-            };
-            self.heap.push(wrapped_doc);
+            // we have not reached capacity yet, so we can just push the
+            // element.
+            self.heap.push(ComparableDoc { feature, doc });
         }
     }
 }

 #[cfg(test)]
 mod tests {
-    use super::*;
-    use DocId;
+    use super::{TopCollector, TopSegmentCollector};
+    use DocAddress;
     use Score;

     #[test]
     fn test_top_collector_not_at_capacity() {
-        let mut top_collector = TopCollector::with_limit(4);
+        let mut top_collector = TopSegmentCollector::new(0, 4);
         top_collector.collect(1, 0.8);
         top_collector.collect(3, 0.2);
         top_collector.collect(5, 0.3);
-        assert!(!top_collector.at_capacity());
-        let score_docs: Vec<(Score, DocId)> = top_collector
-            .top_docs()
-            .into_iter()
-            .map(|(score, doc_address)| (score, doc_address.doc()))
-            .collect();
-        assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
+        assert_eq!(
+            top_collector.harvest(),
+            vec![
+                (0.8, DocAddress(0, 1)),
+                (0.3, DocAddress(0, 5)),
+                (0.2, DocAddress(0, 3))
+            ]
+        );
     }

     #[test]
     fn test_top_collector_at_capacity() {
-        let mut top_collector = TopCollector::with_limit(4);
+        let mut top_collector = TopSegmentCollector::new(0, 4);
         top_collector.collect(1, 0.8);
         top_collector.collect(3, 0.2);
         top_collector.collect(5, 0.3);
         top_collector.collect(7, 0.9);
         top_collector.collect(9, -0.2);
-        assert!(top_collector.at_capacity());
-        {
-            let score_docs: Vec<(Score, DocId)> = top_collector
-                .top_docs()
-                .into_iter()
-                .map(|(score, doc_address)| (score, doc_address.doc()))
-                .collect();
-            assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
-        }
-        {
-            let docs: Vec<DocId> = top_collector
-                .docs()
-                .into_iter()
-                .map(|doc_address| doc_address.doc())
-                .collect();
-            assert_eq!(docs, vec![7, 1, 5, 3]);
-        }
+        assert_eq!(
+            top_collector.harvest(),
+            vec![
+                (0.9, DocAddress(0, 7)),
+                (0.8, DocAddress(0, 1)),
+                (0.3, DocAddress(0, 5)),
+                (0.2, DocAddress(0, 3))
+            ]
+        );
     }

     #[test]
@@ -189,5 +221,4 @@ mod tests {
     fn test_top_0() {
         let _collector: TopCollector<Score> = TopCollector::with_limit(0);
     }
-
 }
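The top-collector diff above is the classic bounded-heap top-K selection that its doc comment alludes to (`O(n log K)`): `ComparableDoc` reverses the comparison so that Rust's max-`BinaryHeap` acts as a min-heap over the retained hits, and once the heap is full a new candidate only replaces the current minimum through `peek_mut`. Below is a self-contained sketch of the same technique in plain `std`, independent of tantivy's types (integer scores are used only because `f32` is not `Ord`):

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    /// Keep the `k` largest (score, doc) pairs out of a stream in O(n log k),
    /// returned in decreasing order.
    fn top_k(hits: impl IntoIterator<Item = (u64, u32)>, k: usize) -> Vec<(u64, u32)> {
        let mut heap: BinaryHeap<Reverse<(u64, u32)>> = BinaryHeap::with_capacity(k);
        for hit in hits {
            if heap.len() < k {
                heap.push(Reverse(hit));
            } else if let Some(mut head) = heap.peek_mut() {
                // `head` holds the worst of the current top-k; replace it if `hit` beats it.
                if hit > head.0 {
                    *head = Reverse(hit);
                }
            }
        }
        // Ascending order of `Reverse` is descending order of the wrapped hits.
        heap.into_sorted_vec().into_iter().map(|Reverse(hit)| hit).collect()
    }

    fn main() {
        let hits = vec![(8, 1), (2, 3), (3, 5), (9, 7), (1, 9)];
        assert_eq!(top_k(hits, 4), vec![(9, 7), (8, 1), (3, 5), (2, 3)]);
    }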
@@ -1,12 +1,13 @@
 use super::Collector;
 use collector::top_collector::TopCollector;
+use collector::top_collector::TopSegmentCollector;
+use collector::SegmentCollector;
 use fastfield::FastFieldReader;
 use fastfield::FastValue;
 use schema::Field;
 use DocAddress;
-use DocId;
 use Result;
-use Score;
+use SegmentLocalId;
 use SegmentReader;

 /// The Top Field Collector keeps track of the K documents
@@ -19,67 +20,57 @@ use SegmentReader;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{SchemaBuilder, TEXT, FAST};
-/// use tantivy::{Index, Result, DocId};
-/// use tantivy::collector::TopFieldCollector;
-/// use tantivy::query::QueryParser;
+/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
+/// # use tantivy::{Index, Result, DocAddress};
+/// # use tantivy::query::{Query, QueryParser};
+/// use tantivy::collector::TopDocs;
 ///
-/// # fn main() { example().unwrap(); }
-/// fn example() -> Result<()> {
-///     let mut schema_builder = SchemaBuilder::new();
-///     let title = schema_builder.add_text_field("title", TEXT);
-///     let rating = schema_builder.add_u64_field("rating", FAST);
-///     let schema = schema_builder.build();
-///     let index = Index::create_in_ram(schema);
-///     {
-///         let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
-///         index_writer.add_document(doc!(
-///             title => "The Name of the Wind",
-///             rating => 92u64,
-///         ));
-///         index_writer.add_document(doc!(
-///             title => "The Diary of Muadib",
-///             rating => 97u64,
-///         ));
-///         index_writer.add_document(doc!(
-///             title => "A Dairy Cow",
-///             rating => 63u64,
-///         ));
-///         index_writer.add_document(doc!(
-///             title => "The Diary of a Young Girl",
-///             rating => 80u64,
-///         ));
-///         index_writer.commit().unwrap();
-///     }
-///
-///     index.load_searchers()?;
-///     let searcher = index.searcher();
-///
-///     {
-///         let mut top_collector = TopFieldCollector::with_limit(rating, 2);
-///         let query_parser = QueryParser::for_index(&index, vec![title]);
-///         let query = query_parser.parse_query("diary")?;
-///         searcher.search(&*query, &mut top_collector).unwrap();
-///
-///         let score_docs: Vec<(u64, DocId)> = top_collector
-///             .top_docs()
-///             .into_iter()
-///             .map(|(field, doc_address)| (field, doc_address.doc()))
-///             .collect();
-///
-///         assert_eq!(score_docs, vec![(97u64, 1), (80, 3)]);
-///     }
-///
-///     Ok(())
+/// # fn main() {
+/// # let mut schema_builder = Schema::builder();
+/// # let title = schema_builder.add_text_field("title", TEXT);
+/// # let rating = schema_builder.add_u64_field("rating", FAST);
+/// # let schema = schema_builder.build();
+/// # let index = Index::create_in_ram(schema);
+/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+/// # index_writer.add_document(doc!(
+/// #     title => "The Name of the Wind",
+/// #     rating => 92u64,
+/// # ));
+/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
+/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
+/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
+/// # index_writer.commit().unwrap();
+/// # index.load_searchers().unwrap();
+/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap();
+/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap();
+/// # assert_eq!(top_docs,
+/// #            vec![(97u64, DocAddress(0u32, 1)),
+/// #                 (80u64, DocAddress(0u32, 3))]);
+/// # }
+/// #
+/// /// Searches the document matching the given query, and
+/// /// collects the top 10 documents, order by the `field`
+/// /// given in argument.
+/// ///
+/// /// `field` is required to be a FAST field.
+/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field)
+///     -> Result<Vec<(u64, DocAddress)>> {
+///
+///     // This is where we build our collector!
+///     let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
+///
+///     // ... and here is our documents. Not this is a simple vec.
+///     // The `u64` in the pair is the value of our fast field for each documents.
+///     index.searcher()
+///         .search(query, &top_docs_by_rating)
 /// }
 /// ```
-pub struct TopFieldCollector<T: FastValue> {
-    field: Field,
+pub struct TopDocsByField<T> {
     collector: TopCollector<T>,
-    fast_field: Option<FastFieldReader<T>>,
+    field: Field,
 }

-impl<T: FastValue + PartialOrd + Clone> TopFieldCollector<T> {
+impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
     /// Creates a top field collector, with a number of documents equal to "limit".
     ///
     /// The given field name must be a fast field, otherwise the collector have an error while
@@ -87,68 +78,72 @@ impl<T: FastValue + PartialOrd + Clone> TopFieldCollector<T> {
     ///
     /// # Panics
     /// The method panics if limit is 0
-    pub fn with_limit(field: Field, limit: usize) -> Self {
-        TopFieldCollector {
-            field,
+    pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
+        TopDocsByField {
             collector: TopCollector::with_limit(limit),
-            fast_field: None,
+            field,
         }
     }
-
-    /// Returns K best documents sorted the given field name in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    pub fn docs(&self) -> Vec<DocAddress> {
-        self.collector.docs()
-    }
-
-    /// Returns K best FieldDocuments sorted in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    pub fn top_docs(&self) -> Vec<(T, DocAddress)> {
-        self.collector.top_docs()
-    }
-
-    /// Return true iff at least K documents have gone through
-    /// the collector.
-    #[inline]
-    pub fn at_capacity(&self) -> bool {
-        self.collector.at_capacity()
-    }
 }

-impl<T: FastValue + PartialOrd + Clone> Collector for TopFieldCollector<T> {
-    fn set_segment(&mut self, segment_id: u32, segment: &SegmentReader) -> Result<()> {
-        self.collector.set_segment_id(segment_id);
-        self.fast_field = Some(segment.fast_field_reader(self.field)?);
-        Ok(())
-    }
-
-    fn collect(&mut self, doc: DocId, _score: Score) {
-        let field_value = self
-            .fast_field
-            .as_ref()
-            .expect("collect() was called before set_segment. This should never happen.")
-            .get(doc);
-        self.collector.collect(doc, field_value);
+impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
+    type Fruit = Vec<(T, DocAddress)>;
+
+    type Child = TopFieldSegmentCollector<T>;
+
+    fn for_segment(
+        &self,
+        segment_local_id: SegmentLocalId,
+        reader: &SegmentReader,
+    ) -> Result<TopFieldSegmentCollector<T>> {
+        let collector = self.collector.for_segment(segment_local_id, reader)?;
+        let reader = reader.fast_field_reader(self.field)?;
+        Ok(TopFieldSegmentCollector { collector, reader })
     }

     fn requires_scoring(&self) -> bool {
         false
     }
+
+    fn merge_fruits(
+        &self,
+        segment_fruits: Vec<Vec<(T, DocAddress)>>,
+    ) -> Result<Vec<(T, DocAddress)>> {
+        self.collector.merge_fruits(segment_fruits)
+    }
+}
+
+pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
+    collector: TopSegmentCollector<T>,
+    reader: FastFieldReader<T>,
+}
+
+impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
+    for TopFieldSegmentCollector<T>
+{
+    type Fruit = Vec<(T, DocAddress)>;
+
+    fn collect(&mut self, doc: u32, _score: f32) {
+        let field_value = self.reader.get(doc);
+        self.collector.collect(doc, field_value);
+    }
+
+    fn harvest(self) -> Vec<(T, DocAddress)> {
+        self.collector.harvest()
+    }
 }

 #[cfg(test)]
 mod tests {
-    use super::*;
+    use super::TopDocsByField;
+    use collector::Collector;
+    use collector::TopDocs;
     use query::Query;
     use query::QueryParser;
     use schema::Field;
     use schema::IntOptions;
-    use schema::Schema;
-    use schema::{SchemaBuilder, FAST, TEXT};
+    use schema::{Schema, FAST, TEXT};
+    use DocAddress;
     use Index;
     use IndexWriter;
     use TantivyError;
@@ -158,7 +153,7 @@ mod tests {

     #[test]
     fn test_top_collector_not_at_capacity() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field(TITLE, TEXT);
         let size = schema_builder.add_u64_field(SIZE, FAST);
         let schema = schema_builder.build();
@@ -178,22 +173,22 @@ mod tests {
         });
         let searcher = index.searcher();

-        let mut top_collector = TopFieldCollector::with_limit(size, 4);
-        searcher.search(&*query, &mut top_collector).unwrap();
-        assert!(!top_collector.at_capacity());
-
-        let score_docs: Vec<(u64, DocId)> = top_collector
-            .top_docs()
-            .into_iter()
-            .map(|(field, doc_address)| (field, doc_address.doc()))
-            .collect();
-        assert_eq!(score_docs, vec![(64, 1), (16, 2), (12, 0)]);
+        let top_collector = TopDocs::with_limit(4).order_by_field(size);
+        let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
+        assert_eq!(
+            top_docs,
+            vec![
+                (64, DocAddress(0, 1)),
+                (16, DocAddress(0, 2)),
+                (12, DocAddress(0, 0))
+            ]
+        );
     }

     #[test]
     #[should_panic]
     fn test_field_does_not_exist() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field(TITLE, TEXT);
         let size = schema_builder.add_u64_field(SIZE, FAST);
         let schema = schema_builder.build();
@@ -204,14 +199,16 @@ mod tests {
         ));
         });
         let searcher = index.searcher();
-        let segment = searcher.segment_reader(0);
-        let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(2), 4);
-        let _ = top_collector.set_segment(0, segment);
+        let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
+        let segment_reader = searcher.segment_reader(0u32);
+        top_collector
+            .for_segment(0, segment_reader)
+            .expect("should panic");
     }

     #[test]
     fn test_field_not_fast_field() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field(TITLE, TEXT);
         let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
         let schema = schema_builder.build();
@@ -223,26 +220,16 @@ mod tests {
         });
         let searcher = index.searcher();
         let segment = searcher.segment_reader(0);
-        let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(size, 4);
+        let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
         assert_matches!(
-            top_collector.set_segment(0, segment),
-            Err(TantivyError::FastFieldError(_))
+            top_collector
+                .for_segment(0, segment)
+                .map(|_| ())
+                .unwrap_err(),
+            TantivyError::FastFieldError(_)
         );
     }

-    #[test]
-    #[should_panic]
-    fn test_collect_before_set_segment() {
-        let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(0), 4);
-        top_collector.collect(0, 0f32);
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_top_0() {
-        let _: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(0), 0);
-    }
-
     fn index(
         query: &str,
         query_field: Field,
@@ -1,5 +1,10 @@
 use super::Collector;
 use collector::top_collector::TopCollector;
+use collector::top_collector::TopSegmentCollector;
+use collector::SegmentCollector;
+use collector::TopDocsByField;
+use fastfield::FastValue;
+use schema::Field;
 use DocAddress;
 use DocId;
 use Result;
@@ -17,14 +22,15 @@ use SegmentReader;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{SchemaBuilder, TEXT};
-/// use tantivy::{Index, Result, DocId, Score};
-/// use tantivy::collector::TopScoreCollector;
+/// use tantivy::DocAddress;
+/// use tantivy::schema::{Schema, TEXT};
+/// use tantivy::{Index, Result};
+/// use tantivy::collector::TopDocs;
 /// use tantivy::query::QueryParser;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-///     let mut schema_builder = SchemaBuilder::new();
+///     let mut schema_builder = Schema::builder();
 ///     let title = schema_builder.add_text_field("title", TEXT);
 ///     let schema = schema_builder.build();
 ///     let index = Index::create_in_ram(schema);
@@ -48,140 +54,147 @@ use SegmentReader;
 ///     index.load_searchers()?;
 ///     let searcher = index.searcher();
 ///
-///     {
-///         let mut top_collector = TopScoreCollector::with_limit(2);
-///         let query_parser = QueryParser::for_index(&index, vec![title]);
-///         let query = query_parser.parse_query("diary")?;
-///         searcher.search(&*query, &mut top_collector).unwrap();
-///
-///         let score_docs: Vec<(Score, DocId)> = top_collector
-///             .top_docs()
-///             .into_iter()
-///             .map(|(score, doc_address)| (score, doc_address.doc()))
-///             .collect();
-///
-///         assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
-///     }
+///     let query_parser = QueryParser::for_index(&index, vec![title]);
+///     let query = query_parser.parse_query("diary")?;
+///     let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
+///
+///     assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1)));
+///     assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3)));
 ///
 ///     Ok(())
 /// }
 /// ```
-pub struct TopScoreCollector {
-    collector: TopCollector<Score>,
-}
+pub struct TopDocs(TopCollector<Score>);

-impl TopScoreCollector {
+impl TopDocs {
     /// Creates a top score collector, with a number of documents equal to "limit".
     ///
     /// # Panics
     /// The method panics if limit is 0
-    pub fn with_limit(limit: usize) -> TopScoreCollector {
-        TopScoreCollector {
-            collector: TopCollector::with_limit(limit),
-        }
+    pub fn with_limit(limit: usize) -> TopDocs {
+        TopDocs(TopCollector::with_limit(limit))
     }

-    /// Returns K best scored documents sorted in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    pub fn docs(&self) -> Vec<DocAddress> {
-        self.collector.docs()
-    }
-
-    /// Returns K best ScoredDocuments sorted in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    pub fn top_docs(&self) -> Vec<(Score, DocAddress)> {
-        self.collector.top_docs()
-    }
-
-    /// Returns K best ScoredDocuments sorted in decreasing order.
-    ///
-    /// Calling this method triggers the sort.
-    /// The result of the sort is not cached.
-    #[deprecated]
-    pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
-        self.collector.top_docs()
-    }
-
-    /// Return true iff at least K documents have gone through
-    /// the collector.
-    #[inline]
-    pub fn at_capacity(&self) -> bool {
-        self.collector.at_capacity()
+    /// Set top-K to rank documents by a given fast field.
+    ///
+    /// (By default, `TopDocs` collects the top-K documents sorted by
+    /// the similarity score.)
+    pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
+        self,
+        field: Field,
+    ) -> TopDocsByField<T> {
+        TopDocsByField::new(field, self.0.limit())
     }
 }

-impl Collector for TopScoreCollector {
-    fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
-        self.collector.set_segment_id(segment_id);
-        Ok(())
-    }
-
-    fn collect(&mut self, doc: DocId, score: Score) {
-        self.collector.collect(doc, score);
+impl Collector for TopDocs {
+    type Fruit = Vec<(Score, DocAddress)>;
+
+    type Child = TopScoreSegmentCollector;
+
+    fn for_segment(
+        &self,
+        segment_local_id: SegmentLocalId,
+        reader: &SegmentReader,
+    ) -> Result<Self::Child> {
+        let collector = self.0.for_segment(segment_local_id, reader)?;
+        Ok(TopScoreSegmentCollector(collector))
     }

     fn requires_scoring(&self) -> bool {
         true
     }
+
+    fn merge_fruits(&self, child_fruits: Vec<Vec<(Score, DocAddress)>>) -> Result<Self::Fruit> {
+        self.0.merge_fruits(child_fruits)
+    }
+}
+
+/// Segment Collector associated to `TopDocs`.
+pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
+
+impl SegmentCollector for TopScoreSegmentCollector {
+    type Fruit = Vec<(Score, DocAddress)>;
+
+    fn collect(&mut self, doc: DocId, score: Score) {
+        self.0.collect(doc, score)
+    }
+
+    fn harvest(self) -> Vec<(Score, DocAddress)> {
+        self.0.harvest()
+    }
 }

 #[cfg(test)]
 mod tests {
-    use super::*;
-    use collector::Collector;
-    use DocId;
+    use super::TopDocs;
+    use query::QueryParser;
+    use schema::Schema;
+    use schema::TEXT;
+    use DocAddress;
+    use Index;
     use Score;

+    fn make_index() -> Index {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            // writing the segment
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
+            index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
+            index_writer.add_document(doc!(text_field=>"I like Droopy"));
+            assert!(index_writer.commit().is_ok());
+        }
+        index.load_searchers().unwrap();
+        index
+    }
+
     #[test]
     fn test_top_collector_not_at_capacity() {
-        let mut top_collector = TopScoreCollector::with_limit(4);
-        top_collector.collect(1, 0.8);
-        top_collector.collect(3, 0.2);
-        top_collector.collect(5, 0.3);
-        assert!(!top_collector.at_capacity());
-        let score_docs: Vec<(Score, DocId)> = top_collector
-            .top_docs()
-            .into_iter()
-            .map(|(score, doc_address)| (score, doc_address.doc()))
-            .collect();
-        assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
+        let index = make_index();
+        let field = index.schema().get_field("text").unwrap();
+        let query_parser = QueryParser::for_index(&index, vec![field]);
+        let text_query = query_parser.parse_query("droopy tax").unwrap();
+        let score_docs: Vec<(Score, DocAddress)> = index
+            .searcher()
+            .search(&text_query, &TopDocs::with_limit(4))
+            .unwrap();
+        assert_eq!(
+            score_docs,
+            vec![
+                (0.81221175, DocAddress(0u32, 1)),
+                (0.5376842, DocAddress(0u32, 2)),
+                (0.48527452, DocAddress(0, 0))
+            ]
+        );
     }

     #[test]
     fn test_top_collector_at_capacity() {
-        let mut top_collector = TopScoreCollector::with_limit(4);
-        top_collector.collect(1, 0.8);
-        top_collector.collect(3, 0.2);
-        top_collector.collect(5, 0.3);
-        top_collector.collect(7, 0.9);
-        top_collector.collect(9, -0.2);
-        assert!(top_collector.at_capacity());
-        {
-            let score_docs: Vec<(Score, DocId)> = top_collector
-                .top_docs()
-                .into_iter()
-                .map(|(score, doc_address)| (score, doc_address.doc()))
-                .collect();
-            assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
-        }
-        {
-            let docs: Vec<DocId> = top_collector
-                .docs()
-                .into_iter()
-                .map(|doc_address| doc_address.doc())
-                .collect();
-            assert_eq!(docs, vec![7, 1, 5, 3]);
-        }
+        let index = make_index();
+        let field = index.schema().get_field("text").unwrap();
+        let query_parser = QueryParser::for_index(&index, vec![field]);
+        let text_query = query_parser.parse_query("droopy tax").unwrap();
+        let score_docs: Vec<(Score, DocAddress)> = index
+            .searcher()
+            .search(&text_query, &TopDocs::with_limit(2))
+            .unwrap();
+        assert_eq!(
+            score_docs,
+            vec![
+                (0.81221175, DocAddress(0u32, 1)),
+                (0.5376842, DocAddress(0u32, 2)),
+            ]
+        );
     }

     #[test]
     #[should_panic]
     fn test_top_0() {
-        TopScoreCollector::with_limit(0);
+        TopDocs::with_limit(0);
     }

 }
@@ -1,9 +1,6 @@
-use common::serialize::BinarySerializable;
+use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
 use std::io;
-use std::io::Write;
-use std::mem;
 use std::ops::Deref;
-use std::ptr;

 pub(crate) struct BitPacker {
     mini_buffer: u64,
@@ -18,7 +15,7 @@ impl BitPacker {
         }
     }

-    pub fn write<TWrite: Write>(
+    pub fn write<TWrite: io::Write>(
         &mut self,
         val: u64,
         num_bits: u8,
@@ -28,14 +25,14 @@ impl BitPacker {
         let num_bits = num_bits as usize;
         if self.mini_buffer_written + num_bits > 64 {
             self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
-            self.mini_buffer.serialize(output)?;
+            output.write_u64::<LittleEndian>(self.mini_buffer)?;
             self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
             self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
         } else {
             self.mini_buffer |= val_u64 << self.mini_buffer_written;
             self.mini_buffer_written += num_bits;
             if self.mini_buffer_written == 64 {
-                self.mini_buffer.serialize(output)?;
+                output.write_u64::<LittleEndian>(self.mini_buffer)?;
                 self.mini_buffer_written = 0;
                 self.mini_buffer = 0u64;
             }
@@ -43,17 +40,18 @@ impl BitPacker {
         Ok(())
     }

-    pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
         if self.mini_buffer_written > 0 {
             let num_bytes = (self.mini_buffer_written + 7) / 8;
-            let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
+            let mut arr: [u8; 8] = [0u8; 8];
+            LittleEndian::write_u64(&mut arr, self.mini_buffer);
             output.write_all(&arr[..num_bytes])?;
             self.mini_buffer_written = 0;
         }
         Ok(())
     }

-    pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+    pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
         self.flush(output)?;
         // Padding the write file to simplify reads.
         output.write_all(&[0u8; 7])?;
@@ -102,9 +100,7 @@ where
         addr + 8 <= data.len(),
         "The fast field field should have been padded with 7 bytes."
     );
-    #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
-    let val_unshifted_unmasked: u64 =
-        u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
+    let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
     let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
     val_shifted & mask
 }
@@ -126,9 +122,7 @@ where
     for output_val in output.iter_mut() {
         let addr = addr_in_bits >> 3;
         let bit_shift = addr_in_bits & 7;
-        #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
-        let val_unshifted_unmasked: u64 =
-            unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+        let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
         let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
         *output_val = val_shifted & mask;
         addr_in_bits += num_bits;
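The bit-packer in the hunks above accumulates values in a 64-bit mini buffer at the current bit offset and spills full words to the writer in little-endian order; `flush` writes only the bytes that actually hold bits. Below is a self-contained sketch of that packing scheme using only `std` (`MiniBitPacker` is an illustrative name, and `u64::to_le_bytes` stands in for the `byteorder` calls used in the diff):

    use std::io::{self, Write};

    /// Minimal sketch of the bit-packing scheme: values are OR-ed into a
    /// 64-bit mini buffer at the current bit offset, and full words are
    /// flushed to the writer in little-endian byte order.
    struct MiniBitPacker {
        mini_buffer: u64,
        written: usize, // number of bits currently held in `mini_buffer`
    }

    impl MiniBitPacker {
        fn new() -> MiniBitPacker {
            MiniBitPacker { mini_buffer: 0, written: 0 }
        }

        fn write<W: Write>(&mut self, val: u64, num_bits: usize, out: &mut W) -> io::Result<()> {
            if self.written + num_bits > 64 {
                // The value straddles the word boundary: emit the full word and
                // keep the overflowing high bits for the next one.
                self.mini_buffer |= val.wrapping_shl(self.written as u32);
                out.write_all(&self.mini_buffer.to_le_bytes())?;
                self.mini_buffer = val.wrapping_shr((64 - self.written) as u32);
                self.written = self.written + num_bits - 64;
            } else {
                self.mini_buffer |= val << self.written;
                self.written += num_bits;
                if self.written == 64 {
                    out.write_all(&self.mini_buffer.to_le_bytes())?;
                    self.mini_buffer = 0;
                    self.written = 0;
                }
            }
            Ok(())
        }

        fn flush<W: Write>(&mut self, out: &mut W) -> io::Result<()> {
            if self.written > 0 {
                let num_bytes = (self.written + 7) / 8;
                out.write_all(&self.mini_buffer.to_le_bytes()[..num_bytes])?;
                self.mini_buffer = 0;
                self.written = 0;
            }
            Ok(())
        }
    }

    fn main() -> io::Result<()> {
        let mut out: Vec<u8> = Vec::new();
        let mut packer = MiniBitPacker::new();
        // Pack 0..=9 using 4 bits each: ten values fit in 40 bits, i.e. 5 bytes.
        for v in 0u64..10 {
            packer.write(v, 4, &mut out)?;
        }
        packer.flush(&mut out)?;
        assert_eq!(out, vec![0x10, 0x32, 0x54, 0x76, 0x98]);
        Ok(())
    }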
@@ -4,6 +4,8 @@ use common::VInt;
 use directory::ReadOnlySource;
 use directory::WritePtr;
 use schema::Field;
+use space_usage::FieldUsage;
+use space_usage::PerFieldSpaceUsage;
 use std::collections::HashMap;
 use std::io::Write;
 use std::io::{self, Read};
@@ -166,6 +168,17 @@ impl CompositeFile {
             .get(&FileAddr { field, idx })
             .map(|&(from, to)| self.data.slice(from, to))
     }
+
+    pub fn space_usage(&self) -> PerFieldSpaceUsage {
+        let mut fields = HashMap::new();
+        for (&field_addr, &(start, end)) in self.offsets_index.iter() {
+            fields
+                .entry(field_addr.field)
+                .or_insert_with(|| FieldUsage::empty(field_addr.field))
+                .add_field_idx(field_addr.idx, end - start);
+        }
+        PerFieldSpaceUsage::new(fields)
+    }
 }

 #[cfg(test)]
src/core/executor.rs (new file, 137 lines)
@@ -0,0 +1,137 @@
use crossbeam::channel;
use scoped_pool::{Pool, ThreadConfig};
use Result;

/// Search executor whether search request are single thread or multithread.
///
/// We don't expose Rayon thread pool directly here for several reasons.
///
/// First dependency hell. It is not a good idea to expose the
/// API of a dependency, knowing it might conflict with a different version
/// used by the client. Second, we may stop using rayon in the future.
pub enum Executor {
    SingleThread,
    ThreadPool(Pool),
}

impl Executor {
    /// Creates an Executor that performs all task in the caller thread.
    pub fn single_thread() -> Executor {
        Executor::SingleThread
    }

    // Creates an Executor that dispatches the tasks in a thread pool.
    pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor {
        let thread_config = ThreadConfig::new().prefix(prefix);
        let pool = Pool::with_thread_config(num_threads, thread_config);
        Executor::ThreadPool(pool)
    }

    // Perform a map in the thread pool.
    //
    // Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
    // will propagate to the caller.
    pub fn map<
        A: Send,
        R: Send,
        AIterator: Iterator<Item = A>,
        F: Sized + Sync + Fn(A) -> Result<R>,
    >(
        &self,
        f: F,
        args: AIterator,
    ) -> Result<Vec<R>> {
        match self {
            Executor::SingleThread => args.map(f).collect::<Result<_>>(),
            Executor::ThreadPool(pool) => {
                let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
                let num_fruits = args_with_indices.len();
                let fruit_receiver = {
                    let (fruit_sender, fruit_receiver) = channel::unbounded();
                    pool.scoped(|scope| {
                        for arg_with_idx in args_with_indices {
                            scope.execute(|| {
                                let (idx, arg) = arg_with_idx;
                                let fruit = f(arg);
                                if let Err(err) = fruit_sender.send((idx, fruit)) {
                                    error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
                                }
                            });
                        }
                    });
                    fruit_receiver
                    // This ends the scope of fruit_sender.
                    // This is important as it makes it possible for the fruit_receiver iteration to
                    // terminate.
                };
                // This is lame, but it does not use unsafe code.
                let mut results_with_position = Vec::with_capacity(num_fruits);
                for (pos, fruit_res) in fruit_receiver {
                    let fruit = fruit_res?;
                    results_with_position.push((pos, fruit));
                }
                results_with_position.sort_by_key(|(pos, _)| *pos);
                assert_eq!(results_with_position.len(), num_fruits);
                Ok(results_with_position
                    .into_iter()
                    .map(|(_, fruit)| fruit)
                    .collect::<Vec<_>>())
            }
        }
    }
}

#[cfg(test)]
mod tests {

    use super::Executor;

    #[test]
    #[should_panic(expected = "panic should propagate")]
    fn test_panic_propagates_single_thread() {
        let _result: Vec<usize> = Executor::single_thread()
            .map(
                |_| {
                    panic!("panic should propagate");
                },
                vec![0].into_iter(),
            )
            .unwrap();
    }

    #[test]
    #[should_panic] //< unfortunately the panic message is not propagated
    fn test_panic_propagates_multi_thread() {
        let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
            .map(
                |_| {
                    panic!("panic should propagate");
                },
                vec![0].into_iter(),
            )
            .unwrap();
    }

    #[test]
    fn test_map_singlethread() {
        let result: Vec<usize> = Executor::single_thread()
            .map(|i| Ok(i * 2), 0..1_000)
            .unwrap();
        assert_eq!(result.len(), 1_000);
        for i in 0..1_000 {
            assert_eq!(result[i], i * 2);
        }
    }

}

#[test]
fn test_map_multithread() {
    let result: Vec<usize> = Executor::multi_thread(3, "search-test")
        .map(|i| Ok(i * 2), 0..10)
        .unwrap();
    assert_eq!(result.len(), 10);
    for i in 0..10 {
        assert_eq!(result[i], i * 2);
    }
}
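`Executor::map` above fans the per-segment tasks out to a scoped thread pool, tags every result with the index of its input, and sorts by that index before returning, so callers receive results in input order even though segments finish out of order. Below is a std-only sketch of that order-preserving fan-out (`parallel_map` is an illustrative stand-in: it uses `std::thread::scope` and `mpsc` where the diff uses the `scoped-pool` crate and a `crossbeam` channel, and it skips the `Result` plumbing):

    use std::sync::mpsc;
    use std::thread;

    /// Order-preserving parallel map: run `f` on every item, each on its own
    /// scoped thread, and return the results in the order of the inputs.
    fn parallel_map<A, R, F>(args: Vec<A>, f: F) -> Vec<R>
    where
        A: Send,
        R: Send,
        F: Fn(A) -> R + Sync,
    {
        let num_tasks = args.len();
        let (sender, receiver) = mpsc::channel();
        thread::scope(|scope| {
            for (idx, arg) in args.into_iter().enumerate() {
                let sender = sender.clone();
                let f = &f;
                scope.spawn(move || {
                    // Tag each result with the index of its input.
                    let _ = sender.send((idx, f(arg)));
                });
            }
        });
        drop(sender); // close the channel so the receiver iteration terminates
        let mut tagged: Vec<(usize, R)> = receiver.into_iter().collect();
        tagged.sort_by_key(|(idx, _)| *idx);
        assert_eq!(tagged.len(), num_tasks);
        tagged.into_iter().map(|(_, r)| r).collect()
    }

    fn main() {
        let doubled = parallel_map((0..10).collect(), |i: usize| i * 2);
        assert_eq!(doubled, (0..10).map(|i| i * 2).collect::<Vec<_>>());
    }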
@@ -3,6 +3,7 @@ use super::pool::Pool;
 use super::segment::create_segment;
 use super::segment::Segment;
 use core::searcher::Searcher;
+use core::Executor;
 use core::IndexMeta;
 use core::SegmentId;
 use core::SegmentMeta;
@@ -12,6 +13,7 @@ use directory::ManagedDirectory;
 #[cfg(feature = "mmap")]
 use directory::MmapDirectory;
 use directory::{Directory, RAMDirectory};
+use error::DataCorruption;
 use error::TantivyError;
 use indexer::index_writer::open_index_writer;
 use indexer::index_writer::HEAP_SIZE_MIN;
@@ -36,7 +38,13 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
     let meta_data = directory.atomic_read(&META_FILEPATH)?;
     let meta_string = String::from_utf8_lossy(&meta_data);
     serde_json::from_str(&meta_string)
-        .map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
+        .map_err(|e| {
+            DataCorruption::new(
+                META_FILEPATH.clone(),
+                format!("Meta file cannot be deserialized. {:?}.", e),
+            )
+        })
+        .map_err(From::from)
 }
 
 /// Search Index
@@ -45,10 +53,39 @@ pub struct Index {
     schema: Schema,
     num_searchers: Arc<AtomicUsize>,
     searcher_pool: Arc<Pool<Searcher>>,
+    executor: Arc<Executor>,
     tokenizers: TokenizerManager,
 }
 
 impl Index {
+    /// Examines the director to see if it contains an index
+    pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
+        dir.exists(&META_FILEPATH)
+    }
+
+    /// Accessor to the search executor.
+    ///
+    /// This pool is used by default when calling `searcher.search(...)`
+    /// to perform search on the individual segments.
+    ///
+    /// By default the executor is single thread, and simply runs in the calling thread.
+    pub fn search_executor(&self) -> &Executor {
+        self.executor.as_ref()
+    }
+
+    /// Replace the default single thread search executor pool
+    /// by a thread pool with a given number of threads.
+    pub fn set_multithread_executor(&mut self, num_threads: usize) {
+        self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-"));
+    }
+
+    /// Replace the default single thread search executor pool
+    /// by a thread pool with a given number of threads.
+    pub fn set_default_multithread_executor(&mut self) {
+        let default_num_threads = num_cpus::get();
+        self.set_multithread_executor(default_num_threads);
+    }
+
     /// Creates a new index using the `RAMDirectory`.
     ///
     /// The index will be allocated in anonymous memory.
@@ -65,9 +102,30 @@ impl Index {
     #[cfg(feature = "mmap")]
     pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
+        if Index::exists(&mmap_directory) {
+            return Err(TantivyError::IndexAlreadyExists);
+        }
+
         Index::create(mmap_directory, schema)
     }
 
+    /// Opens or creates a new index in the provided directory
+    #[cfg(feature = "mmap")]
+    pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
+        if Index::exists(&dir) {
+            let index = Index::open(dir)?;
+            if index.schema() == schema {
+                Ok(index)
+            } else {
+                Err(TantivyError::SchemaError(
+                    "An index exists but the schema does not match.".to_string(),
+                ))
+            }
+        } else {
+            Index::create(dir, schema)
+        }
+    }
+
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
@@ -84,13 +142,15 @@ impl Index {
 
     /// Creates a new index given an implementation of the trait `Directory`
     pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
-        let directory = ManagedDirectory::new(dir)?;
+        let directory = ManagedDirectory::wrap(dir)?;
         Index::from_directory(directory, schema)
     }
 
     /// Create a new index from a directory.
+    ///
+    /// This will overwrite existing meta.json
     fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
-        save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
+        save_new_metas(schema.clone(), directory.borrow_mut())?;
         let metas = IndexMeta::with_schema(schema);
         Index::create_from_metas(directory, &metas)
     }
@@ -105,6 +165,7 @@ impl Index {
             num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
             searcher_pool: Arc::new(Pool::new()),
             tokenizers: TokenizerManager::default(),
+            executor: Arc::new(Executor::single_thread()),
         };
         index.load_searchers()?;
         Ok(index)
@@ -145,7 +206,7 @@ impl Index {
 
     /// Open the index using the provided directory
     pub fn open<D: Directory>(directory: D) -> Result<Index> {
-        let directory = ManagedDirectory::new(directory)?;
+        let directory = ManagedDirectory::wrap(directory)?;
         let metas = load_metas(&directory)?;
         Index::create_from_metas(directory, &metas)
     }
@@ -322,18 +383,20 @@ impl Clone for Index {
             num_searchers: Arc::clone(&self.num_searchers),
             searcher_pool: Arc::clone(&self.searcher_pool),
             tokenizers: self.tokenizers.clone(),
+            executor: self.executor.clone(),
         }
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use schema::{SchemaBuilder, INT_INDEXED, TEXT};
+    use directory::RAMDirectory;
+    use schema::{Schema, INT_INDEXED, TEXT};
     use Index;
 
     #[test]
     fn test_indexer_for_field() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
         let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
         let body_field = schema_builder.add_text_field("body", TEXT);
         let schema = schema_builder.build();
@@ -345,4 +408,54 @@ mod tests {
         );
     }
+
+    #[test]
+    fn test_index_exists() {
+        let directory = RAMDirectory::create();
+        assert!(!Index::exists(&directory));
+        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
+        assert!(Index::exists(&directory));
+    }
+
+    #[test]
+    fn open_or_create_should_create() {
+        let directory = RAMDirectory::create();
+        assert!(!Index::exists(&directory));
+        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
+        assert!(Index::exists(&directory));
+    }
+
+    #[test]
+    fn open_or_create_should_open() {
+        let directory = RAMDirectory::create();
+        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
+        assert!(Index::exists(&directory));
+        assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
+    }
+
+    #[test]
+    fn create_should_wipeoff_existing() {
+        let directory = RAMDirectory::create();
+        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
+        assert!(Index::exists(&directory));
+        assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
+    }
+
+    #[test]
+    fn open_or_create_exists_but_schema_does_not_match() {
+        let directory = RAMDirectory::create();
+        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
+        assert!(Index::exists(&directory));
+        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
+        let err = Index::open_or_create(directory, Schema::builder().build());
+        assert_eq!(
+            format!("{:?}", err.unwrap_err()),
+            "SchemaError(\"An index exists but the schema does not match.\")"
+        );
+    }
+
+    fn throw_away_schema() -> Schema {
+        let mut schema_builder = Schema::builder();
+        let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
+        schema_builder.build()
+    }
 }
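The `Index::exists` and `Index::open_or_create` additions above turn the usual "create on first run, reopen afterwards" dance into a single call. A small usage sketch (the `RAMDirectory` import path is assumed; any `Directory` implementation works the same way):

```rust
use tantivy::directory::RAMDirectory;
use tantivy::schema::{Schema, INT_INDEXED};
use tantivy::{Index, Result};

fn open_or_create_demo() -> Result<Index> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_u64_field("num_likes", INT_INDEXED);
    let schema = schema_builder.build();

    let directory = RAMDirectory::create();
    // Creates the index on the first call; on later calls it reopens the
    // existing one, or fails with `SchemaError` if the schema differs.
    Index::open_or_create(directory, schema)
}
```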
@@ -46,13 +46,13 @@ impl fmt::Debug for IndexMeta {
 mod tests {
 
     use super::IndexMeta;
-    use schema::{SchemaBuilder, TEXT};
+    use schema::{Schema, TEXT};
     use serde_json;
 
     #[test]
     fn test_serialize_metas() {
         let schema = {
-            let mut schema_builder = SchemaBuilder::new();
+            let mut schema_builder = Schema::builder();
             schema_builder.add_text_field("text", TEXT);
             schema_builder.build()
         };
@@ -32,10 +32,7 @@ pub struct InvertedIndexReader {
 }
 
 impl InvertedIndexReader {
-    #[cfg_attr(
-        feature = "cargo-clippy",
-        allow(clippy::needless_pass_by_value)
-    )] // for symetry
+    #[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
     pub(crate) fn new(
         termdict: TermDictionary,
         postings_source: ReadOnlySource,
@@ -1,3 +1,4 @@
+mod executor;
 pub mod index;
 mod index_meta;
 mod inverted_index_reader;
@@ -9,6 +10,7 @@ mod segment_id;
 mod segment_meta;
 mod segment_reader;
 
+pub use self::executor::Executor;
 pub use self::index::Index;
 pub use self::index_meta::IndexMeta;
 pub use self::inverted_index_reader::InvertedIndexReader;
@@ -1,17 +1,43 @@
 use collector::Collector;
+use collector::SegmentCollector;
+use core::Executor;
 use core::InvertedIndexReader;
 use core::SegmentReader;
 use query::Query;
+use query::Scorer;
+use query::Weight;
 use schema::Document;
 use schema::Schema;
 use schema::{Field, Term};
+use space_usage::SearcherSpaceUsage;
 use std::fmt;
 use std::sync::Arc;
+use store::StoreReader;
 use termdict::TermMerger;
 use DocAddress;
 use Index;
 use Result;
+
+fn collect_segment<C: Collector>(
+    collector: &C,
+    weight: &Weight,
+    segment_ord: u32,
+    segment_reader: &SegmentReader,
+) -> Result<C::Fruit> {
+    let mut scorer = weight.scorer(segment_reader)?;
+    let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
+    if let Some(delete_bitset) = segment_reader.delete_bitset() {
+        scorer.for_each(&mut |doc, score| {
+            if !delete_bitset.is_deleted(doc) {
+                segment_collector.collect(doc, score);
+            }
+        });
+    } else {
+        scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
+    }
+    Ok(segment_collector.harvest())
+}
 
 /// Holds a list of `SegmentReader`s ready for search.
 ///
 /// It guarantees that the `Segment` will not be removed before
@@ -21,6 +47,7 @@ pub struct Searcher {
     schema: Schema,
     index: Index,
     segment_readers: Vec<SegmentReader>,
+    store_readers: Vec<StoreReader>,
 }
 
 impl Searcher {
@@ -30,10 +57,15 @@ impl Searcher {
         index: Index,
         segment_readers: Vec<SegmentReader>,
     ) -> Searcher {
+        let store_readers = segment_readers
+            .iter()
+            .map(|segment_reader| segment_reader.get_store_reader())
+            .collect();
         Searcher {
             schema,
             index,
             segment_readers,
+            store_readers,
         }
     }
 
@@ -48,8 +80,8 @@ impl Searcher {
     /// the request to the right `Segment`.
     pub fn doc(&self, doc_address: DocAddress) -> Result<Document> {
         let DocAddress(segment_local_id, doc_id) = doc_address;
-        let segment_reader = &self.segment_readers[segment_local_id as usize];
-        segment_reader.doc(doc_id)
+        let store_reader = &self.store_readers[segment_local_id as usize];
+        store_reader.get(doc_id)
     }
 
     /// Access the schema associated to the index of this searcher.
@@ -72,7 +104,8 @@ impl Searcher {
             .iter()
             .map(|segment_reader| {
                 u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
-            }).sum::<u64>()
+            })
+            .sum::<u64>()
     }
 
     /// Return the list of segment readers
@@ -85,9 +118,58 @@ impl Searcher {
         &self.segment_readers[segment_ord as usize]
     }
 
-    /// Runs a query on the segment readers wrapped by the searcher
-    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
-        query.search(self, collector)
+    /// Runs a query on the segment readers wrapped by the searcher.
+    ///
+    /// Search works as follows :
+    ///
+    /// First the weight object associated to the query is created.
+    ///
+    /// Then, the query loops over the segments and for each segment :
+    /// - setup the collector and informs it that the segment being processed has changed.
+    /// - creates a SegmentCollector for collecting documents associated to the segment
+    /// - creates a `Scorer` object associated for this segment
+    /// - iterate through the matched documents and push them to the segment collector.
+    ///
+    /// Finally, the Collector merges each of the child collectors into itself for result usability
+    /// by the caller.
+    pub fn search<C: Collector>(&self, query: &Query, collector: &C) -> Result<C::Fruit> {
+        let executor = self.index.search_executor();
+        self.search_with_executor(query, collector, executor)
+    }
+
+    /// Same as [`search(...)`](#method.search) but multithreaded.
+    ///
+    /// The current implementation is rather naive :
+    /// multithreading is by splitting search into as many task
+    /// as there are segments.
+    ///
+    /// It is powerless at making search faster if your index consists in
+    /// one large segment.
+    ///
+    /// Also, keep in my multithreading a single query on several
+    /// threads will not improve your throughput. It can actually
+    /// hurt it. It will however, decrease the average response time.
+    pub fn search_with_executor<C: Collector>(
+        &self,
+        query: &Query,
+        collector: &C,
+        executor: &Executor,
+    ) -> Result<C::Fruit> {
+        let scoring_enabled = collector.requires_scoring();
+        let weight = query.weight(self, scoring_enabled)?;
+        let segment_readers = self.segment_readers();
+        let fruits = executor.map(
+            |(segment_ord, segment_reader)| {
+                collect_segment(
+                    collector,
+                    weight.as_ref(),
+                    segment_ord as u32,
+                    segment_reader,
+                )
+            },
+            segment_readers.iter().enumerate(),
+        )?;
+        collector.merge_fruits(fruits)
     }
 
     /// Return the field searcher associated to a `Field`.
@@ -99,6 +181,15 @@ impl Searcher {
             .collect::<Vec<_>>();
         FieldSearcher::new(inv_index_readers)
     }
+
+    /// Summarize total space usage of this searcher.
+    pub fn space_usage(&self) -> SearcherSpaceUsage {
+        let mut space_usage = SearcherSpaceUsage::new();
+        for segment_reader in self.segment_readers.iter() {
+            space_usage.add_segment(segment_reader.space_usage());
+        }
+        space_usage
+    }
 }
 
 pub struct FieldSearcher {
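With the collector rework above, `Searcher::search` now borrows the collector immutably and returns the merged `Fruit` instead of mutating the collector in place, and `search_with_executor` lets the caller pass an explicit `Executor`. A generic sketch of the new calling convention, not tied to any particular collector:

```rust
use tantivy::collector::Collector;
use tantivy::query::Query;
use tantivy::{Result, Searcher};

// The merged result ("fruit") is handed back to the caller rather than
// accumulated inside the collector, so the collector itself stays shared.
fn run_query<C: Collector>(
    searcher: &Searcher,
    query: &dyn Query,
    collector: &C,
) -> Result<C::Fruit> {
    searcher.search(query, collector)
}
```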
@@ -4,6 +4,7 @@ use core::InvertedIndexReader;
 use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
+use directory::ReadOnlySource;
 use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;
@@ -12,10 +13,10 @@ use fastfield::{self, FastFieldNotAvailableError};
 use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
 use fieldnorm::FieldNormReader;
 use schema::Cardinality;
-use schema::Document;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
+use space_usage::SegmentSpaceUsage;
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -53,7 +54,7 @@ pub struct SegmentReader {
     fast_fields_composite: CompositeFile,
     fieldnorms_composite: CompositeFile,
 
-    store_reader: StoreReader,
+    store_source: ReadOnlySource,
     delete_bitset_opt: Option<DeleteBitSet>,
     schema: Schema,
 }
@@ -196,8 +197,7 @@ impl SegmentReader {
     /// Accessor to the segment's `Field norms`'s reader.
     ///
     /// Field norms are the length (in tokens) of the fields.
-    /// It is used in the computation of the [TfIdf]
-    /// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
+    /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
     ///
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
@@ -215,8 +215,8 @@ impl SegmentReader {
     }
 
     /// Accessor to the segment's `StoreReader`.
-    pub fn get_store_reader(&self) -> &StoreReader {
-        &self.store_reader
+    pub fn get_store_reader(&self) -> StoreReader {
+        StoreReader::from_source(self.store_source.clone())
     }
 
     /// Open a new segment for reading.
@@ -225,7 +225,6 @@ impl SegmentReader {
         let termdict_composite = CompositeFile::open(&termdict_source)?;
 
         let store_source = segment.open_read(SegmentComponent::STORE)?;
-        let store_reader = StoreReader::from_source(store_source);
 
         fail_point!("SegmentReader::open#middle");
 
@@ -271,7 +270,7 @@ impl SegmentReader {
             fast_fields_composite,
             fieldnorms_composite,
             segment_id: segment.id(),
-            store_reader,
+            store_source,
             delete_bitset_opt,
             positions_composite,
             positions_idx_composite,
@@ -350,14 +349,6 @@ impl SegmentReader {
         inv_idx_reader
     }
 
-    /// Returns the document (or to be accurate, its stored field)
-    /// bearing the given doc id.
-    /// This method is slow and should seldom be called from
-    /// within a collector.
-    pub fn doc(&self, doc_id: DocId) -> Result<Document> {
-        self.store_reader.get(doc_id)
-    }
-
     /// Returns the segment id
     pub fn segment_id(&self) -> SegmentId {
         self.segment_id
@@ -381,6 +372,24 @@ impl SegmentReader {
     pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
         SegmentReaderAliveDocsIterator::new(&self)
    }
+
+    /// Summarize total space usage of this segment.
+    pub fn space_usage(&self) -> SegmentSpaceUsage {
+        SegmentSpaceUsage::new(
+            self.num_docs(),
+            self.termdict_composite.space_usage(),
+            self.postings_composite.space_usage(),
+            self.positions_composite.space_usage(),
+            self.positions_idx_composite.space_usage(),
+            self.fast_fields_composite.space_usage(),
+            self.fieldnorms_composite.space_usage(),
+            self.get_store_reader().space_usage(),
+            self.delete_bitset_opt
+                .as_ref()
+                .map(|x| x.space_usage())
+                .unwrap_or(0),
+        )
+    }
 }
 
 impl fmt::Debug for SegmentReader {
@@ -438,12 +447,12 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
 #[cfg(test)]
 mod test {
     use core::Index;
-    use schema::{SchemaBuilder, Term, STORED, TEXT};
+    use schema::{Schema, Term, STORED, TEXT};
     use DocId;
 
     #[test]
     fn test_alive_docs_iterator() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         schema_builder.add_text_field("name", TEXT | STORED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema.clone());
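Together with `Searcher::space_usage` above, the new `SegmentReader::space_usage` makes it possible to report how much room each index component takes. A minimal sketch (the `space_usage` module path is an assumption based on the imports shown in this diff):

```rust
use tantivy::space_usage::SearcherSpaceUsage;
use tantivy::Searcher;

// Per the diff above, the searcher simply sums up the per-segment reports.
fn report_space_usage(searcher: &Searcher) -> SearcherSpaceUsage {
    searcher.space_usage()
}
```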
@@ -1,7 +1,7 @@
 use core::MANAGED_FILEPATH;
 use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
 use directory::{ReadOnlySource, WritePtr};
-use error::TantivyError;
+use error::DataCorruption;
 use indexer::LockType;
 use serde_json;
 use std::collections::HashSet;
@@ -59,12 +59,17 @@ fn save_managed_paths(
 
 impl ManagedDirectory {
     /// Wraps a directory as managed directory.
-    pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
+    pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
         match directory.atomic_read(&MANAGED_FILEPATH) {
             Ok(data) => {
                 let managed_files_json = String::from_utf8_lossy(&data);
                 let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
-                    .map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
+                    .map_err(|e| {
+                        DataCorruption::new(
+                            MANAGED_FILEPATH.clone(),
+                            format!("Managed file cannot be deserialized: {:?}. ", e),
+                        )
+                    })?;
                 Ok(ManagedDirectory {
                     directory: Box::new(directory),
                     meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -260,7 +265,7 @@ mod tests {
         let tempdir_path = PathBuf::from(tempdir.path());
         {
             let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-            let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
+            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
             {
                 let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
                 write_file.flush().unwrap();
@@ -286,7 +291,7 @@ mod tests {
         }
         {
             let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-            let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
+            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
             {
                 assert!(managed_directory.exists(*TEST_PATH1));
                 assert!(!managed_directory.exists(*TEST_PATH2));
@@ -310,7 +315,7 @@ mod tests {
         let living_files = HashSet::new();
 
         let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-        let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
+        let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
         managed_directory
             .atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
             .unwrap();
@@ -364,6 +364,11 @@ mod tests {
 
     use super::*;
 
+    #[test]
+    fn test_open_non_existant_path() {
+        assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
+    }
+
     #[test]
     fn test_open_empty() {
         // empty file is actually an edge case because those
@@ -100,7 +100,8 @@ impl InnerDirectory {
             );
             let io_err = make_io_err(msg);
             OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
-        }).and_then(|readable_map| {
+        })
+        .and_then(|readable_map| {
             readable_map
                 .get(path)
                 .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
@@ -120,7 +121,8 @@ impl InnerDirectory {
             );
             let io_err = make_io_err(msg);
             DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
-        }).and_then(|mut writable_map| match writable_map.remove(path) {
+        })
+        .and_then(|mut writable_map| match writable_map.remove(path) {
             Some(_) => Ok(()),
             None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
         })
67 src/error.rs
@@ -8,18 +8,54 @@ use indexer::LockType;
 use query;
 use schema;
 use serde_json;
+use std::fmt;
 use std::path::PathBuf;
 use std::sync::PoisonError;
 
+pub struct DataCorruption {
+    filepath: Option<PathBuf>,
+    comment: String,
+}
+
+impl DataCorruption {
+    pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
+        DataCorruption {
+            filepath: Some(filepath),
+            comment,
+        }
+    }
+
+    pub fn comment_only(comment: String) -> DataCorruption {
+        DataCorruption {
+            filepath: None,
+            comment,
+        }
+    }
+}
+
+impl fmt::Debug for DataCorruption {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
+        write!(f, "Data corruption: ")?;
+        if let Some(ref filepath) = &self.filepath {
+            write!(f, "(in file `{:?}`)", filepath)?;
+        }
+        write!(f, ": {}.", self.comment)?;
+        Ok(())
+    }
+}
+
 /// The library's failure based error enum
 #[derive(Debug, Fail)]
 pub enum TantivyError {
     /// Path does not exist.
-    #[fail(display = "path does not exist: '{:?}'", _0)]
+    #[fail(display = "Path does not exist: '{:?}'", _0)]
     PathDoesNotExist(PathBuf),
     /// File already exists, this is a problem when we try to write into a new file.
-    #[fail(display = "file already exists: '{:?}'", _0)]
+    #[fail(display = "File already exists: '{:?}'", _0)]
     FileAlreadyExists(PathBuf),
+    /// Index already exists in this directory
+    #[fail(display = "Index already exists")]
+    IndexAlreadyExists,
     /// Failed to acquire file lock
     #[fail(
         display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
@@ -27,28 +63,35 @@ pub enum TantivyError {
     )]
     LockFailure(LockType),
     /// IO Error.
-    #[fail(display = "an IO error occurred: '{}'", _0)]
+    #[fail(display = "An IO error occurred: '{}'", _0)]
     IOError(#[cause] IOError),
-    /// The data within is corrupted.
-    ///
-    /// For instance, it contains invalid JSON.
-    #[fail(display = "file contains corrupted data: '{:?}'", _0)]
-    CorruptedFile(PathBuf),
+    /// Data corruption.
+    #[fail(display = "{:?}", _0)]
+    DataCorruption(DataCorruption),
     /// A thread holding the locked panicked and poisoned the lock.
-    #[fail(display = "a thread holding the locked panicked and poisoned the lock")]
+    #[fail(display = "A thread holding the locked panicked and poisoned the lock")]
     Poisoned,
     /// Invalid argument was passed by the user.
-    #[fail(display = "an invalid argument was passed: '{}'", _0)]
+    #[fail(display = "An invalid argument was passed: '{}'", _0)]
    InvalidArgument(String),
     /// An Error happened in one of the thread.
-    #[fail(display = "an error occurred in a thread: '{}'", _0)]
+    #[fail(display = "An error occurred in a thread: '{}'", _0)]
     ErrorInThread(String),
     /// An Error appeared related to the schema.
     #[fail(display = "Schema error: '{}'", _0)]
     SchemaError(String),
     /// Tried to access a fastfield reader for a field not configured accordingly.
-    #[fail(display = "fast field not available: '{:?}'", _0)]
+    #[fail(display = "Fast field not available: '{:?}'", _0)]
     FastFieldError(#[cause] FastFieldNotAvailableError),
+    /// System error. (e.g.: We failed spawning a new thread)
+    #[fail(display = "System error.'{}'", _0)]
+    SystemError(String),
+}
+
+impl From<DataCorruption> for TantivyError {
+    fn from(data_corruption: DataCorruption) -> TantivyError {
+        TantivyError::DataCorruption(data_corruption)
+    }
 }
 
 impl From<FastFieldNotAvailableError> for TantivyError {
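The error changes above replace `CorruptedFile(PathBuf)` with a `DataCorruption` payload that carries an optional file path and a free-form comment. A sketch of how calling code might distinguish it (a hypothetical helper, not part of the library):

```rust
use tantivy::TantivyError;

// Hypothetical helper: corrupted files now surface as a dedicated variant.
fn describe_error(err: &TantivyError) -> String {
    match err {
        TantivyError::DataCorruption(corruption) => format!("{:?}", corruption),
        other => format!("{}", other),
    }
}
```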
@@ -6,12 +6,12 @@ pub use self::writer::BytesFastFieldWriter;
 
 #[cfg(test)]
 mod tests {
-    use schema::SchemaBuilder;
+    use schema::Schema;
     use Index;
 
     #[test]
     fn test_bytes() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
         let field = schema_builder.add_bytes_field("bytesfield");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -2,6 +2,7 @@ use bit_set::BitSet;
 use common::HasLen;
 use directory::ReadOnlySource;
 use directory::WritePtr;
+use space_usage::ByteCount;
 use std::io;
 use std::io::Write;
 use DocId;
@@ -63,6 +64,11 @@ impl DeleteBitSet {
             b & (1u8 << shift) != 0
         }
     }
+
+    /// Summarize total space usage of this bitset.
+    pub fn space_usage(&self) -> ByteCount {
+        self.data.len()
+    }
 }
 
 impl HasLen for DeleteBitSet {
@@ -1,5 +1,6 @@
 use super::MultiValueIntFastFieldReader;
 use schema::Facet;
+use std::str;
 use termdict::TermDictionary;
 use termdict::TermOrdinal;
 use DocId;
@@ -20,6 +21,7 @@ use DocId;
 pub struct FacetReader {
     term_ords: MultiValueIntFastFieldReader<u64>,
     term_dict: TermDictionary,
+    buffer: Vec<u8>,
 }
 
 impl FacetReader {
@@ -37,6 +39,7 @@ impl FacetReader {
         FacetReader {
             term_ords,
             term_dict,
+            buffer: vec![],
         }
     }
 
@@ -55,11 +58,18 @@ impl FacetReader {
     }
 
     /// Given a term ordinal returns the term associated to it.
-    pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
+    pub fn facet_from_ord(
+        &mut self,
+        facet_ord: TermOrdinal,
+        output: &mut Facet,
+    ) -> Result<(), str::Utf8Error> {
         let found_term = self
             .term_dict
-            .ord_to_term(facet_ord as u64, output.inner_buffer_mut());
+            .ord_to_term(facet_ord as u64, &mut self.buffer);
         assert!(found_term, "Term ordinal {} no found.", facet_ord);
+        let facet_str = str::from_utf8(&self.buffer[..])?;
+        output.set_facet_str(facet_str);
+        Ok(())
     }
 
     /// Return the list of facet ordinals associated to a document.
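`FacetReader::facet_from_ord` above now takes `&mut self` (it reuses an internal byte buffer) and reports invalid UTF-8 instead of writing into the facet unchecked. A small sketch of the new call shape (the `tantivy::fastfield::FacetReader` path is assumed from the imports in this changeset):

```rust
use std::str::Utf8Error;

use tantivy::fastfield::FacetReader;
use tantivy::schema::Facet;

fn facet_for_ord(facet_reader: &mut FacetReader, ord: u64) -> Result<Facet, Utf8Error> {
    let mut facet = Facet::root();
    // Fails if the bytes stored for this ordinal are not valid UTF-8.
    facet_reader.facet_from_ord(ord, &mut facet)?;
    Ok(facet)
}
```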
@@ -127,19 +127,19 @@ mod tests {
     use common::CompositeFile;
     use directory::{Directory, RAMDirectory, WritePtr};
     use fastfield::FastFieldReader;
-    use rand::Rng;
+    use rand::prelude::SliceRandom;
+    use rand::rngs::StdRng;
     use rand::SeedableRng;
-    use rand::XorShiftRng;
     use schema::Document;
     use schema::Field;
+    use schema::Schema;
     use schema::FAST;
-    use schema::{Schema, SchemaBuilder};
     use std::collections::HashMap;
     use std::path::Path;
 
     lazy_static! {
         pub static ref SCHEMA: Schema = {
-            let mut schema_builder = SchemaBuilder::default();
+            let mut schema_builder = Schema::builder();
             schema_builder.add_u64_field("field", FAST);
             schema_builder.build()
         };
@@ -298,7 +298,7 @@ mod tests {
     fn test_signed_intfastfield() {
         let path = Path::new("test");
         let mut directory: RAMDirectory = RAMDirectory::create();
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
 
         let i64_field = schema_builder.add_i64_field("field", FAST);
         let schema = schema_builder.build();
@@ -342,7 +342,7 @@ mod tests {
     fn test_signed_intfastfield_default_val() {
         let path = Path::new("test");
         let mut directory: RAMDirectory = RAMDirectory::create();
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let i64_field = schema_builder.add_i64_field("field", FAST);
         let schema = schema_builder.build();
 
@@ -367,11 +367,10 @@ mod tests {
         }
     }
 
+    // Warning: this generates the same permutation at each call
     pub fn generate_permutation() -> Vec<u64> {
-        let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-        let mut rng = XorShiftRng::from_seed(seed);
         let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
-        rng.shuffle(&mut permutation);
+        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
         permutation
     }
 
@@ -9,12 +9,12 @@ mod tests {
 
     use schema::Cardinality;
     use schema::IntOptions;
-    use schema::SchemaBuilder;
+    use schema::Schema;
     use Index;
 
     #[test]
     fn test_multivalued_u64() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
         let field = schema_builder.add_u64_field(
             "multifield",
             IntOptions::default().set_fast(Cardinality::MultiValues),
@@ -49,7 +49,7 @@ mod tests {
 
     #[test]
     fn test_multivalued_i64() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
         let field = schema_builder.add_i64_field(
             "multifield",
             IntOptions::default().set_fast(Cardinality::MultiValues),
@@ -47,11 +47,11 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
 mod tests {
 
     use core::Index;
-    use schema::{Document, Facet, SchemaBuilder};
+    use schema::{Document, Facet, Schema};
 
     #[test]
     fn test_multifastfield_reader() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let facet_field = schema_builder.add_facet_field("facets");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -82,20 +82,20 @@ mod tests {
 
         let mut facet = Facet::root();
         {
-            facet_reader.facet_from_ord(1, &mut facet);
+            facet_reader.facet_from_ord(1, &mut facet).unwrap();
             assert_eq!(facet, Facet::from("/category"));
         }
         {
-            facet_reader.facet_from_ord(2, &mut facet);
+            facet_reader.facet_from_ord(2, &mut facet).unwrap();
             assert_eq!(facet, Facet::from("/category/cat1"));
         }
         {
-            facet_reader.facet_from_ord(3, &mut facet);
+            facet_reader.facet_from_ord(3, &mut facet).unwrap();
             assert_eq!(format!("{}", facet), "/category/cat2");
             assert_eq!(facet, Facet::from("/category/cat2"));
         }
         {
-            facet_reader.facet_from_ord(4, &mut facet);
+            facet_reader.facet_from_ord(4, &mut facet).unwrap();
             assert_eq!(facet, Facet::from("/category/cat3"));
         }
 
@@ -7,7 +7,7 @@ use directory::ReadOnlySource;
 use directory::{Directory, RAMDirectory, WritePtr};
 use fastfield::{FastFieldSerializer, FastFieldsWriter};
 use owning_ref::OwningRef;
-use schema::SchemaBuilder;
+use schema::Schema;
 use schema::FAST;
 use std::collections::HashMap;
 use std::marker::PhantomData;
@@ -108,7 +108,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
 
 impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
     fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
         let field = schema_builder.add_u64_field("field", FAST);
         let schema = schema_builder.build();
         let path = Path::new("__dummy__");
@@ -15,7 +15,7 @@
 //! precompute computationally expensive functions of the fieldnorm
 //! in a very short array.
 //!
-//! This trick is used by the [BM25 similarity]().
+//! This trick is used by the BM25 similarity.
 mod code;
 mod reader;
 mod serializer;
@@ -1,7 +1,6 @@
 use rand::thread_rng;
 use std::collections::HashSet;
 
-use rand::distributions::Range;
 use rand::Rng;
 use schema::*;
 use Index;
@@ -16,7 +15,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
 #[ignore]
 #[cfg(feature = "mmap")]
 fn test_indexing() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
 
     let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
     let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
@@ -24,7 +23,6 @@ fn test_indexing() {
 
     let index = Index::create_from_tempdir(schema).unwrap();
 
-    let universe = Range::new(0u64, 20u64);
     let mut rng = thread_rng();
 
     let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
@@ -33,7 +31,7 @@ fn test_indexing() {
     let mut uncommitted_docs: HashSet<u64> = HashSet::new();
 
     for _ in 0..200 {
-        let random_val = rng.sample(&universe);
+        let random_val = rng.gen_range(0, 20);
         if random_val == 0 {
             index_writer.commit().expect("Commit failed");
             committed_docs.extend(&uncommitted_docs);
@@ -191,10 +191,7 @@ impl DeleteCursor {
         }
     }
 
-    #[cfg_attr(
-        feature = "cargo-clippy",
-        allow(clippy::wrong_self_convention)
-    )]
+    #[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
     fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
         self.get()
             .map(|operation| operation.opstamp < target_opstamp)
@@ -8,7 +8,7 @@ use core::SegmentComponent;
|
|||||||
use core::SegmentId;
|
use core::SegmentId;
|
||||||
use core::SegmentMeta;
|
use core::SegmentMeta;
|
||||||
use core::SegmentReader;
|
use core::SegmentReader;
|
||||||
use crossbeam_channel as channel;
|
use crossbeam::channel;
|
||||||
use docset::DocSet;
|
use docset::DocSet;
|
||||||
use error::TantivyError;
|
use error::TantivyError;
|
||||||
use fastfield::write_delete_bitset;
|
use fastfield::write_delete_bitset;
|
||||||
@@ -61,7 +61,8 @@ fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
|||||||
"Per thread memory is too small: {}",
|
"Per thread memory is too small: {}",
|
||||||
per_thread_memory_budget
|
per_thread_memory_budget
|
||||||
)
|
)
|
||||||
}).min(19) // we cap it at 512K
|
})
|
||||||
|
.min(19) // we cap it at 512K
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||||
@@ -139,7 +140,7 @@ pub fn open_index_writer(
|
|||||||
let stamper = Stamper::new(current_opstamp);
|
let stamper = Stamper::new(current_opstamp);
|
||||||
|
|
||||||
let segment_updater =
|
let segment_updater =
|
||||||
- SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
+ SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;

let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),

@@ -388,11 +389,13 @@ impl IndexWriter {
let mem_budget = self.heap_size_in_bytes_per_thread;
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!(
- "indexing thread {} for gen {}",
+ "thrd-tantivy-index{}-gen{}",
self.worker_id, generation
- )).spawn(move || {
+ ))
+ .spawn(move || {
loop {
- let mut document_iterator = document_receiver_clone.clone().peekable();
+ let mut document_iterator =
+ document_receiver_clone.clone().into_iter().peekable();

// the peeking here is to avoid
// creating a new segment's files

@@ -464,10 +467,8 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
- let (mut document_sender, mut document_receiver): (
- DocumentSender,
- DocumentReceiver,
- ) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
+ let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
+ channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver

@@ -557,11 +558,8 @@ impl IndexWriter {
// and recreate a new one channels.
self.recreate_document_channel();

- let mut former_workers_join_handle = Vec::new();
- swap(
- &mut former_workers_join_handle,
- &mut self.workers_join_handle,
- );
+ let former_workers_join_handle =
+ mem::replace(&mut self.workers_join_handle, Vec::new());

for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle

@@ -640,7 +638,10 @@ impl IndexWriter {
pub fn add_document(&mut self, document: Document) -> u64 {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document };
- self.document_sender.send(add_operation);
+ let send_result = self.document_sender.send(add_operation);
+ if let Err(e) = send_result {
+ panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
+ }
opstamp
}
}
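Note on the `add_document` change above: with the move to crossbeam 0.5, the document channel's `send` returns a `Result`, and an error can only mean that every indexing worker has already hung up, so the writer now panics with a clear message instead of silently dropping the document. A minimal stand-alone sketch of the same pattern, using `std::sync::mpsc` rather than tantivy's actual channel types (the `TinyWriter` struct and its field are illustrative only):

use std::sync::mpsc::{channel, Sender};
use std::thread;

// Illustrative stand-in for the IndexWriter / DocumentSender pair.
struct TinyWriter {
    sender: Sender<String>,
}

impl TinyWriter {
    fn add_document(&mut self, doc: String) {
        // `send` only fails when the receiving side is gone, i.e. the worker
        // thread has exited; surfacing that as a panic mirrors the behaviour
        // added in the diff above.
        if let Err(e) = self.sender.send(doc) {
            panic!("indexing worker is gone, cannot queue document: {:?}", e);
        }
    }
}

fn main() {
    let (tx, rx) = channel();
    let worker = thread::spawn(move || {
        for doc in rx {
            println!("indexing {}", doc);
        }
    });
    let mut writer = TinyWriter { sender: tx };
    writer.add_document("hello".to_string());
    drop(writer); // dropping the sender ends the worker's loop
    worker.join().unwrap();
}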
@@ -657,7 +658,7 @@ mod tests {

#[test]
fn test_lockfile_stops_duplicates() {
- let schema_builder = schema::SchemaBuilder::default();
+ let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(40_000_000).unwrap();
match index.writer(40_000_000) {

@@ -668,7 +669,7 @@ mod tests {

#[test]
fn test_lockfile_already_exists_error_msg() {
- let schema_builder = schema::SchemaBuilder::default();
+ let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
match index.writer_with_num_threads(1, 3_000_000) {

@@ -683,7 +684,7 @@ mod tests {

#[test]
fn test_set_merge_policy() {
- let schema_builder = schema::SchemaBuilder::default();
+ let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(

@@ -701,7 +702,7 @@ mod tests {

#[test]
fn test_lockfile_released_on_drop() {
- let schema_builder = schema::SchemaBuilder::default();
+ let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer = index.writer(40_000_000).unwrap();

@@ -713,7 +714,7 @@ mod tests {

#[test]
fn test_commit_and_rollback() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());

@@ -735,7 +736,7 @@ mod tests {
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
- assert_eq!(index_writer.commit().unwrap(), 2u64);
+ assert_eq!(index_writer.commit().unwrap(), 3u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);

@@ -747,7 +748,7 @@ mod tests {

#[test]
fn test_with_merges() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let num_docs_containing = |s: &str| {

@@ -784,7 +785,7 @@ mod tests {

#[test]
fn test_prepare_with_commit_message() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());

@@ -798,7 +799,6 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
- assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed");
}
{

@@ -818,7 +818,7 @@ mod tests {

#[test]
fn test_prepare_but_rollback() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());

@@ -832,7 +832,6 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
- assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed");
}
{

@@ -866,7 +865,7 @@ mod tests {
#[test]
fn test_write_commit_fails() {
use fail;
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());

@@ -40,13 +40,15 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
- total_tokens + count
- .iter()
- .cloned()
- .enumerate()
- .map(|(fieldnorm_ord, count)| {
- count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
- }).sum::<u64>()
+ total_tokens
+ + count
+ .iter()
+ .cloned()
+ .enumerate()
+ .map(|(fieldnorm_ord, count)| {
+ count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
+ })
+ .sum::<u64>()
}

pub struct IndexMerger {

@@ -523,7 +525,8 @@ impl IndexMerger {
}
}
None
- }).collect();
+ })
+ .collect();

// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.

@@ -614,7 +617,7 @@ impl IndexMerger {
store_writer.store(&doc)?;
}
} else {
- store_writer.stack(store_reader)?;
+ store_writer.stack(&store_reader)?;
}
}
Ok(())

@@ -635,10 +638,9 @@ impl SerializableSegment for IndexMerger {
#[cfg(test)]
mod tests {
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
- use collector::chain;
use collector::tests::TestCollector;
use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
- use collector::FacetCollector;
+ use collector::{Count, FacetCollector};
use core::Index;
use futures::Future;
use query::AllQuery;

@@ -647,10 +649,12 @@ mod tests {
use schema;
use schema::Cardinality;
use schema::Document;
+ use schema::Facet;
use schema::IndexRecordOption;
use schema::IntOptions;
use schema::Term;
use schema::TextFieldIndexing;
+ use schema::INT_INDEXED;
use std::io::Cursor;
use DocAddress;
use IndexWriter;

@@ -658,13 +662,14 @@ mod tests {

#[test]
fn test_index_merger_no_deletes() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
- ).set_stored();
+ )
+ .set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);

@@ -742,27 +747,32 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
- let mut collector = TestCollector::default();
let query = BooleanQuery::new_multiterms_query(terms);
- assert!(searcher.search(&query, &mut collector).is_ok());
- collector.docs()
+ let top_docs = searcher.search(&query, &TestCollector).unwrap();
+ top_docs.docs().to_vec()
};
{
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
- vec![1, 2, 4]
+ vec![DocAddress(0, 1), DocAddress(0, 2), DocAddress(0, 4)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
- vec![0, 3]
+ vec![DocAddress(0, 0), DocAddress(0, 3)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
- vec![4]
+ vec![DocAddress(0, 4)]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
- vec![0, 1, 2, 3, 4]
+ vec![
+ DocAddress(0, 0),
+ DocAddress(0, 1),
+ DocAddress(0, 2),
+ DocAddress(0, 3),
+ DocAddress(0, 4)
+ ]
);
}
{
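The rewritten `get_doc_ids` helper above reflects the 0.8 collector API: collectors are passed by shared reference and `Searcher::search` hands back each collector's result (its "fruit") instead of mutating the collector in place, and a tuple of collectors acts as a single combined collector. A minimal end-to-end sketch of that calling convention, assembled from calls that appear elsewhere in this diff plus the stock `Count` and `TopDocs` collectors (exact signatures may differ slightly between 0.8.x patch releases):

use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{Document, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
    let mut doc = Document::default();
    doc.add_text(text, "the old man and the sea");
    writer.add_document(doc);
    writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let query = QueryParser::for_index(&index, vec![text]).parse_query("sea")?;
    // A tuple of collectors behaves as one collector; `search` returns each
    // collector's fruit instead of mutating the collectors in place.
    let (count, top_docs) = searcher.search(&query, &(Count, TopDocs::with_limit(10)))?;
    assert_eq!(count, 1);
    for (_score, doc_address) in top_docs {
        println!("{}", index.schema().to_json(&searcher.doc(doc_address)?));
    }
    Ok(())
}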
@@ -788,17 +798,18 @@ mod tests {
{
let get_fast_vals = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
- let mut collector = FastFieldTestCollector::for_field(score_field);
- assert!(searcher.search(&query, &mut collector).is_ok());
- collector.vals()
+ searcher
+ .search(&query, &FastFieldTestCollector::for_field(score_field))
+ .unwrap()
};
let get_fast_vals_bytes = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
- let mut collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
searcher
- .search(&query, &mut collector)
- .expect("failed to search");
- collector.vals()
+ .search(
+ &query,
+ &BytesFastFieldTestCollector::for_field(bytes_score_field),
+ )
+ .expect("failed to search")
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),

@@ -814,11 +825,12 @@ mod tests {

#[test]
fn test_index_merger_with_deletes() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
- ).set_stored();
+ )
+ .set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);

@@ -827,21 +839,13 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();

let search_term = |searcher: &Searcher, term: Term| {
- let mut collector = FastFieldTestCollector::for_field(score_field);
- let mut bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
+ let collector = FastFieldTestCollector::for_field(score_field);
+ let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
- {
- let mut combined_collector =
- chain().push(&mut collector).push(&mut bytes_collector);
- searcher
- .search(&term_query, &mut combined_collector)
- .unwrap();
- }
- let scores = collector.vals();
- let mut score_bytes = Cursor::new(bytes_collector.vals());
+ let (scores, bytes) = searcher
+ .search(&term_query, &(collector, bytes_collector))
+ .unwrap();
+ let mut score_bytes = Cursor::new(bytes);
for &score in &scores {
assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
}

@@ -854,21 +858,21 @@ mod tests {
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();

@@ -895,37 +899,37 @@ mod tests {
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
let searcher = index.searcher();

assert_eq!(searcher.segment_readers().len(), 2);
assert_eq!(searcher.num_docs(), 3);
- assert_eq!(searcher.segment_readers()[0].num_docs(), 1);
- assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
- assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
- assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
+ assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
+ assert_eq!(searcher.segment_readers()[0].max_doc(), 4);
+ assert_eq!(searcher.segment_readers()[1].num_docs(), 1);
+ assert_eq!(searcher.segment_readers()[1].max_doc(), 3);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec

@@ -959,15 +963,15 @@ mod tests {
.segment_reader(0)
.fast_field_reader::<u64>(score_field)
.unwrap();
- assert_eq!(score_field_reader.min_value(), 1);
- assert_eq!(score_field_reader.max_value(), 3);
+ assert_eq!(score_field_reader.min_value(), 4000);
+ assert_eq!(score_field_reader.max_value(), 7000);

let score_field_reader = searcher
.segment_reader(1)
.fast_field_reader::<u64>(score_field)
.unwrap();
- assert_eq!(score_field_reader.min_value(), 4000);
- assert_eq!(score_field_reader.max_value(), 7000);
+ assert_eq!(score_field_reader.min_value(), 1);
+ assert_eq!(score_field_reader.max_value(), 3);
}
{
// merging the segments

@@ -980,7 +984,7 @@ mod tests {
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
- let ref searcher = *index.searcher();
+ let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);

@@ -1026,7 +1030,7 @@ mod tests {
index_writer.commit().unwrap();

index.load_searchers().unwrap();
- let ref searcher = *index.searcher();
+ let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);

@@ -1122,6 +1126,7 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
+ index_writer.commit().unwrap();
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");

@@ -1140,10 +1145,9 @@ mod tests {

#[test]
fn test_merge_facets() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build());
- use schema::Facet;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {

@@ -1172,20 +1176,16 @@ mod tests {
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_writer.commit().expect("committed");
}

index.load_searchers().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
- use collector::{CountCollector, MultiCollector};
- let mut count_collector = CountCollector::default();
- {
- let mut multi_collectors =
- MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
- searcher.search(&AllQuery, &mut multi_collectors).unwrap();
- }
- assert_eq!(count_collector.count(), expected_num_docs);
- let facet_counts = facet_collector.harvest();
+ let (count, facet_counts) = searcher
+ .search(&AllQuery, &(Count, facet_collector))
+ .unwrap();
+ assert_eq!(count, expected_num_docs);
let facets: Vec<(String, u64)> = facet_counts
.get("/top")
.map(|(facet, count)| (facet.to_string(), count))
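The same convention applies to faceting, as the rewritten `test_searcher` closure above shows: the `FacetCollector` is configured, handed to `search` next to `Count`, and its counts come back as part of the returned tuple. A stand-alone sketch of that flow; the field name, the `/top/...` paths and the `Document::add_facet` call are assumptions for illustration rather than code taken from this diff:

use tantivy::collector::{Count, FacetCollector};
use tantivy::query::AllQuery;
use tantivy::schema::{Facet, Schema};
use tantivy::{Document, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let facet_field = schema_builder.add_facet_field("facet");
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
    for path in &["/top/a", "/top/a", "/top/b"] {
        let mut doc = Document::default();
        doc.add_facet(facet_field, Facet::from(*path)); // assumed helper on Document
        writer.add_document(doc);
    }
    writer.commit()?;
    index.load_searchers()?;

    let mut facet_collector = FacetCollector::for_field(facet_field);
    facet_collector.add_facet(Facet::from("/top"));
    // Facet counts come back as the collector's fruit, next to the doc count.
    let searcher = index.searcher();
    let (count, facet_counts) = searcher.search(&AllQuery, &(Count, facet_collector))?;
    assert_eq!(count, 3);
    for (facet, n) in facet_counts.get("/top") {
        println!("{} => {}", facet.to_string(), n);
    }
    Ok(())
}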
@@ -1209,7 +1209,6 @@ mod tests {
("/top/f", 1),
],
);

// Merging the segments
{
let segment_ids = index

@@ -1222,7 +1221,6 @@ mod tests {
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();

index.load_searchers().unwrap();
test_searcher(
11,

@@ -1259,9 +1257,37 @@ mod tests {
}
}

+ #[test]
+ fn test_bug_merge() {
+ let mut schema_builder = schema::Schema::builder();
+ let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
+ let index = Index::create_in_ram(schema_builder.build());
+ let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ index_writer.add_document(doc!(int_field => 1u64));
+ index_writer.commit().expect("commit failed");
+ index_writer.add_document(doc!(int_field => 1u64));
+ index_writer.commit().expect("commit failed");
+ index.load_searchers().unwrap();
+ let searcher = index.searcher();
+ assert_eq!(searcher.num_docs(), 2);
+ index_writer.delete_term(Term::from_field_u64(int_field, 1));
+ let segment_ids = index
+ .searchable_segment_ids()
+ .expect("Searchable segments failed.");
+ index_writer
+ .merge(&segment_ids)
+ .expect("Failed to initiate merge")
+ .wait()
+ .expect("Merging failed");
+ index.load_searchers().unwrap();
+ // commit has not been called yet. The document should still be
+ // there.
+ assert_eq!(index.searcher().num_docs(), 2);
+ }

#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed();

@@ -1302,7 +1328,7 @@ mod tests {

#[test]
fn test_merge_multivalued_int_fields() {
- let mut schema_builder = schema::SchemaBuilder::default();
+ let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed();

@@ -1368,15 +1394,17 @@ mod tests {
assert_eq!(&vals, &[17]);
}

- {
- let segment = searcher.segment_reader(1u32);
- let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
- ff_reader.get_vals(0, &mut vals);
- assert_eq!(&vals, &[20]);
- }
+ println!(
+ "{:?}",
+ searcher
+ .segment_readers()
+ .iter()
+ .map(|reader| reader.max_doc())
+ .collect::<Vec<_>>()
+ );

{
- let segment = searcher.segment_reader(2u32);
+ let segment = searcher.segment_reader(1u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[28, 27]);

@@ -1385,6 +1413,13 @@ mod tests {
assert_eq!(&vals, &[1_000]);
}

+ {
+ let segment = searcher.segment_reader(2u32);
+ let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+ ff_reader.get_vals(0, &mut vals);
+ assert_eq!(&vals, &[20]);
+ }

// Merging the segments
{
let segment_ids = index

@@ -1403,6 +1438,14 @@ mod tests {

{
let searcher = index.searcher();
+ println!(
+ "{:?}",
+ searcher
+ .segment_readers()
+ .iter()
+ .map(|reader| reader.max_doc())
+ .collect::<Vec<_>>()
+ );
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();

@@ -1428,13 +1471,13 @@ mod tests {
assert_eq!(&vals, &[17]);

ff_reader.get_vals(7, &mut vals);
- assert_eq!(&vals, &[20]);

- ff_reader.get_vals(8, &mut vals);
assert_eq!(&vals, &[28, 27]);

- ff_reader.get_vals(9, &mut vals);
+ ff_reader.get_vals(8, &mut vals);
assert_eq!(&vals, &[1_000]);

+ ff_reader.get_vals(9, &mut vals);
+ assert_eq!(&vals, &[20]);
}
}
}

@@ -18,7 +18,6 @@ use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
- use indexer::MergeCandidate;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy};

@@ -45,8 +44,15 @@ use Result;
/// and flushed.
///
/// This method is not part of tantivy's public API
- pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
- save_metas(vec![], schema, opstamp, None, directory)
+ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
+ save_metas(
+ &IndexMeta {
+ segments: Vec::new(),
+ schema,
+ opstamp: 0u64,
+ payload: None
+ },
+ directory)
}

/// Save the index meta file.

@@ -58,20 +64,17 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
/// and flushed.
///
/// This method is not part of tantivy's public API
- pub fn save_metas(
- segment_metas: Vec<SegmentMeta>,
- schema: Schema,
- opstamp: u64,
- payload: Option<String>,
+ fn save_metas(
+ metas: &IndexMeta,
directory: &mut Directory,
) -> Result<()> {
- let metas = IndexMeta {
- segments: segment_metas,
- schema,
- opstamp,
- payload,
- };
- let mut buffer = serde_json::to_vec_pretty(&metas)?;
+ // let metas = IndexMeta {
+ // segments: segment_metas,
+ // schema,
+ // opstamp,
+ // payload,
+ // };
+ let mut buffer = serde_json::to_vec_pretty(metas)?;
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));

@@ -86,6 +89,11 @@ pub fn save_metas(
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);

+ struct MergeOperation {
+ pub target_opstamp: u64,
+ pub segment_ids: Vec<SegmentId>,
+ }

fn perform_merge(
index: &Index,
mut segment_entries: Vec<SegmentEntry>,

@@ -126,6 +134,13 @@ fn perform_merge(
}

struct InnerSegmentUpdater {
+ // we keep a copy of the current active IndexMeta to
+ // avoid loading the file everytime we need it in the
+ // `SegmentUpdater`.
+ //
+ // This should be up to date as all update happen through
+ // the unique active `SegmentUpdater`.
+ active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,

@@ -138,7 +153,7 @@ struct InnerSegmentUpdater {
}

impl SegmentUpdater {
- pub fn new(
+ pub fn create(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,

@@ -149,7 +164,9 @@ impl SegmentUpdater {
.name_prefix("segment_updater")
.pool_size(1)
.create();
+ let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
+ active_metas: RwLock::new(Arc::new(index_meta)),
pool,
index,
segment_manager,

@@ -195,7 +212,8 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
- }).forget();
+ })
+ .forget();
true
} else {
false

@@ -227,20 +245,42 @@ impl SegmentUpdater {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
- save_metas(
- self.0.segment_manager.committed_segment_metas(),
- index.schema(),
+ let mut commited_segment_metas = self.0.segment_manager.committed_segment_metas();
+ // We sort segment_readers by number of documents.
+ // This is an heuristic to make multithreading more efficient.
+ //
+ // This is not done at the searcher level because I had a strange
+ // use case in which I was dealing with a large static index,
+ // dispatched over 5 SSD drives.
+ //
+ // A `UnionDirectory` makes it possible to read from these
+ // 5 different drives and creates a meta.json on the fly.
+ // In order to optimize the throughput, it creates a lasagna of segments
+ // from the different drives.
+ //
+ // Segment 1 from disk 1, Segment 1 from disk 2, etc.
+ commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
+ let index_meta = IndexMeta {
+ segments: commited_segment_metas,
+ schema: index.schema(),
opstamp,
- commit_message,
+ payload: commit_message
+ };
+ save_metas(
+ &index_meta,
directory.box_clone().borrow_mut(),
- ).expect("Could not save metas.");
+ )
+ .expect("Could not save metas.");
+ self.store_meta(&index_meta);
}
}

pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
- }).wait()
+ })
+ .wait()
}

fn garbage_collect_files_exec(&self) {
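The new `sort_by_key` call above orders the committed segments from largest to smallest `max_doc` by negating the key, which is what makes the multithreaded-search heuristic described in the comment effective. A small self-contained sketch of that descending-sort trick, together with the `std::cmp::Reverse` form that expresses the same intent without the `i32` cast (the values are arbitrary):

use std::cmp::Reverse;

fn main() {
    let mut max_docs: Vec<u32> = vec![3, 400_000, 12, 7_000];

    // Negating the key flips the order, as in the diff above ...
    max_docs.sort_by_key(|&n| -(n as i32));
    assert_eq!(max_docs, vec![400_000, 7_000, 12, 3]);

    // ... and `Reverse` gives the same descending sort without a cast
    // that could overflow for very large values.
    max_docs.sort_by_key(|&n| Reverse(n));
    assert_eq!(max_docs, vec![400_000, 7_000, 12, 3]);
}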
@@ -262,19 +302,32 @@ impl SegmentUpdater {
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
- }).wait()
+ })
+ .wait()
}

pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
- //let future_merged_segment = */
let segment_ids_vec = segment_ids.to_vec();
+ let commit_opstamp = self.load_metas().opstamp;
self.run_async(move |segment_updater| {
- segment_updater.start_merge_impl(&segment_ids_vec[..])
- }).wait()?
+ segment_updater.start_merge_impl(&segment_ids_vec[..], commit_opstamp)
+ })
+ .wait()?
+ }
+
+ fn store_meta(&self, index_meta: &IndexMeta) {
+ *self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
+ }
+ fn load_metas(&self) -> Arc<IndexMeta> {
+ self.0.active_metas.read().unwrap().clone()
}

// `segment_ids` is required to be non-empty.
- fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
+ fn start_merge_impl(
+ &self,
+ segment_ids: &[SegmentId],
+ target_opstamp: u64,
+ ) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");

let segment_updater_clone = self.clone();

@@ -289,8 +342,6 @@ impl SegmentUpdater {
);
let (merging_future_send, merging_future_recv) = oneshot();

- let target_opstamp = self.0.stamper.stamp();

// first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id))

@@ -336,7 +387,8 @@ impl SegmentUpdater {
.unwrap()
.remove(&merging_thread_id);
Ok(())
- }).expect("Failed to spawn a thread.");
+ })
+ .expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()

@@ -351,11 +403,32 @@ impl SegmentUpdater {
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
- let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
- let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
- merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
- for MergeCandidate(segment_metas) in merge_candidates {
- match self.start_merge_impl(&segment_metas) {
+ let current_opstamp = self.0.stamper.stamp();
+ let mut merge_candidates = merge_policy
+ .compute_merge_candidates(&uncommitted_segments)
+ .into_iter()
+ .map(|merge_candidate| MergeOperation {
+ target_opstamp: current_opstamp,
+ segment_ids: merge_candidate.0,
+ })
+ .collect::<Vec<_>>();
+ let commit_opstamp = self.load_metas().opstamp;
+ let committed_merge_candidates = merge_policy
+ .compute_merge_candidates(&committed_segments)
+ .into_iter()
+ .map(|merge_candidate| MergeOperation {
+ target_opstamp: commit_opstamp,
+ segment_ids: merge_candidate.0,
+ })
+ .collect::<Vec<_>>();
+ merge_candidates.extend(committed_merge_candidates.into_iter());
+ for MergeOperation {
+ target_opstamp,
+ segment_ids,
+ } in merge_candidates
+ {
+ match self.start_merge_impl(&segment_ids, target_opstamp) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);

@@ -390,12 +463,7 @@ impl SegmentUpdater {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
- let committed_opstamp = segment_updater
- .0
- .index
- .load_metas()
- .expect("Failed to read opstamp")
- .opstamp;
+ let committed_opstamp = segment_updater.load_metas().opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());

@@ -424,10 +492,11 @@ impl SegmentUpdater {
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
- let previous_metas = segment_updater.0.index.load_metas().unwrap();
- segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
+ let previous_metas = segment_updater.load_metas();
+ segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
segment_updater.garbage_collect_files_exec();
- }).wait()
+ })
+ .wait()
}

/// Wait for current merging threads.

@@ -484,7 +553,7 @@ mod tests {

#[test]
fn test_delete_during_merge() {
- let mut schema_builder = SchemaBuilder::default();
+ let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();

@@ -62,7 +62,8 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
- }).collect();
+ })
+ .collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,

@@ -110,18 +111,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
- let facets: Vec<&[u8]> = field_values
+ let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
- Value::Facet(ref facet) => Some(facet.encoded_bytes()),
+ Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
- }).collect();
+ })
+ .collect();
let mut term = Term::for_field(field); // we set the Term
- for facet_bytes in facets {
+ for fake_str in facets {
let mut unordered_term_id_opt = None;
- let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =

@@ -145,7 +146,8 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
- }).collect();
+ })
+ .collect();
if texts.is_empty() {
0
} else {

@@ -1,50 +1,68 @@
+ use std::sync::Arc;
+ use std::sync::atomic::Ordering;

// AtomicU64 have not landed in stable.
// For the moment let's just use AtomicUsize on
// x86/64 bit platform, and a mutex on other platform.
- #[cfg(target = "x86_64")]
+ #[cfg(target_arch = "x86_64")]
mod archicture_impl {

use std::sync::atomic::{AtomicUsize, Ordering};
- use std::sync::Arc;

- #[derive(Clone, Default)]
- pub struct Stamper(Arc<AtomicU64>);
+ #[derive(Default)]
+ pub struct AtomicU64Ersatz(AtomicUsize);

- impl Stamper {
- pub fn new(first_opstamp: u64) -> Stamper {
- Stamper(Arc::new(AtomicU64::new(first_opstamp)))
+ impl AtomicU64Ersatz {
+ pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
+ AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
}

- pub fn stamp(&self) -> u64 {
- self.0.fetch_add(1u64, Ordering::SeqCst) as u64
+ pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
+ self.0.fetch_add(val as usize, order) as u64
}
}
}

- #[cfg(not(target = "x86_64"))]
+ #[cfg(not(target_arch = "x86_64"))]
mod archicture_impl {

- use std::sync::{Arc, Mutex};
+ use std::sync::atomic::Ordering;
+ /// Under other architecture, we rely on a mutex.
+ use std::sync::RwLock;

- #[derive(Clone, Default)]
- pub struct Stamper(Arc<Mutex<u64>>);
+ #[derive(Default)]
+ pub struct AtomicU64Ersatz(RwLock<u64>);

- impl Stamper {
- pub fn new(first_opstamp: u64) -> Stamper {
- Stamper(Arc::new(Mutex::new(first_opstamp)))
+ impl AtomicU64Ersatz {
+ pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
+ AtomicU64Ersatz(RwLock::new(first_opstamp))
}

- pub fn stamp(&self) -> u64 {
- let mut guard = self.0.lock().expect("Failed to lock the stamper");
- let previous_val = *guard;
- *guard = previous_val + 1;
+ pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
+ let mut lock = self.0.write().unwrap();
+ let previous_val = *lock;
+ *lock = previous_val + incr;
previous_val
}
}
}

- pub use self::archicture_impl::Stamper;
+ use self::archicture_impl::AtomicU64Ersatz;

+ #[derive(Clone, Default)]
+ pub struct Stamper(Arc<AtomicU64Ersatz>);
+
+ impl Stamper {
+ pub fn new(first_opstamp: u64) -> Stamper {
+ Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
+ }
+
+ pub fn stamp(&self) -> u64 {
+ self.0.fetch_add(1u64, Ordering::SeqCst) as u64
+ }
+ }

#[cfg(test)]
mod test {
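The rewritten stamper keeps a single `Stamper` front end backed by `AtomicU64Ersatz`, an `AtomicUsize` on x86_64 and an `RwLock<u64>` elsewhere. Its observable behaviour is simply a monotonically increasing opstamp shared by all clones. A stand-alone replica of the x86_64 flavour, written only to show that behaviour (it is not the crate's actual module layout):

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

// Clones share one counter; every call to `stamp()` hands out the next opstamp.
#[derive(Clone, Default)]
struct Stamper(Arc<AtomicUsize>);

impl Stamper {
    fn new(first_opstamp: u64) -> Stamper {
        Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
    }

    fn stamp(&self) -> u64 {
        self.0.fetch_add(1, Ordering::SeqCst) as u64
    }
}

fn main() {
    let stamper = Stamper::new(0);
    let clone = stamper.clone();
    assert_eq!(stamper.stamp(), 0);
    assert_eq!(clone.stamp(), 1);
    assert_eq!(stamper.stamp(), 2);
}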
264
src/lib.rs
264
src/lib.rs
@@ -1,6 +1,5 @@
|
|||||||
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
|
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
|
||||||
#![cfg_attr(all(feature = "unstable", test), feature(test))]
|
#![cfg_attr(all(feature = "unstable", test), feature(test))]
|
||||||
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
|
|
||||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
|
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
|
||||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||||
#![warn(missing_docs)]
|
#![warn(missing_docs)]
|
||||||
@@ -24,7 +23,8 @@
|
|||||||
//! # use tempdir::TempDir;
|
//! # use tempdir::TempDir;
|
||||||
//! # use tantivy::Index;
|
//! # use tantivy::Index;
|
||||||
//! # use tantivy::schema::*;
|
//! # use tantivy::schema::*;
|
||||||
//! # use tantivy::collector::TopCollector;
|
//! # use tantivy::{Score, DocAddress};
|
||||||
|
//! # use tantivy::collector::TopDocs;
|
||||||
//! # use tantivy::query::QueryParser;
|
//! # use tantivy::query::QueryParser;
|
||||||
//! #
|
//! #
|
||||||
//! # fn main() {
|
//! # fn main() {
|
||||||
@@ -46,7 +46,7 @@
|
|||||||
//! // in a compressed, row-oriented key-value store.
|
//! // in a compressed, row-oriented key-value store.
|
||||||
//! // This store is useful to reconstruct the
|
//! // This store is useful to reconstruct the
|
||||||
//! // documents that were selected during the search phase.
|
//! // documents that were selected during the search phase.
|
||||||
//! let mut schema_builder = SchemaBuilder::default();
|
//! let mut schema_builder = Schema::builder();
|
||||||
//! let title = schema_builder.add_text_field("title", TEXT | STORED);
|
//! let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||||
//! let body = schema_builder.add_text_field("body", TEXT);
|
//! let body = schema_builder.add_text_field("body", TEXT);
|
||||||
//! let schema = schema_builder.build();
|
//! let schema = schema_builder.build();
|
||||||
@@ -86,13 +86,13 @@
|
|||||||
//! // A ticket has been opened regarding this problem.
|
//! // A ticket has been opened regarding this problem.
|
||||||
//! let query = query_parser.parse_query("sea whale")?;
|
//! let query = query_parser.parse_query("sea whale")?;
|
||||||
//!
|
//!
|
||||||
//! let mut top_collector = TopCollector::with_limit(10);
|
//! // Perform search.
|
||||||
//! searcher.search(&*query, &mut top_collector)?;
|
//! // `topdocs` contains the 10 most relevant doc ids, sorted by decreasing scores...
|
||||||
|
//! let top_docs: Vec<(Score, DocAddress)> =
|
||||||
|
//! searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||||
//!
|
//!
|
||||||
//! // Our top collector now contains the 10
|
//! for (_score, doc_address) in top_docs {
|
||||||
//! // most relevant doc ids...
|
//! // Retrieve the actual content of documents given its `doc_address`.
|
||||||
//! let doc_addresses = top_collector.docs();
|
|
||||||
//! for doc_address in doc_addresses {
|
|
||||||
//! let retrieved_doc = searcher.doc(doc_address)?;
|
//! let retrieved_doc = searcher.doc(doc_address)?;
|
||||||
//! println!("{}", schema.to_json(&retrieved_doc));
|
//! println!("{}", schema.to_json(&retrieved_doc));
|
||||||
//! }
|
//! }
|
||||||
@@ -129,11 +129,11 @@ extern crate base64;
 extern crate bit_set;
 extern crate bitpacking;
 extern crate byteorder;
+extern crate scoped_pool;

 extern crate combine;

 extern crate crossbeam;
-extern crate crossbeam_channel;
 extern crate fnv;
 extern crate fst;
 extern crate fst_regex;
@@ -152,8 +152,6 @@ extern crate tempdir;
 extern crate tempfile;
 extern crate uuid;

 #[cfg(test)]
 #[macro_use]
 extern crate matches;
@@ -185,10 +183,7 @@ mod macros;

 pub use error::TantivyError;

-#[deprecated(
-    since = "0.7.0",
-    note = "please use `tantivy::TantivyError` instead"
-)]
+#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
 pub use error::TantivyError as Error;

 extern crate census;
@@ -213,11 +208,12 @@ pub(crate) mod positions;
 pub mod postings;
 pub mod query;
 pub mod schema;
+pub mod space_usage;
 pub mod store;
 pub mod termdict;

 mod snippet;
-pub use self::snippet::SnippetGenerator;
+pub use self::snippet::{Snippet, SnippetGenerator};

 mod docset;
 pub use self::docset::{DocSet, SkipResult};
@@ -300,9 +296,11 @@ mod tests {
 use docset::DocSet;
 use query::BooleanQuery;
 use rand::distributions::Bernoulli;
-use rand::distributions::Range;
-use rand::{Rng, SeedableRng, XorShiftRng};
+use rand::distributions::Uniform;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
 use schema::*;
+use DocAddress;
 use Index;
 use IndexWriter;
 use Postings;
@@ -321,16 +319,15 @@ mod tests {
 }

 pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
-    let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
-    XorShiftRng::from_seed(seed)
-        .sample_iter(&Range::new(0u32, max_value))
+    let seed: [u8; 32] = [1; 32];
+    StdRng::from_seed(seed)
+        .sample_iter(&Uniform::new(0u32, max_value))
         .take(n_elems)
         .collect::<Vec<u32>>()
 }

 pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
-    let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, seed_val];
-    XorShiftRng::from_seed(seed)
+    StdRng::from_seed([seed_val; 32])
         .sample_iter(&Bernoulli::new(ratio))
         .take(n as usize)
         .enumerate()
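These helpers move from `XorShiftRng`/`Range` to `StdRng`/`Uniform` with fixed 32-byte seeds, and probability sampling goes through `Bernoulli` with an `f64` ratio. A standalone sketch of the same seeding and sampling pattern follows; the rand version (0.6-style API) is an assumption and the sketch is not part of the tantivy diff.

```rust
// Standalone sketch of the seeding/sampling pattern used by the new test helpers.
extern crate rand;

use rand::distributions::{Bernoulli, Uniform};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

fn main() {
    // StdRng seeds from a fixed-size [u8; 32] instead of XorShiftRng's 16-byte seed.
    let mut rng = StdRng::from_seed([7u8; 32]);

    // Uniform replaces the removed Range distribution.
    let values: Vec<u32> = rng.sample_iter(&Uniform::new(0u32, 100)).take(5).collect();

    // Bernoulli replaces ad-hoc `next_f32() < ratio` filtering; note the f64 ratio.
    let mut rng2 = StdRng::from_seed([7u8; 32]);
    let coin_flips: Vec<bool> = rng2.sample_iter(&Bernoulli::new(0.3)).take(5).collect();

    println!("{:?} {:?}", values, coin_flips);
}
```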
@@ -345,7 +342,7 @@ mod tests {
 #[test]
 #[cfg(feature = "mmap")]
 fn test_indexing() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_from_tempdir(schema).unwrap();
@@ -370,7 +367,7 @@ mod tests {

 #[test]
 fn test_docfreq1() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let index = Index::create_in_ram(schema_builder.build());
     let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
@@ -410,7 +407,7 @@ mod tests {

 #[test]
 fn test_fieldnorm_no_docs_with_field() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let title_field = schema_builder.add_text_field("title", TEXT);
     let text_field = schema_builder.add_text_field("text", TEXT);
     let index = Index::create_in_ram(schema_builder.build());
@@ -439,7 +436,7 @@ mod tests {

 #[test]
 fn test_fieldnorm() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let index = Index::create_in_ram(schema_builder.build());
     {
@@ -480,7 +477,7 @@ mod tests {

 #[test]
 fn test_delete_postings1() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let term_abcd = Term::from_field_text(text_field, "abcd");
     let term_a = Term::from_field_text(text_field, "a");
@@ -491,42 +488,21 @@ mod tests {
 {
     // writing the segment
     let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-    {
-        // 0
-        let doc = doc!(text_field=>"a b");
-        index_writer.add_document(doc);
-    }
-    {
-        // 1
-        let doc = doc!(text_field=>" a c");
-        index_writer.add_document(doc);
-    }
-    {
-        // 2
-        let doc = doc!(text_field=>" b c");
-        index_writer.add_document(doc);
-    }
-    {
-        // 3
-        let doc = doc!(text_field=>" b d");
-        index_writer.add_document(doc);
-    }
-    {
-        index_writer.delete_term(Term::from_field_text(text_field, "c"));
-    }
-    {
-        index_writer.delete_term(Term::from_field_text(text_field, "a"));
-    }
-    {
-        // 4
-        let doc = doc!(text_field=>" b c");
-        index_writer.add_document(doc);
-    }
-    {
-        // 5
-        let doc = doc!(text_field=>" a");
-        index_writer.add_document(doc);
-    }
+    // 0
+    index_writer.add_document(doc!(text_field=>"a b"));
+    // 1
+    index_writer.add_document(doc!(text_field=>" a c"));
+    // 2
+    index_writer.add_document(doc!(text_field=>" b c"));
+    // 3
+    index_writer.add_document(doc!(text_field=>" b d"));
+
+    index_writer.delete_term(Term::from_field_text(text_field, "c"));
+    index_writer.delete_term(Term::from_field_text(text_field, "a"));
+    // 4
+    index_writer.add_document(doc!(text_field=>" b c"));
+    // 5
+    index_writer.add_document(doc!(text_field=>" a"));
     index_writer.commit().unwrap();
 }
 {
@@ -534,11 +510,9 @@ mod tests {
 let searcher = index.searcher();
 let reader = searcher.segment_reader(0);
 let inverted_index = reader.inverted_index(text_field);
-assert!(
-    inverted_index
-        .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
-        .is_none()
-);
+assert!(inverted_index
+    .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
+    .is_none());
 {
     let mut postings = inverted_index
         .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -561,15 +535,10 @@ mod tests {
 {
     // writing the segment
     let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-    {
-        // 0
-        let doc = doc!(text_field=>"a b");
-        index_writer.add_document(doc);
-    }
-    {
-        // 1
-        index_writer.delete_term(Term::from_field_text(text_field, "c"));
-    }
+    // 0
+    index_writer.add_document(doc!(text_field=>"a b"));
+    // 1
+    index_writer.delete_term(Term::from_field_text(text_field, "c"));
     index_writer.rollback().unwrap();
 }
 {
@@ -578,11 +547,9 @@ mod tests {
 let reader = searcher.segment_reader(0);
 let inverted_index = reader.inverted_index(term_abcd.field());

-assert!(
-    inverted_index
-        .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
-        .is_none()
-);
+assert!(inverted_index
+    .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
+    .is_none());
 {
     let mut postings = inverted_index
         .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -605,13 +572,8 @@ mod tests {
 {
     // writing the segment
     let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-    {
-        let doc = doc!(text_field=>"a b");
-        index_writer.add_document(doc);
-    }
-    {
-        index_writer.delete_term(Term::from_field_text(text_field, "c"));
-    }
+    index_writer.add_document(doc!(text_field=>"a b"));
+    index_writer.delete_term(Term::from_field_text(text_field, "c"));
     index_writer.rollback().unwrap();
     index_writer.delete_term(Term::from_field_text(text_field, "a"));
     index_writer.commit().unwrap();
@@ -621,11 +583,9 @@ mod tests {
 let searcher = index.searcher();
 let reader = searcher.segment_reader(0);
 let inverted_index = reader.inverted_index(term_abcd.field());
-assert!(
-    inverted_index
-        .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
-        .is_none()
-);
+assert!(inverted_index
+    .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
+    .is_none());
 {
     let mut postings = inverted_index
         .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
@@ -655,7 +615,7 @@ mod tests {

 #[test]
 fn test_indexed_u64() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let field = schema_builder.add_u64_field("value", INT_INDEXED);
     let schema = schema_builder.build();

@@ -678,7 +638,7 @@ mod tests {

 #[test]
 fn test_indexed_i64() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
     let schema = schema_builder.build();

@@ -702,7 +662,7 @@ mod tests {

 #[test]
 fn test_indexedfield_not_in_documents() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let absent_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
@@ -718,7 +678,7 @@ mod tests {

 #[test]
 fn test_delete_postings2() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -754,7 +714,7 @@ mod tests {

 #[test]
 fn test_termfreq() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -773,11 +733,9 @@ mod tests {
 let reader = searcher.segment_reader(0);
 let inverted_index = reader.inverted_index(text_field);
 let term_abcd = Term::from_field_text(text_field, "abcd");
-assert!(
-    inverted_index
-        .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
-        .is_none()
-);
+assert!(inverted_index
+    .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
+    .is_none());
 let term_af = Term::from_field_text(text_field, "af");
 let mut postings = inverted_index
     .read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
@@ -791,7 +749,7 @@ mod tests {

 #[test]
 fn test_searcher_1() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -799,18 +757,9 @@ mod tests {
 {
     // writing the segment
     let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-    {
-        let doc = doc!(text_field=>"af af af b");
-        index_writer.add_document(doc);
-    }
-    {
-        let doc = doc!(text_field=>"a b c");
-        index_writer.add_document(doc);
-    }
-    {
-        let doc = doc!(text_field=>"a b c d");
-        index_writer.add_document(doc);
-    }
+    index_writer.add_document(doc!(text_field=>"af af af b"));
+    index_writer.add_document(doc!(text_field=>"a b c"));
+    index_writer.add_document(doc!(text_field=>"a b c d"));
     index_writer.commit().unwrap();
 }
 {
@@ -818,55 +767,42 @@ mod tests {
 let searcher = index.searcher();
 let get_doc_ids = |terms: Vec<Term>| {
     let query = BooleanQuery::new_multiterms_query(terms);
-    let mut collector = TestCollector::default();
-    assert!(searcher.search(&query, &mut collector).is_ok());
-    collector.docs()
+    let topdocs = searcher.search(&query, &TestCollector).unwrap();
+    topdocs.docs().to_vec()
 };
-{
-    assert_eq!(
-        get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
-        vec![1, 2]
-    );
-}
-{
-    assert_eq!(
-        get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
-        vec![0]
-    );
-}
-{
-    assert_eq!(
-        get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
-        vec![0, 1, 2]
-    );
-}
-{
-    assert_eq!(
-        get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
-        vec![1, 2]
-    );
-}
-{
-    assert_eq!(
-        get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
-        vec![2]
-    );
-}
-{
-    assert_eq!(
-        get_doc_ids(vec![
-            Term::from_field_text(text_field, "b"),
-            Term::from_field_text(text_field, "a"),
-        ]),
-        vec![0, 1, 2]
-    );
-}
+assert_eq!(
+    get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
+    vec![DocAddress(0, 1), DocAddress(0, 2)]
+);
+assert_eq!(
+    get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
+    vec![DocAddress(0, 0)]
+);
+assert_eq!(
+    get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
+    vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
+);
+assert_eq!(
+    get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
+    vec![DocAddress(0, 1), DocAddress(0, 2)]
+);
+assert_eq!(
+    get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
+    vec![DocAddress(0, 2)]
+);
+assert_eq!(
+    get_doc_ids(vec![
+        Term::from_field_text(text_field, "b"),
+        Term::from_field_text(text_field, "a"),
+    ]),
+    vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
+);
 }
 }

 #[test]
 fn test_searcher_2() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -893,7 +829,7 @@ mod tests {

 #[test]
 fn test_doc_macro() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let other_text_field = schema_builder.add_text_field("text2", TEXT);
     let document = doc!(text_field => "tantivy",
@@ -911,7 +847,7 @@ mod tests {

 #[test]
 fn test_wrong_fast_field_type() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
     let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
     let text_field = schema_builder.add_text_field("text", TEXT);
@@ -26,12 +26,12 @@
 /// #[macro_use]
 /// extern crate tantivy;
 ///
-/// use tantivy::schema::{SchemaBuilder, TEXT, FAST};
+/// use tantivy::schema::{Schema, TEXT, FAST};
 ///
 /// //...
 ///
 /// # fn main() {
-/// let mut schema_builder = SchemaBuilder::new();
+/// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let author = schema_builder.add_text_field("text", TEXT);
 /// let likes = schema_builder.add_u64_field("num_u64", FAST);
@@ -67,33 +67,33 @@ macro_rules! doc(

 #[cfg(test)]
 mod test {
-    use schema::{SchemaBuilder, FAST, TEXT};
+    use schema::{Schema, FAST, TEXT};

     #[test]
     fn test_doc_basic() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field("title", TEXT);
         let author = schema_builder.add_text_field("text", TEXT);
         let likes = schema_builder.add_u64_field("num_u64", FAST);
         let _schema = schema_builder.build();
         let _doc = doc!(
             title => "Life Aquatic",
             author => "Wes Anderson",
             likes => 4u64
         );
     }

     #[test]
     fn test_doc_trailing_comma() {
-        let mut schema_builder = SchemaBuilder::new();
+        let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field("title", TEXT);
         let author = schema_builder.add_text_field("text", TEXT);
         let likes = schema_builder.add_u64_field("num_u64", FAST);
         let _schema = schema_builder.build();
         let _doc = doc!(
             title => "Life Aquatic",
             author => "Wes Anderson",
             likes => 4u64,
         );
     }
 }
@@ -266,21 +266,17 @@ pub mod tests {
 mod bench {

     use super::*;
-    use rand::Rng;
     use rand::SeedableRng;
-    use rand::XorShiftRng;
+    use rand::{Rng, XorShiftRng};
     use test::Bencher;

-    fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
-        let seed: &[u32; 4] = &[1, 2, 3, seed_val];
+    fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
+        let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
         let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
-        (0..u32::max_value())
-            .filter(|_| rng.next_f32() < ratio)
-            .take(n)
-            .collect()
+        (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
     }

-    pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
+    pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
         generate_array_with_seed(n, ratio, 4)
     }

@@ -297,24 +293,23 @@ mod bench {
     fn bench_uncompress(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
-        let (_, compressed) = encoder.compress_block_sorted(&data, 0u32);
+        let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
         let mut decoder = BlockDecoder::new();
         b.iter(|| {
-            decoder.uncompress_block_sorted(compressed, 0u32);
+            decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
         });
     }

     #[test]
     fn test_all_docs_compression_numbits() {
-        for num_bits in 0..33 {
+        for expected_num_bits in 0u8.. {
             let mut data = [0u32; 128];
-            if num_bits > 0 {
-                data[0] = 1 << (num_bits - 1);
+            if expected_num_bits > 0 {
+                data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
             }
             let mut encoder = BlockEncoder::new();
-            let compressed = encoder.compress_block_unsorted(&data);
-            assert_eq!(compressed[0] as usize, num_bits);
-            assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
+            let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
+            assert_eq!(compressed.len(), compressed_block_size(num_bits));
         }
     }

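The reworked test above ties the length of a compressed block to the `num_bits` value returned by the encoder rather than to a leading header byte embedded in the output. As a rough sanity check of what a helper like `compressed_block_size` expresses, a 128-value block packed at `num_bits` bits per value occupies `128 * num_bits / 8` bytes of payload. The sketch below is plain arithmetic only; the crate's actual helper may add a constant overhead, so treat the exact values as an assumption.

```rust
// Back-of-the-envelope arithmetic for a bit-packed block of 128 integers.
const COMPRESSION_BLOCK_SIZE: usize = 128;

fn packed_block_bytes(num_bits: u8) -> usize {
    // num_bits bits per value, 128 values, 8 bits per byte.
    COMPRESSION_BLOCK_SIZE * num_bits as usize / 8
}

fn main() {
    assert_eq!(packed_block_bytes(0), 0);
    assert_eq!(packed_block_bytes(1), 16);
    assert_eq!(packed_block_bytes(17), 272);
    println!("ok");
}
```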
@@ -54,17 +54,18 @@ pub mod tests {
 use indexer::operation::AddOperation;
 use indexer::SegmentWriter;
 use query::Scorer;
-use rand::{Rng, SeedableRng, XorShiftRng};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
 use schema::Field;
 use schema::IndexRecordOption;
-use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
+use schema::{Document, Schema, Term, INT_INDEXED, STRING, TEXT};
 use std::iter;
 use DocId;
 use Score;

 #[test]
 pub fn test_position_write() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -88,7 +89,7 @@ pub mod tests {

 #[test]
 pub fn test_skip_positions() {
-    let mut schema_builder = SchemaBuilder::new();
+    let mut schema_builder = Schema::builder();
     let title = schema_builder.add_text_field("title", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -163,7 +164,7 @@ pub mod tests {
 #[test]
 pub fn test_position_and_fieldnorm1() {
     let mut positions = Vec::new();
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema.clone());
@@ -220,12 +221,10 @@ pub mod tests {
 }
 {
     let term_a = Term::from_field_text(text_field, "abcdef");
-    assert!(
-        segment_reader
-            .inverted_index(term_a.field())
-            .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
-            .is_none()
-    );
+    assert!(segment_reader
+        .inverted_index(term_a.field())
+        .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
+        .is_none());
 }
 {
     let term_a = Term::from_field_text(text_field, "a");
@@ -276,7 +275,7 @@ pub mod tests {
 #[test]
 pub fn test_position_and_fieldnorm2() {
     let mut positions: Vec<u32> = Vec::new();
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -317,7 +316,7 @@ pub mod tests {
 let num_docs = 300u32;

 let index = {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
     let schema = schema_builder.build();

@@ -498,12 +497,11 @@ pub mod tests {
     Term::from_field_text(field, "d")
 };
 pub static ref INDEX: Index = {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", STRING);
     let schema = schema_builder.build();

-    let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
-    let mut rng: XorShiftRng = XorShiftRng::from_seed(seed);
+    let mut rng: StdRng = StdRng::from_seed([1u8; 32]);

     let index = Index::create_in_ram(schema);
     let posting_list_size = 1_000_000;
@@ -654,7 +652,7 @@ mod bench {
     });
 }

-fn bench_skip_next(p: f32, b: &mut Bencher) {
+fn bench_skip_next(p: f64, b: &mut Bencher) {
     let searcher = INDEX.searcher();
     let segment_reader = searcher.segment_reader(0);
     let docs = tests::sample(segment_reader.num_docs(), p);
@@ -29,7 +29,8 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
     IndexRecordOption::WithFreqsAndPositions => {
         SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
     }
-}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
+})
+.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
 FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
     SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
 }
@@ -107,10 +108,8 @@ impl MultiFieldPostingsWriter {
     .map(|(key, _, _)| Term::wrap(key).field())
     .enumerate();

-let mut unordered_term_mappings: HashMap<
-    Field,
-    HashMap<UnorderedTermId, TermOrdinal>,
-> = HashMap::new();
+let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
+    HashMap::new();

 let mut prev_field = Field(u32::max_value());
 for (offset, field) in term_offsets_it {
@@ -138,7 +137,8 @@ impl MultiFieldPostingsWriter {
     .enumerate()
     .map(|(term_ord, unord_term_id)| {
         (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
-    }).collect();
+    })
+    .collect();
 unordered_term_mappings.insert(field, mapping);
 }
 FieldType::U64(_) | FieldType::I64(_) => {}
@@ -126,7 +126,6 @@ impl SegmentPostings {
 fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
     let mut start = 0;
     let end = arr.len();
-    debug_assert!(target >= arr[start]);
     debug_assert!(target <= arr[end - 1]);
     let mut jump = 1;
     loop {
@@ -216,11 +215,10 @@ impl DocSet for SegmentPostings {

 // we're in the right block now, start with an exponential search
 let block_docs = self.block_cursor.docs();

-debug_assert!(target >= self.doc());
 let new_cur = self
     .cur
     .wrapping_add(search_within_block(&block_docs[self.cur..], target));

 if need_positions {
     sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
         .iter()
@@ -533,7 +531,8 @@ impl BlockSegmentPostings {
 } else {
     BlockSegmentPostingsSkipResult::Terminated
 }
-}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
+})
+.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
 }
 BlockSegmentPostingsSkipResult::Terminated
 }
@@ -621,6 +620,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
 #[cfg(test)]
 mod tests {

+    use super::exponential_search;
     use super::search_within_block;
     use super::BlockSegmentPostings;
     use super::BlockSegmentPostingsSkipResult;
@@ -630,10 +630,11 @@ mod tests {
     use docset::DocSet;
     use fst::Streamer;
     use schema::IndexRecordOption;
-    use schema::SchemaBuilder;
+    use schema::Schema;
     use schema::Term;
     use schema::INT_INDEXED;
     use DocId;
+    use SkipResult;

     #[test]
     fn test_empty_segment_postings() {
@@ -661,6 +662,16 @@ mod tests {
     .0
 }

+#[test]
+fn test_exponentiel_search() {
+    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
+    assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
+    assert_eq!(
+        exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+        (3, 7)
+    );
+}
+
 fn util_test_search_within_block(block: &[u32], target: u32) {
     assert_eq!(
         search_within_block(block, target),
@@ -692,7 +703,7 @@ mod tests {

 #[test]
 fn test_block_segment_postings() {
-    let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
+    let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
     let mut offset: u32 = 0u32;
     // checking that the block before calling advance is empty
     assert!(block_segments.docs().is_empty());
@@ -706,14 +717,44 @@ mod tests {
     }
 }

-fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
-    let mut schema_builder = SchemaBuilder::default();
+#[test]
+fn test_skip_right_at_new_block() {
+    let mut doc_ids = (0..128).collect::<Vec<u32>>();
+    doc_ids.push(129);
+    doc_ids.push(130);
+    {
+        let block_segments = build_block_postings(&doc_ids);
+        let mut docset = SegmentPostings::from_block_postings(block_segments, None);
+        assert_eq!(docset.skip_next(128), SkipResult::OverStep);
+        assert_eq!(docset.doc(), 129);
+        assert!(docset.advance());
+        assert_eq!(docset.doc(), 130);
+        assert!(!docset.advance());
+    }
+    {
+        let block_segments = build_block_postings(&doc_ids);
+        let mut docset = SegmentPostings::from_block_postings(block_segments, None);
+        assert_eq!(docset.skip_next(129), SkipResult::Reached);
+        assert_eq!(docset.doc(), 129);
+        assert!(docset.advance());
+        assert_eq!(docset.doc(), 130);
+        assert!(!docset.advance());
+    }
+    {
+        let block_segments = build_block_postings(&doc_ids);
+        let mut docset = SegmentPostings::from_block_postings(block_segments, None);
+        assert_eq!(docset.skip_next(131), SkipResult::End);
+    }
+}
+
+fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
+    let mut schema_builder = Schema::builder();
     let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
     let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
     let mut last_doc = 0u32;
-    for doc in docs {
+    for &doc in docs {
         for _ in last_doc..doc {
             index_writer.add_document(doc!(int_field=>1u64));
         }
@@ -733,7 +774,7 @@ mod tests {
 #[test]
 fn test_block_segment_postings_skip() {
     for i in 0..4 {
-        let mut block_postings = build_block_postings(vec![3]);
+        let mut block_postings = build_block_postings(&[3]);
         assert_eq!(
             block_postings.skip_to(i),
             BlockSegmentPostingsSkipResult::Success(0u32)
@@ -743,7 +784,7 @@ mod tests {
             BlockSegmentPostingsSkipResult::Terminated
         );
     }
-    let mut block_postings = build_block_postings(vec![3]);
+    let mut block_postings = build_block_postings(&[3]);
     assert_eq!(
         block_postings.skip_to(4u32),
         BlockSegmentPostingsSkipResult::Terminated
@@ -756,7 +797,7 @@ mod tests {
     for i in 0..1300 {
         docs.push((i * i / 100) + i);
     }
-    let mut block_postings = build_block_postings(docs.clone());
+    let mut block_postings = build_block_postings(&docs[..]);
     for i in vec![0, 424, 10000] {
         assert_eq!(
             block_postings.skip_to(i),
@@ -778,7 +819,7 @@ mod tests {

 #[test]
 fn test_reset_block_segment_postings() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
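The new `test_exponentiel_search` cases above pin down the `(start, end)` window returned by `exponential_search`. Below is a standalone sketch of the same galloping strategy, written only to reproduce the asserted pairs; the in-crate implementation may differ in its details, and the follow-up binary search within the returned window is left to the caller.

```rust
// Galloping ("exponential") search sketch: double the jump until the target falls
// inside [start, candidate], then return that window for a subsequent binary search.
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    debug_assert!(target <= arr[end - 1]);
    let mut start = 0;
    let mut jump = 1;
    loop {
        let candidate = start + jump;
        if candidate >= end {
            return (start, end);
        }
        if arr[candidate] >= target {
            return (start, candidate);
        }
        start = candidate;
        jump *= 2;
    }
}

fn main() {
    // Same values as the new test in the hunk above.
    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
    assert_eq!(
        exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        (3, 7)
    );
    println!("ok");
}
```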
@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {

 impl InvertedIndexSerializer {
     /// Open a new `PostingsSerializer` for the given segment
-    fn new(
+    fn create(
         terms_write: CompositeWrite<WritePtr>,
         postings_write: CompositeWrite<WritePtr>,
         positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
     /// Open a new `PostingsSerializer` for the given segment
     pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
         use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
-        InvertedIndexSerializer::new(
+        InvertedIndexSerializer::create(
             CompositeWrite::wrap(segment.open_write(TERMS)?),
             CompositeWrite::wrap(segment.open_write(POSTINGS)?),
             CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
         let positions_write = self.positions_write.for_field(field);
         let positionsidx_write = self.positionsidx_write.for_field(field);
         let field_type: FieldType = (*field_entry.field_type()).clone();
-        FieldSerializer::new(
+        FieldSerializer::create(
             &field_type,
             term_dictionary_write,
             postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
 }

 impl<'a> FieldSerializer<'a> {
-    fn new(
+    fn create(
         field_type: &FieldType,
         term_dictionary_write: &'a mut CountingWriter<WritePtr>,
         postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
     _ => (false, false),
 };
 let term_dictionary_builder =
-    TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
+    TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
 let postings_serializer =
     PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
 let positions_serializer_opt = if position_enabled {
@@ -174,8 +174,8 @@ mod tests {

 #[cfg(all(test, feature = "unstable"))]
 mod bench {
+    use super::super::MemoryArena;
     use super::ExpUnrolledLinkedList;
-    use tantivy_memory_arena::MemoryArena;
     use test::Bencher;

     const NUM_STACK: usize = 10_000;
@@ -199,20 +199,19 @@ mod bench {

 #[bench]
 fn bench_push_stack(bench: &mut Bencher) {
-    let heap = MemoryArena::new();
     bench.iter(|| {
+        let mut heap = MemoryArena::new();
         let mut stacks = Vec::with_capacity(100);
         for _ in 0..NUM_STACK {
-            let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
+            let mut stack = ExpUnrolledLinkedList::new(&mut heap);
             stacks.push(stack);
         }
         for s in 0..NUM_STACK {
             for i in 0u32..STACK_SIZE {
                 let t = s * 392017 % NUM_STACK;
-                stacks[t].push(i, &heap);
+                stacks[t].push(i, &mut heap);
             }
         }
-        heap.clear();
     });
 }
 }
@@ -1,9 +1,7 @@
 mod expull;
 mod memory_arena;
-mod murmurhash2;
 mod term_hashmap;

 pub use self::expull::ExpUnrolledLinkedList;
 pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
-use self::murmurhash2::murmurhash2;
 pub use self::term_hashmap::{compute_table_size, TermHashMap};
@@ -1,87 +0,0 @@
-use std::ptr;
-const SEED: u32 = 3_242_157_231u32;
-const M: u32 = 0x5bd1_e995;
-
-#[inline(always)]
-pub fn murmurhash2(key: &[u8]) -> u32 {
-    #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
-    let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
-    let len = key.len() as u32;
-    let mut h: u32 = SEED ^ len;
-
-    let num_blocks = len >> 2;
-    for _ in 0..num_blocks {
-        let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
-        k = k.wrapping_mul(M);
-        k ^= k >> 24;
-        k = k.wrapping_mul(M);
-        h = h.wrapping_mul(M);
-        h ^= k;
-        key_ptr = key_ptr.wrapping_offset(1);
-    }
-
-    // Handle the last few bytes of the input array
-    let remaining: &[u8] = &key[key.len() & !3..];
-    match remaining.len() {
-        3 => {
-            h ^= u32::from(remaining[2]) << 16;
-            h ^= u32::from(remaining[1]) << 8;
-            h ^= u32::from(remaining[0]);
-            h = h.wrapping_mul(M);
-        }
-        2 => {
-            h ^= u32::from(remaining[1]) << 8;
-            h ^= u32::from(remaining[0]);
-            h = h.wrapping_mul(M);
-        }
-        1 => {
-            h ^= u32::from(remaining[0]);
-            h = h.wrapping_mul(M);
-        }
-        _ => {}
-    }
-    h ^= h >> 13;
-    h = h.wrapping_mul(M);
-    h ^ (h >> 15)
-}
-
-#[cfg(test)]
-mod test {
-
-    use super::murmurhash2;
-    use std::collections::HashSet;
-
-    #[test]
-    fn test_murmur() {
-        let s1 = "abcdef";
-        let s2 = "abcdeg";
-        for i in 0..5 {
-            assert_eq!(
-                murmurhash2(&s1[i..5].as_bytes()),
-                murmurhash2(&s2[i..5].as_bytes())
-            );
-        }
-    }
-
-    #[test]
-    fn test_murmur_against_reference_impl() {
-        assert_eq!(murmurhash2("".as_bytes()), 3632506080);
-        assert_eq!(murmurhash2("a".as_bytes()), 455683869);
-        assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
-        assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
-        assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
-        assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
-        assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
-    }
-
-    #[test]
-    fn test_murmur_collisions() {
-        let mut set: HashSet<u32> = HashSet::default();
-        for i in 0..10_000 {
-            let s = format!("hash{}", i);
-            let hash = murmurhash2(s.as_bytes());
-            set.insert(hash);
-        }
-        assert_eq!(set.len(), 10_000);
-    }
-}
@@ -1,4 +1,7 @@
-use super::murmurhash2;
+extern crate murmurhash32;
+
+use self::murmurhash32::murmurhash2;
+
 use super::{Addr, ArenaStorable, MemoryArena};
 use std::iter;
 use std::mem;
@@ -206,7 +209,7 @@ impl TermHashMap {
         self.resize();
     }
     let key_bytes: &[u8] = key.as_ref();
-    let hash = murmurhash2::murmurhash2(key.as_ref());
+    let hash = murmurhash2(key.as_ref());
     let mut probe = self.probe(hash);
     loop {
         let bucket = probe.next_probe();
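The in-tree `murmurhash2` module deleted above now comes from the external `murmurhash32` crate added to Cargo.toml, and `TermHashMap` calls it directly. A small sketch against that crate follows, reusing reference values from the deleted test; it assumes the crate keeps the same seed and behavior as the removed module, which is what the swap implies.

```rust
// Sketch of hashing keys with the external murmurhash32 crate.
extern crate murmurhash32;

use murmurhash32::murmurhash2;

fn main() {
    // Reference values taken from the deleted in-tree test above.
    assert_eq!(murmurhash2("".as_bytes()), 3632506080);
    assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
    println!("murmurhash2(\"tantivy\") = {}", murmurhash2(b"tantivy"));
}
```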
@@ -86,12 +86,12 @@ mod tests {

 use super::AllQuery;
 use query::Query;
-use schema::{SchemaBuilder, TEXT};
+use schema::{Schema, TEXT};
 use Index;

 #[test]
 fn test_all_query() {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -11,7 +11,7 @@ use Result;
 /// A weight struct for Fuzzy Term and Regex Queries
 pub struct AutomatonWeight<A>
 where
-    A: Automaton,
+    A: Automaton + Send + Sync + 'static,
 {
     field: Field,
     automaton: A,
@@ -19,7 +19,7 @@ where

 impl<A> AutomatonWeight<A>
 where
-    A: Automaton,
+    A: Automaton + Send + Sync + 'static,
 {
     /// Create a new AutomationWeight
     pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
@@ -34,7 +34,7 @@ where

 impl<A> Weight for AutomatonWeight<A>
 where
-    A: Automaton,
+    A: Automaton + Send + Sync + 'static,
 {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let max_doc = reader.max_doc();
@@ -63,7 +63,8 @@ impl BM25Weight {
     .map(|term| {
         let term_doc_freq = searcher.doc_freq(term);
         idf(term_doc_freq, total_num_docs)
-    }).sum::<f32>();
+    })
+    .sum::<f32>();
 BM25Weight::new(idf, average_fieldnorm)
 }

@@ -47,7 +47,8 @@ impl Query for BooleanQuery {
     .iter()
     .map(|&(ref occur, ref subquery)| {
         Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
-    }).collect::<Result<_>>()?;
+    })
+    .collect::<Result<_>>()?;
 Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
 }

@@ -68,7 +69,8 @@ impl BooleanQuery {
     let term_query: Box<Query> =
         Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
     (Occur::Should, term_query)
-    }).collect();
+    })
+    .collect();
 BooleanQuery::from(occur_term_queries)
 }

@@ -19,10 +19,11 @@ mod tests {
 use query::Scorer;
 use query::TermQuery;
 use schema::*;
+use DocId;
 use Index;

 fn aux_test_helper() -> (Index, Field) {
-    let mut schema_builder = SchemaBuilder::default();
+    let mut schema_builder = Schema::builder();
     let text_field = schema_builder.add_text_field("text", TEXT);
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -130,9 +131,13 @@ mod tests {

 let matching_docs = |boolean_query: &Query| {
     let searcher = index.searcher();
-    let mut test_collector = TestCollector::default();
-    searcher.search(boolean_query, &mut test_collector).unwrap();
-    test_collector.docs()
+    let test_docs = searcher.search(boolean_query, &TestCollector).unwrap();
+    test_docs
+        .docs()
+        .iter()
+        .cloned()
+        .map(|doc| doc.1)
+        .collect::<Vec<DocId>>()
 };

 {
@@ -186,9 +191,8 @@ mod tests {

 let score_docs = |boolean_query: &Query| {
     let searcher = index.searcher();
-    let mut test_collector = TestCollector::default();
-    searcher.search(boolean_query, &mut test_collector).unwrap();
-    test_collector.scores()
+    let fruit = searcher.search(boolean_query, &TestCollector).unwrap();
+    fruit.scores().to_vec()
 };

 {
@@ -25,14 +25,14 @@ lazy_static! {
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{SchemaBuilder, TEXT};
+/// use tantivy::schema::{Schema, TEXT};
 /// use tantivy::{Index, Result, Term};
-/// use tantivy::collector::{CountCollector, TopCollector, chain};
+/// use tantivy::collector::{Count, TopDocs};
 /// use tantivy::query::FuzzyTermQuery;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-/// let mut schema_builder = SchemaBuilder::new();
+/// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
@@ -57,16 +57,12 @@ lazy_static! {
 /// let searcher = index.searcher();
 ///
 /// {
-/// let mut top_collector = TopCollector::with_limit(2);
|
///
|
||||||
/// let mut count_collector = CountCollector::default();
|
/// let term = Term::from_field_text(title, "Diary");
|
||||||
/// {
|
/// let query = FuzzyTermQuery::new(term, 1, true);
|
||||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap();
|
||||||
/// let term = Term::from_field_text(title, "Diary");
|
/// assert_eq!(count, 2);
|
||||||
/// let query = FuzzyTermQuery::new(term, 1, true);
|
/// assert_eq!(top_docs.len(), 2);
|
||||||
/// searcher.search(&query, &mut collectors).unwrap();
|
|
||||||
/// }
|
|
||||||
/// assert_eq!(count_collector.count(), 2);
|
|
||||||
/// assert!(top_collector.at_capacity());
|
|
||||||
/// }
|
/// }
|
||||||
///
|
///
|
||||||
/// Ok(())
|
/// Ok(())
|
||||||
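The two hunks above migrate the FuzzyTermQuery doc-test from the chained collectors of 0.7 to the 0.8 collector API, where several collectors are combined by passing them as a tuple and the search call returns a matching tuple of fruits. A minimal sketch of that pattern outside a doc-test, assuming an existing searcher and query; the helper name `top_hits_and_count` is purely illustrative:

```rust
// Sketch only: mirrors the (TopDocs, Count) tuple used in the doc-test above.
use tantivy::collector::{Count, TopDocs};
use tantivy::query::Query;
use tantivy::{Result, Searcher};

fn top_hits_and_count(searcher: &Searcher, query: &Query) -> Result<usize> {
    // A tuple of collectors yields a tuple of fruits:
    // here (Vec<(Score, DocAddress)>, usize).
    let (top_docs, count) = searcher.search(query, &(TopDocs::with_limit(2), Count))?;
    // `top_docs` holds at most two (score, address) pairs.
    assert!(top_docs.len() <= 2);
    Ok(count)
}
```

The same tuple pattern appears again in the TermQuery documentation further down.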
@@ -122,8 +118,8 @@ impl Query for FuzzyTermQuery {
 #[cfg(test)]
 mod test {
 use super::FuzzyTermQuery;
-use collector::TopCollector;
-use schema::SchemaBuilder;
+use collector::TopDocs;
+use schema::Schema;
 use schema::TEXT;
 use tests::assert_nearly_equals;
 use Index;

@@ -131,7 +127,7 @@ mod test {

 #[test]
 pub fn test_fuzzy_term() {
-let mut schema_builder = SchemaBuilder::new();
+let mut schema_builder = Schema::builder();
 let country_field = schema_builder.add_text_field("country", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);

@@ -148,14 +144,14 @@ mod test {
 index.load_searchers().unwrap();
 let searcher = index.searcher();
 {
-let mut collector = TopCollector::with_limit(2);
 let term = Term::from_field_text(country_field, "japon");

 let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
-searcher.search(&fuzzy_query, &mut collector).unwrap();
-let scored_docs = collector.top_docs();
-assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
-let (score, _) = scored_docs[0];
+let top_docs = searcher
+.search(&fuzzy_query, &TopDocs::with_limit(2))
+.unwrap();
+assert_eq!(top_docs.len(), 1, "Expected only 1 document");
+let (score, _) = top_docs[0];
 assert_nearly_equals(1f32, score);
 }
 }

@@ -56,15 +56,15 @@ pub use self::weight::Weight;

 #[cfg(test)]
 mod tests {
-use Index;
-use schema::{SchemaBuilder, TEXT};
 use query::QueryParser;
-use Term;
+use schema::{Schema, TEXT};
 use std::collections::BTreeSet;
+use Index;
+use Term;

 #[test]
 fn test_query_terms() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);

@@ -73,33 +73,48 @@ mod tests {
 let term_b = Term::from_field_text(text_field, "b");
 {
 let mut terms_set: BTreeSet<Term> = BTreeSet::new();
-query_parser.parse_query("a").unwrap().query_terms(&mut terms_set);
+query_parser
+.parse_query("a")
+.unwrap()
+.query_terms(&mut terms_set);
 let terms: Vec<&Term> = terms_set.iter().collect();
 assert_eq!(vec![&term_a], terms);
 }
 {
 let mut terms_set: BTreeSet<Term> = BTreeSet::new();
-query_parser.parse_query("a b").unwrap().query_terms(&mut terms_set);
+query_parser
+.parse_query("a b")
+.unwrap()
+.query_terms(&mut terms_set);
 let terms: Vec<&Term> = terms_set.iter().collect();
 assert_eq!(vec![&term_a, &term_b], terms);
 }
 {
 let mut terms_set: BTreeSet<Term> = BTreeSet::new();
-query_parser.parse_query("\"a b\"").unwrap().query_terms(&mut terms_set);
+query_parser
+.parse_query("\"a b\"")
+.unwrap()
+.query_terms(&mut terms_set);
 let terms: Vec<&Term> = terms_set.iter().collect();
 assert_eq!(vec![&term_a, &term_b], terms);
 }
 {
 let mut terms_set: BTreeSet<Term> = BTreeSet::new();
-query_parser.parse_query("a a a a a").unwrap().query_terms(&mut terms_set);
+query_parser
+.parse_query("a a a a a")
+.unwrap()
+.query_terms(&mut terms_set);
 let terms: Vec<&Term> = terms_set.iter().collect();
 assert_eq!(vec![&term_a], terms);
 }
 {
 let mut terms_set: BTreeSet<Term> = BTreeSet::new();
-query_parser.parse_query("a -b").unwrap().query_terms(&mut terms_set);
+query_parser
+.parse_query("a -b")
+.unwrap()
+.query_terms(&mut terms_set);
 let terms: Vec<&Term> = terms_set.iter().collect();
 assert_eq!(vec![&term_a, &term_b], terms);
 }
 }
 }

@@ -13,11 +13,13 @@ mod tests {
 use collector::tests::TestCollector;
 use core::Index;
 use error::TantivyError;
-use schema::{SchemaBuilder, Term, TEXT};
+use schema::{Schema, Term, TEXT};
 use tests::assert_nearly_equals;
+use DocAddress;
+use DocId;

 fn create_index(texts: &[&'static str]) -> Index {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);

@@ -47,16 +49,19 @@ mod tests {
 index.load_searchers().unwrap();
 let searcher = index.searcher();
 let test_query = |texts: Vec<&str>| {
-let mut test_collector = TestCollector::default();
 let terms: Vec<Term> = texts
 .iter()
 .map(|text| Term::from_field_text(text_field, text))
 .collect();
 let phrase_query = PhraseQuery::new(terms);
-searcher
-.search(&phrase_query, &mut test_collector)
+let test_fruits = searcher
+.search(&phrase_query, &TestCollector)
 .expect("search should succeed");
-test_collector.docs()
+test_fruits
+.docs()
+.iter()
+.map(|docaddr| docaddr.1)
+.collect::<Vec<_>>()
 };
 assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
 assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);

@@ -67,7 +72,7 @@ mod tests {

 #[test]
 pub fn test_phrase_query_no_positions() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 use schema::IndexRecordOption;
 use schema::TextFieldIndexing;
 use schema::TextOptions;

@@ -91,9 +96,9 @@ mod tests {
 Term::from_field_text(text_field, "a"),
 Term::from_field_text(text_field, "b"),
 ]);
-let mut test_collector = TestCollector::default();
 if let TantivyError::SchemaError(ref msg) = searcher
-.search(&phrase_query, &mut test_collector)
+.search(&phrase_query, &TestCollector)
+.map(|_| ())
 .unwrap_err()
 {
 assert_eq!(

@@ -113,16 +118,16 @@ mod tests {
 index.load_searchers().unwrap();
 let searcher = index.searcher();
 let test_query = |texts: Vec<&str>| {
-let mut test_collector = TestCollector::default();
 let terms: Vec<Term> = texts
 .iter()
 .map(|text| Term::from_field_text(text_field, text))
 .collect();
 let phrase_query = PhraseQuery::new(terms);
 searcher
-.search(&phrase_query, &mut test_collector)
-.expect("search should succeed");
-test_collector.scores()
+.search(&phrase_query, &TestCollector)
+.expect("search should succeed")
+.scores()
+.to_vec()
 };
 let scores = test_query(vec!["a", "b"]);
 assert_nearly_equals(scores[0], 0.40618482);

@@ -131,51 +136,39 @@ mod tests {

 #[test] // motivated by #234
 pub fn test_phrase_query_docfreq_order() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 {
 let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-{
-// 0
-let doc = doc!(text_field=>"b");
-index_writer.add_document(doc);
-}
-{
-// 1
-let doc = doc!(text_field=>"a b");
-index_writer.add_document(doc);
-}
-{
-// 2
-let doc = doc!(text_field=>"b a");
-index_writer.add_document(doc);
-}
+index_writer.add_document(doc!(text_field=>"b"));
+index_writer.add_document(doc!(text_field=>"a b"));
+index_writer.add_document(doc!(text_field=>"b a"));
 assert!(index_writer.commit().is_ok());
 }

 index.load_searchers().unwrap();
 let searcher = index.searcher();
 let test_query = |texts: Vec<&str>| {
-let mut test_collector = TestCollector::default();
 let terms: Vec<Term> = texts
 .iter()
 .map(|text| Term::from_field_text(text_field, text))
 .collect();
 let phrase_query = PhraseQuery::new(terms);
 searcher
-.search(&phrase_query, &mut test_collector)
-.expect("search should succeed");
-test_collector.docs()
+.search(&phrase_query, &TestCollector)
+.expect("search should succeed")
+.docs()
+.to_vec()
 };
-assert_eq!(test_query(vec!["a", "b"]), vec![1]);
-assert_eq!(test_query(vec!["b", "a"]), vec![2]);
+assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress(0, 1)]);
+assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress(0, 2)]);
 }

 #[test] // motivated by #234
 pub fn test_phrase_query_non_trivial_offsets() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
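The updated assertions above compare against `DocAddress` values rather than bare doc ids: the first field is the segment ordinal, the second the doc id local to that segment, which is why the surrounding tests project with `.1`. A small sketch of that convention, assuming the tuple fields are accessible the same way outside the test module:

```rust
// Sketch only: mirrors how the tests above read DocAddress fields.
use tantivy::{DocAddress, DocId};

fn segment_local_id(addr: &DocAddress) -> DocId {
    // DocAddress(segment_ordinal, doc_id): `.1` is the id within the segment.
    addr.1
}

fn demo() {
    let addr = DocAddress(0, 1);
    assert_eq!(segment_local_id(&addr), 1);
}
```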
@@ -187,16 +180,18 @@ mod tests {
 index.load_searchers().unwrap();
 let searcher = index.searcher();
 let test_query = |texts: Vec<(usize, &str)>| {
-let mut test_collector = TestCollector::default();
 let terms: Vec<(usize, Term)> = texts
 .iter()
 .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)))
 .collect();
 let phrase_query = PhraseQuery::new_with_offset(terms);
 searcher
-.search(&phrase_query, &mut test_collector)
-.expect("search should succeed");
-test_collector.docs()
+.search(&phrase_query, &TestCollector)
+.expect("search should succeed")
+.docs()
+.iter()
+.map(|doc_address| doc_address.1)
+.collect::<Vec<DocId>>()
 };
 assert_eq!(test_query(vec![(0, "a"), (1, "b")]), vec![0]);
 assert_eq!(test_query(vec![(1, "b"), (0, "a")]), vec![0]);

@@ -134,7 +134,8 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
 .into_iter()
 .map(|(offset, postings)| {
 PostingsWithOffset::new(postings, (max_offset - offset) as u32)
-}).collect::<Vec<_>>();
+})
+.collect::<Vec<_>>();
 PhraseScorer {
 intersection_docset: Intersection::new(postings_with_offsets),
 num_docsets,

@@ -1,11 +1,9 @@
 use super::Weight;
-use collector::Collector;
 use core::searcher::Searcher;
 use downcast;
 use std::collections::BTreeSet;
 use std::fmt;
 use Result;
-use SegmentLocalId;
 use Term;

 /// The `Query` trait defines a set of documents and a scoring method

@@ -63,26 +61,6 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug {
 /// Extract all of the terms associated to the query and insert them in the
 /// term set given in arguments.
 fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}

-/// Search works as follows :
-///
-/// First the weight object associated to the query is created.
-///
-/// Then, the query loops over the segments and for each segment :
-/// - setup the collector and informs it that the segment being processed has changed.
-/// - creates a `Scorer` object associated for this segment
-/// - iterate throw the matched documents and push them to the collector.
-///
-fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<()> {
-let scoring_enabled = collector.requires_scoring();
-let weight = self.weight(searcher, scoring_enabled)?;
-for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
-collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
-let mut scorer = weight.scorer(segment_reader)?;
-scorer.collect(collector, segment_reader.delete_bitset());
-}
-Ok(())
-}
 }

 pub trait QueryClone {

@@ -98,6 +76,26 @@ where
 }
 }

+impl Query for Box<Query> {
+fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
+self.as_ref().weight(searcher, scoring_enabled)
+}
+
+fn count(&self, searcher: &Searcher) -> Result<usize> {
+self.as_ref().count(searcher)
+}
+
+fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
+self.as_ref().query_terms(term_set);
+}
+}
+
+impl QueryClone for Box<Query> {
+fn box_clone(&self) -> Box<Query> {
+self.as_ref().box_clone()
+}
+}
+
 #[allow(missing_docs)]
 mod downcast_impl {
 downcast!(super::Query);
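The `Query::search` convenience method is removed above; searching now goes through `Searcher::search`, and the new `impl Query for Box<Query>` lets the boxed query returned by the query parser be handed to it directly. A minimal sketch under those assumptions; the `count_matches` helper is illustrative only:

```rust
// Sketch only: counting matches for a parsed query with the 0.8 API.
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::{Index, Result};

fn count_matches(index: &Index, query_str: &str) -> Result<usize> {
    let query_parser = QueryParser::for_index(index, vec![]);
    // parse_query returns a Box<Query>; with the blanket impl above the box
    // itself is a Query, so no `&*query` dance is needed any more.
    let query = query_parser.parse_query(query_str).expect("invalid query");
    index.load_searchers()?;
    let searcher = index.searcher();
    searcher.search(&query, &Count)
}
```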
@@ -68,7 +68,8 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
 .into_iter()
 .flat_map(|(occur, child)| {
 trim_ast(child).map(|trimmed_child| (occur, trimmed_child))
-}).collect::<Vec<_>>();
+})
+.collect::<Vec<_>>();
 if trimmed_children.is_empty() {
 None
 } else {

@@ -128,6 +129,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> {
 ///
 /// * all docs query: A plain `*` will match all documents in the index.
 ///
+#[derive(Clone)]
 pub struct QueryParser {
 schema: Schema,
 default_fields: Vec<Field>,

@@ -421,7 +423,8 @@ impl QueryParser {
 lower: self.resolve_bound(field, &lower)?,
 upper: self.resolve_bound(field, &upper)?,
 })))
-}).collect::<Result<Vec<_>, QueryParserError>>()?;
+})
+.collect::<Result<Vec<_>, QueryParserError>>()?;
 let result_ast = if clauses.len() == 1 {
 clauses.pop().unwrap()
 } else {

@@ -484,12 +487,12 @@ mod test {
 use query::Query;
 use schema::Field;
 use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
-use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT};
+use schema::{Schema, Term, INT_INDEXED, STORED, STRING, TEXT};
 use tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager};
 use Index;

 fn make_query_parser() -> QueryParser {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field_indexing = TextFieldIndexing::default()
 .set_tokenizer("en_with_stop_words")
 .set_index_option(IndexRecordOption::WithFreqsAndPositions);

@@ -597,25 +600,19 @@ mod test {
 assert!(query_parser.parse_query("signed:2324").is_ok());
 assert!(query_parser.parse_query("signed:\"22\"").is_ok());
 assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
-assert!(
-query_parser
-.parse_query("signed:\"-9999999999999\"")
-.is_ok()
-);
+assert!(query_parser
+.parse_query("signed:\"-9999999999999\"")
+.is_ok());
 assert!(query_parser.parse_query("signed:\"a\"").is_err());
 assert!(query_parser.parse_query("signed:\"2a\"").is_err());
-assert!(
-query_parser
-.parse_query("signed:\"18446744073709551615\"")
-.is_err()
-);
+assert!(query_parser
+.parse_query("signed:\"18446744073709551615\"")
+.is_err());
 assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
 assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
-assert!(
-query_parser
-.parse_query("unsigned:\"18446744073709551615\"")
-.is_ok()
-);
+assert!(query_parser
+.parse_query("unsigned:\"18446744073709551615\"")
+.is_ok());
 test_parse_query_to_logical_ast_helper(
 "unsigned:2324",
 "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",

@@ -720,7 +717,7 @@ mod test {

 #[test]
 pub fn test_unknown_tokenizer() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field_indexing = TextFieldIndexing::default()
 .set_tokenizer("nonexistingtokenizer")
 .set_index_option(IndexRecordOption::Basic);

@@ -738,7 +735,7 @@ mod test {

 #[test]
 pub fn test_query_parser_no_positions() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field_indexing = TextFieldIndexing::default()
 .set_tokenizer("customtokenizer")
 .set_index_option(IndexRecordOption::Basic);

@@ -40,14 +40,13 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
 /// # #[macro_use]
 /// # extern crate tantivy;
 /// # use tantivy::Index;
-/// # use tantivy::schema::{SchemaBuilder, INT_INDEXED};
-/// # use tantivy::collector::CountCollector;
-/// # use tantivy::query::Query;
+/// # use tantivy::schema::{Schema, INT_INDEXED};
+/// # use tantivy::collector::Count;
 /// # use tantivy::Result;
 /// # use tantivy::query::RangeQuery;
 /// #
 /// # fn run() -> Result<()> {
-/// # let mut schema_builder = SchemaBuilder::new();
+/// # let mut schema_builder = Schema::builder();
 /// # let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
 /// # let schema = schema_builder.build();
 /// #

@@ -67,10 +66,7 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
 ///
 /// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
 ///
-/// let mut count_collector = CountCollector::default();
-/// docs_in_the_sixties.search(&searcher, &mut count_collector)?;
-///
-/// let num_60s_books = count_collector.count();
+/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
 ///
 /// # assert_eq!(num_60s_books, 2285);
 /// # Ok(())
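With the RangeQuery doc-comment above now counting via `searcher.search(&query, &Count)`, the one-liner composes easily. A hedged sketch counting several decades, assuming a u64 `year_field` indexed as in the example and a searcher obtained the same way:

```rust
// Sketch only: per-decade counts with RangeQuery and the Count collector.
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::Field;
use tantivy::{Result, Searcher};

fn books_per_decade(searcher: &Searcher, year_field: Field) -> Result<Vec<(u64, usize)>> {
    let mut counts = Vec::new();
    for decade_start in (1950u64..2010).step_by(10) {
        // Half-open range, e.g. 1950..1960.
        let range = RangeQuery::new_u64(year_field, decade_start..decade_start + 10);
        counts.push((decade_start, searcher.search(&range, &Count)?));
    }
    Ok(counts)
}
```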
@@ -296,9 +292,8 @@ impl Weight for RangeWeight {
 mod tests {

 use super::RangeQuery;
-use collector::CountCollector;
-use query::Query;
-use schema::{Document, Field, SchemaBuilder, INT_INDEXED};
+use collector::Count;
+use schema::{Document, Field, Schema, INT_INDEXED};
 use std::collections::Bound;
 use Index;
 use Result;

@@ -306,7 +301,7 @@ mod tests {
 #[test]
 fn test_range_query_simple() {
 fn run() -> Result<()> {
-let mut schema_builder = SchemaBuilder::new();
+let mut schema_builder = Schema::builder();
 let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
 let schema = schema_builder.build();

@@ -327,9 +322,8 @@ mod tests {
 let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);

 // ... or `1960..=1969` if inclusive range is enabled.
-let mut count_collector = CountCollector::default();
-docs_in_the_sixties.search(&searcher, &mut count_collector)?;
-assert_eq!(count_collector.count(), 2285);
+let count = searcher.search(&docs_in_the_sixties, &Count)?;
+assert_eq!(count, 2285);
 Ok(())
 }

@@ -340,7 +334,7 @@ mod tests {
 fn test_range_query() {
 let int_field: Field;
 let schema = {
-let mut schema_builder = SchemaBuilder::new();
+let mut schema_builder = Schema::builder();
 int_field = schema_builder.add_i64_field("intfield", INT_INDEXED);
 schema_builder.build()
 };

@@ -363,11 +357,8 @@ mod tests {
 }
 index.load_searchers().unwrap();
 let searcher = index.searcher();
-let count_multiples = |range_query: RangeQuery| {
-let mut count_collector = CountCollector::default();
-range_query.search(&searcher, &mut count_collector).unwrap();
-count_collector.count()
-};
+let count_multiples =
+|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();

 assert_eq!(count_multiples(RangeQuery::new_i64(int_field, 10..11)), 9);
 assert_eq!(

@@ -16,14 +14,14 @@ use Searcher;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{SchemaBuilder, TEXT};
+/// use tantivy::schema::{Schema, TEXT};
 /// use tantivy::{Index, Result, Term};
-/// use tantivy::collector::{CountCollector, TopCollector, chain};
+/// use tantivy::collector::Count;
 /// use tantivy::query::RegexQuery;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-/// let mut schema_builder = SchemaBuilder::new();
+/// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);

@@ -47,19 +47,10 @@ use Searcher;
 /// index.load_searchers()?;
 /// let searcher = index.searcher();
 ///
-/// {
-/// let mut top_collector = TopCollector::with_limit(2);
-/// let mut count_collector = CountCollector::default();
-/// {
-/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
-/// let term = Term::from_field_text(title, "Diary");
-/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
-/// searcher.search(&query, &mut collectors).unwrap();
-/// }
-/// assert_eq!(count_collector.count(), 3);
-/// assert!(top_collector.at_capacity());
-/// }
-///
+/// let term = Term::from_field_text(title, "Diary");
+/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
+/// let count = searcher.search(&query, &Count)?;
+/// assert_eq!(count, 3);
 /// Ok(())
 /// }
 /// ```

@@ -95,15 +86,15 @@ impl Query for RegexQuery {
 #[cfg(test)]
 mod test {
 use super::RegexQuery;
-use collector::TopCollector;
-use schema::SchemaBuilder;
+use collector::TopDocs;
+use schema::Schema;
 use schema::TEXT;
 use tests::assert_nearly_equals;
 use Index;

 #[test]
 pub fn test_regex_query() {
-let mut schema_builder = SchemaBuilder::new();
+let mut schema_builder = Schema::builder();
 let country_field = schema_builder.add_text_field("country", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);

@@ -120,20 +111,18 @@ mod test {
 index.load_searchers().unwrap();
 let searcher = index.searcher();
 {
-let mut collector = TopCollector::with_limit(2);
 let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field);
-searcher.search(&regex_query, &mut collector).unwrap();
-let scored_docs = collector.top_docs();
+let scored_docs = searcher
+.search(&regex_query, &TopDocs::with_limit(2))
+.unwrap();
 assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
 let (score, _) = scored_docs[0];
 assert_nearly_equals(1f32, score);
 }
-{
-let mut collector = TopCollector::with_limit(2);
-let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
-searcher.search(&regex_query, &mut collector).unwrap();
-let scored_docs = collector.top_docs();
-assert_eq!(scored_docs.len(), 0, "Expected ZERO document");
-}
+let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
+let top_docs = searcher
+.search(&regex_query, &TopDocs::with_limit(2))
+.unwrap();
+assert!(top_docs.is_empty(), "Expected ZERO document");
 }
 }

@@ -1,8 +1,6 @@
-use collector::Collector;
 use common::BitSet;
 use docset::{DocSet, SkipResult};
 use downcast;
-use fastfield::DeleteBitSet;
 use std::ops::DerefMut;
 use DocId;
 use Score;

@@ -16,20 +14,11 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
 /// This method will perform a bit of computation and is not cached.
 fn score(&mut self) -> Score;

-/// Consumes the complete `DocSet` and
-/// push the scored documents to the collector.
-fn collect(&mut self, collector: &mut Collector, delete_bitset_opt: Option<&DeleteBitSet>) {
-if let Some(delete_bitset) = delete_bitset_opt {
-while self.advance() {
-let doc = self.doc();
-if !delete_bitset.is_deleted(doc) {
-collector.collect(doc, self.score());
-}
-}
-} else {
-while self.advance() {
-collector.collect(self.doc(), self.score());
-}
-}
+/// Iterates through all of the document matched by the DocSet
+/// `DocSet` and push the scored documents to the collector.
+fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) {
+while self.advance() {
+callback(self.doc(), self.score());
 }
 }
 }

@@ -44,9 +33,9 @@ impl Scorer for Box<Scorer> {
 self.deref_mut().score()
 }

-fn collect(&mut self, collector: &mut Collector, delete_bitset: Option<&DeleteBitSet>) {
+fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) {
 let scorer = self.deref_mut();
-scorer.collect(collector, delete_bitset);
+scorer.for_each(callback);
 }
 }
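The collector-aware `Scorer::collect` is replaced above by `for_each`, which simply streams `(doc, score)` pairs into a callback and leaves delete filtering to the caller. A minimal sketch of driving a scorer through that callback; the helper name is illustrative:

```rust
// Sketch only: gathering every match produced by a scorer via for_each.
use tantivy::query::Scorer;
use tantivy::{DocId, Score};

fn collect_matches(scorer: &mut Scorer) -> Vec<(DocId, Score)> {
    let mut hits: Vec<(DocId, Score)> = Vec::new();
    // The scorer advances its DocSet and invokes the callback once per match.
    scorer.for_each(&mut |doc: DocId, score: Score| {
        hits.push((doc, score));
    });
    hits
}
```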
@@ -9,17 +9,17 @@ pub use self::term_weight::TermWeight;
 #[cfg(test)]
 mod tests {

-use collector::TopCollector;
+use collector::TopDocs;
 use docset::DocSet;
 use query::{Query, QueryParser, Scorer, TermQuery};
-use schema::{IndexRecordOption, SchemaBuilder, STRING, TEXT};
+use schema::{IndexRecordOption, Schema, STRING, TEXT};
 use tests::assert_nearly_equals;
 use Index;
 use Term;

 #[test]
 pub fn test_term_query_no_freq() {
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", STRING);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);

@@ -49,7 +49,7 @@ mod tests {

 #[test]
 pub fn test_term_weight() {
-let mut schema_builder = SchemaBuilder::new();
+let mut schema_builder = Schema::builder();
 let left_field = schema_builder.add_text_field("left", TEXT);
 let right_field = schema_builder.add_text_field("right", TEXT);
 let large_field = schema_builder.add_text_field("large", TEXT);

@@ -68,37 +68,35 @@ mod tests {
 index.load_searchers().unwrap();
 let searcher = index.searcher();
 {
-let mut collector = TopCollector::with_limit(2);
 let term = Term::from_field_text(left_field, "left2");
 let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
-searcher.search(&term_query, &mut collector).unwrap();
-let scored_docs = collector.top_docs();
-assert_eq!(scored_docs.len(), 1);
-let (score, _) = scored_docs[0];
+let topdocs = searcher
+.search(&term_query, &TopDocs::with_limit(2))
+.unwrap();
+assert_eq!(topdocs.len(), 1);
+let (score, _) = topdocs[0];
 assert_nearly_equals(0.77802235, score);
 }
 {
-let mut collector = TopCollector::with_limit(2);
 let term = Term::from_field_text(left_field, "left1");
 let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
-searcher.search(&term_query, &mut collector).unwrap();
-let scored_docs = collector.top_docs();
-assert_eq!(scored_docs.len(), 2);
-let (score1, _) = scored_docs[0];
+let top_docs = searcher
+.search(&term_query, &TopDocs::with_limit(2))
+.unwrap();
+assert_eq!(top_docs.len(), 2);
+let (score1, _) = top_docs[0];
 assert_nearly_equals(0.27101856, score1);
-let (score2, _) = scored_docs[1];
+let (score2, _) = top_docs[1];
 assert_nearly_equals(0.13736556, score2);
 }
 {
 let query_parser = QueryParser::for_index(&index, vec![]);
 let query = query_parser.parse_query("left:left2 left:left1").unwrap();
-let mut collector = TopCollector::with_limit(2);
-searcher.search(&*query, &mut collector).unwrap();
-let scored_docs = collector.top_docs();
-assert_eq!(scored_docs.len(), 2);
-let (score1, _) = scored_docs[0];
+let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
+assert_eq!(top_docs.len(), 2);
+let (score1, _) = top_docs[0];
 assert_nearly_equals(0.9153879, score1);
-let (score2, _) = scored_docs[1];
+let (score2, _) = top_docs[1];
 assert_nearly_equals(0.27101856, score2);
 }
 }

@@ -21,14 +21,14 @@ use Term;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{SchemaBuilder, TEXT, IndexRecordOption};
+/// use tantivy::schema::{Schema, TEXT, IndexRecordOption};
 /// use tantivy::{Index, Result, Term};
-/// use tantivy::collector::{CountCollector, TopCollector, chain};
+/// use tantivy::collector::{Count, TopDocs};
 /// use tantivy::query::TermQuery;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-/// let mut schema_builder = SchemaBuilder::new();
+/// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);

@@ -52,20 +52,12 @@ use Term;
 /// index.load_searchers()?;
 /// let searcher = index.searcher();
 ///
-/// {
-/// let mut top_collector = TopCollector::with_limit(2);
-/// let mut count_collector = CountCollector::default();
-/// {
-/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
-/// let query = TermQuery::new(
-/// Term::from_field_text(title, "diary"),
-/// IndexRecordOption::Basic,
-/// );
-/// searcher.search(&query, &mut collectors).unwrap();
-/// }
-/// assert_eq!(count_collector.count(), 2);
-/// assert!(top_collector.at_capacity());
-/// }
+/// let query = TermQuery::new(
+/// Term::from_field_text(title, "diary"),
+/// IndexRecordOption::Basic,
+/// );
+/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap();
+/// assert_eq!(count, 2);
 ///
 /// Ok(())
 /// }

@@ -55,7 +55,8 @@ where
 None
 }
 },
-).collect();
+)
+.collect();
 Union {
 docsets: non_empty_docsets,
 bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]),

@@ -214,10 +215,7 @@ where

 // The target is outside of the buffered horizon.
 // advance all docsets to a doc >= to the target.
-#[cfg_attr(
-feature = "cargo-clippy",
-allow(clippy::clippy::collapsible_if)
-)]
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))]
 unordered_drain_filter(&mut self.docsets, |docset| {
 if docset.doc() < target {
 if docset.skip_next(target) == SkipResult::End {

@@ -6,7 +6,7 @@ use Result;
 /// for a given set of segments.
 ///
 /// See [`Query`](./trait.Query.html).
-pub trait Weight {
+pub trait Weight: Send + Sync + 'static {
 /// Returns the scorer for the given segment.
 /// See [`Query`](./trait.Query.html).
 fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
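`Weight` now requires `Send + Sync + 'static`, which is what lets the multithreaded search introduced in 0.8 share one weight across worker threads. A rough illustration of what the bound permits, not of how tantivy's own executor is implemented:

```rust
// Sketch only: the bound makes a shared Arc<Weight> safe to move into threads.
use std::sync::Arc;
use std::thread;
use tantivy::query::Weight;

fn spawn_worker(weight: Arc<Weight>) -> thread::JoinHandle<()> {
    thread::spawn(move || {
        // A real executor would call weight.scorer(segment_reader) here for
        // the segment assigned to this worker.
        drop(weight);
    })
}
```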
@@ -161,7 +161,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc() {
|
fn test_doc() {
|
||||||
let mut schema_builder = SchemaBuilder::default();
|
let mut schema_builder = Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("title", TEXT);
|
let text_field = schema_builder.add_text_field("title", TEXT);
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
doc.add_text(text_field, "My title");
|
doc.add_text(text_field, "My title");
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ use std::borrow::Cow;
|
|||||||
use std::fmt::{self, Debug, Display, Formatter};
|
use std::fmt::{self, Debug, Display, Formatter};
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
use std::str;
|
use std::str;
|
||||||
|
use std::string::FromUtf8Error;
|
||||||
|
|
||||||
const SLASH_BYTE: u8 = b'/';
|
const SLASH_BYTE: u8 = b'/';
|
||||||
const ESCAPE_BYTE: u8 = b'\\';
|
const ESCAPE_BYTE: u8 = b'\\';
|
||||||
@@ -14,6 +15,10 @@ const ESCAPE_BYTE: u8 = b'\\';
|
|||||||
/// representation of facets.
|
/// representation of facets.
|
||||||
pub const FACET_SEP_BYTE: u8 = 0u8;
|
pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||||
|
|
||||||
|
/// `char` used as a level separation in the binary
|
||||||
|
/// representation of facets. (It is the null codepoint.)
|
||||||
|
pub const FACET_SEP_CHAR: char = '\u{0}';
|
||||||
|
|
||||||
/// A Facet represent a point in a given hierarchy.
|
/// A Facet represent a point in a given hierarchy.
|
||||||
///
|
///
|
||||||
/// They are typically represented similarly to a filepath.
|
/// They are typically represented similarly to a filepath.
|
||||||
@@ -26,18 +31,18 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
|
|||||||
/// its facet. In the example above, `/electronics/tv_and_video/`
|
/// its facet. In the example above, `/electronics/tv_and_video/`
|
||||||
/// and `/electronics`.
|
/// and `/electronics`.
|
||||||
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
|
#[derive(Clone, Eq, Hash, PartialEq, Ord, PartialOrd)]
|
||||||
pub struct Facet(Vec<u8>);
|
pub struct Facet(String);
|
||||||
|
|
||||||
impl Facet {
|
impl Facet {
|
||||||
/// Returns a new instance of the "root facet"
|
/// Returns a new instance of the "root facet"
|
||||||
/// Equivalent to `/`.
|
/// Equivalent to `/`.
|
||||||
pub fn root() -> Facet {
|
pub fn root() -> Facet {
|
||||||
Facet(vec![])
|
Facet("".to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true iff the facet is the root facet `/`.
|
/// Returns true iff the facet is the root facet `/`.
|
||||||
pub fn is_root(&self) -> bool {
|
pub fn is_root(&self) -> bool {
|
||||||
self.encoded_bytes().is_empty()
|
self.encoded_str().is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a binary representation of the facet.
|
/// Returns a binary representation of the facet.
|
||||||
@@ -49,13 +54,19 @@ impl Facet {
|
|||||||
/// This representation has the benefit of making it possible to
|
/// This representation has the benefit of making it possible to
|
||||||
/// express "being a child of a given facet" as a range over
|
/// express "being a child of a given facet" as a range over
|
||||||
/// the term ordinals.
|
/// the term ordinals.
|
||||||
pub fn encoded_bytes(&self) -> &[u8] {
|
pub fn encoded_str(&self) -> &str {
|
||||||
&self.0
|
&self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn from_encoded_string(facet_string: String) -> Facet {
|
||||||
|
Facet(facet_string)
|
||||||
|
}
|
||||||
|
|
||||||
/// Creates a `Facet` from its binary representation.
|
/// Creates a `Facet` from its binary representation.
|
||||||
pub(crate) unsafe fn from_encoded(encoded_bytes: Vec<u8>) -> Facet {
|
pub fn from_encoded(encoded_bytes: Vec<u8>) -> Result<Facet, FromUtf8Error> {
|
||||||
Facet(encoded_bytes)
|
// facet bytes validation. `0u8` is used a separator but that is still legal utf-8
|
||||||
|
//Ok(Facet(String::from_utf8(encoded_bytes)?))
|
||||||
|
String::from_utf8(encoded_bytes).map(Facet)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse a text representation of a facet.
|
/// Parse a text representation of a facet.
|
||||||
@@ -79,36 +90,37 @@ impl Facet {
|
|||||||
Path: IntoIterator,
|
Path: IntoIterator,
|
||||||
Path::Item: ToString,
|
Path::Item: ToString,
|
||||||
{
|
{
|
||||||
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
|
let mut facet_string: String = String::with_capacity(100);
|
||||||
let mut step_it = path.into_iter();
|
let mut step_it = path.into_iter();
|
||||||
if let Some(step) = step_it.next() {
|
if let Some(step) = step_it.next() {
|
||||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
facet_string.push_str(&step.to_string());
|
||||||
}
|
}
|
||||||
for step in step_it {
|
for step in step_it {
|
||||||
facet_bytes.push(FACET_SEP_BYTE);
|
facet_string.push(FACET_SEP_CHAR);
|
||||||
facet_bytes.extend_from_slice(step.to_string().as_bytes());
|
facet_string.push_str(&step.to_string());
|
||||||
}
|
}
|
||||||
Facet(facet_bytes)
|
Facet(facet_string)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Accessor for the inner buffer of the `Facet`.
|
/// Accessor for the inner buffer of the `Facet`.
|
||||||
pub(crate) fn inner_buffer_mut(&mut self) -> &mut Vec<u8> {
|
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
|
||||||
&mut self.0
|
self.0.clear();
|
||||||
|
self.0.push_str(facet_str);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` iff other is a subfacet of `self`.
|
/// Returns `true` iff other is a subfacet of `self`.
|
||||||
pub fn is_prefix_of(&self, other: &Facet) -> bool {
|
pub fn is_prefix_of(&self, other: &Facet) -> bool {
|
||||||
let self_bytes: &[u8] = self.encoded_bytes();
|
let self_str = self.encoded_str();
|
||||||
let other_bytes: &[u8] = other.encoded_bytes();
|
let other_str = other.encoded_str();
|
||||||
self_bytes.len() < other_bytes.len()
|
self_str.len() < other_str.len()
|
||||||
&& other_bytes.starts_with(self_bytes)
|
&& other_str.starts_with(self_str)
|
||||||
&& other_bytes[self_bytes.len()] == 0u8
|
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Borrow<[u8]> for Facet {
|
impl Borrow<str> for Facet {
|
||||||
fn borrow(&self) -> &[u8] {
|
fn borrow(&self) -> &str {
|
||||||
self.encoded_bytes()
|
self.encoded_str()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,45 +132,51 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
|||||||
Idle,
|
Idle,
|
||||||
}
|
}
|
||||||
let path: &str = path_asref.as_ref();
|
let path: &str = path_asref.as_ref();
|
||||||
let mut facet_encoded = Vec::new();
|
assert!(!path.is_empty());
|
||||||
|
assert!(path.starts_with("/"));
|
||||||
|
let mut facet_encoded = String::new();
|
||||||
let mut state = State::Idle;
|
let mut state = State::Idle;
|
||||||
let path_bytes = path.as_bytes();
|
let path_bytes = path.as_bytes();
|
||||||
for &c in &path_bytes[1..] {
|
let mut last_offset = 1;
|
||||||
|
for i in 1..path_bytes.len() {
|
||||||
|
let c = path_bytes[i];
|
||||||
match (state, c) {
|
match (state, c) {
|
||||||
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
|
(State::Idle, ESCAPE_BYTE) => {
|
||||||
|
facet_encoded.push_str(&path[last_offset..i]);
|
||||||
|
last_offset = i + 1;
|
||||||
|
state = State::Escaped
|
||||||
|
}
|
||||||
(State::Idle, SLASH_BYTE) => {
|
(State::Idle, SLASH_BYTE) => {
|
||||||
facet_encoded.push(FACET_SEP_BYTE);
|
facet_encoded.push_str(&path[last_offset..i]);
|
||||||
|
facet_encoded.push(FACET_SEP_CHAR);
|
||||||
|
last_offset = i + 1;
|
||||||
}
|
}
|
||||||
(State::Escaped, any_char) => {
|
(State::Escaped, _escaped_char) => {
|
||||||
state = State::Idle;
|
state = State::Idle;
|
||||||
facet_encoded.push(any_char);
|
|
||||||
}
|
|
||||||
(State::Idle, other_char) => {
|
|
||||||
facet_encoded.push(other_char);
|
|
||||||
}
|
}
|
||||||
|
(State::Idle, _any_char) => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
facet_encoded.push_str(&path[last_offset..]);
|
||||||
Facet(facet_encoded)
|
Facet(facet_encoded)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BinarySerializable for Facet {
|
impl BinarySerializable for Facet {
|
||||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||||
<Vec<u8> as BinarySerializable>::serialize(&self.0, writer)
|
<String as BinarySerializable>::serialize(&self.0, writer)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
let bytes = <Vec<u8> as BinarySerializable>::deserialize(reader)?;
|
Ok(Facet(<String as BinarySerializable>::deserialize(reader)?))
|
||||||
Ok(Facet(bytes))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Display for Facet {
|
impl Display for Facet {
|
||||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||||
for step in self.0.split(|&b| b == FACET_SEP_BYTE) {
|
for step in self.0.split(FACET_SEP_CHAR) {
|
||||||
write!(f, "/")?;
|
write!(f, "/")?;
|
||||||
let step_str = unsafe { str::from_utf8_unchecked(step) };
|
write!(f, "{}", escape_slashes(step))?;
|
||||||
write!(f, "{}", escape_slashes(step_str))?;
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
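With this change a `Facet` wraps a `String` instead of raw bytes. A minimal sketch of the round trip, based only on the `From<&str>`, `encoded_str` and `is_prefix_of` code above (the exact escaping rules live in the source):

use tantivy::schema::Facet;

// The From<&str> impl drops the leading '/' and turns each '/' separator
// into a 0u8 byte inside the backing String (still valid UTF-8).
let parent = Facet::from("/electronics");
let child = Facet::from("/electronics/phones");
assert_eq!(child.encoded_str(), "electronics\u{0}phones");
// is_prefix_of checks that the parent encoding is followed by the separator byte.
assert!(parent.is_prefix_of(&child));
// Display re-inserts the slashes, so this prints "/electronics/phones".
println!("{}", child);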
@@ -14,7 +14,7 @@ use std::fmt;
/// - a field name
/// - a field type, itself wrapping up options describing
/// how the field should be indexed.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FieldEntry {
    name: String,
    field_type: FieldType,

@@ -27,7 +27,7 @@ directory.

```
use tantivy::schema::*;
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
let title_options = TextOptions::default()
    .set_stored()
    .set_indexing_options(TextFieldIndexing::default()

@@ -44,11 +44,11 @@ We can split the problem of generating a search result page into two phases :
the search results page. (`doc_ids[] -> Document[]`)

In the first phase, the ability to search for documents by the given field is determined by the
-[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`]
-(struct.TextOptions.html).
+[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our
+[`TextOptions`](struct.TextOptions.html).

-The effect of each possible setting is described more in detail [`TextIndexingOptions`]
-(enum.TextIndexingOptions.html).
+The effect of each possible setting is described more in detail
+[`TextIndexingOptions`](enum.TextIndexingOptions.html).

On the other hand setting the field as stored or not determines whether the field should be returned
when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called.

@@ -62,7 +62,7 @@ The example can be rewritten :

```
use tantivy::schema::*;
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title_options", TEXT | STORED);
let schema = schema_builder.build();
```

@@ -75,7 +75,7 @@ let schema = schema_builder.build();

```
use tantivy::schema::*;
-let mut schema_builder = SchemaBuilder::default();
+let mut schema_builder = Schema::builder();
let num_stars_options = IntOptions::default()
    .set_stored()
    .set_indexed();
@@ -23,13 +23,14 @@ use std::fmt;
/// ```
/// use tantivy::schema::*;
///
-/// let mut schema_builder = SchemaBuilder::default();
+/// let mut schema_builder = Schema::builder();
/// let id_field = schema_builder.add_text_field("id", STRING);
/// let title_field = schema_builder.add_text_field("title", TEXT);
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
///
/// ```
+#[derive(Default)]
pub struct SchemaBuilder {
    fields: Vec<FieldEntry>,
    fields_map: HashMap<String, Field>,

@@ -120,20 +121,19 @@ impl SchemaBuilder {
    }
}

-impl Default for SchemaBuilder {
-    fn default() -> SchemaBuilder {
-        SchemaBuilder {
-            fields: Vec::new(),
-            fields_map: HashMap::new(),
-        }
-    }
-}
-
struct InnerSchema {
    fields: Vec<FieldEntry>,
    fields_map: HashMap<String, Field>, // transient
}

+impl PartialEq for InnerSchema {
+    fn eq(&self, other: &InnerSchema) -> bool {
+        self.fields == other.fields
+    }
+}
+
+impl Eq for InnerSchema {}
+
/// Tantivy has a very strict schema.
/// You need to specify in advance, whether a field is indexed or not,
/// stored or not, and RAM-based or not.

@@ -147,14 +147,14 @@ struct InnerSchema {
/// ```
/// use tantivy::schema::*;
///
-/// let mut schema_builder = SchemaBuilder::default();
+/// let mut schema_builder = Schema::builder();
/// let id_field = schema_builder.add_text_field("id", STRING);
/// let title_field = schema_builder.add_text_field("title", TEXT);
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
///
/// ```
-#[derive(Clone)]
+#[derive(Clone, Eq, PartialEq)]
pub struct Schema(Arc<InnerSchema>);

impl Schema {

@@ -173,6 +173,11 @@ impl Schema {
        &self.0.fields
    }

+    /// Creates a new builder.
+    pub fn builder() -> SchemaBuilder {
+        SchemaBuilder::default()
+    }
+
    /// Returns the field options associated with a given name.
    ///
    /// # Panics

@@ -227,12 +232,14 @@ impl Schema {
        let field_entry = self.get_field_entry(field);
        let field_type = field_entry.field_type();
        match *json_value {
-            JsonValue::Array(ref json_items) => for json_item in json_items {
-                let value = field_type
-                    .value_from_json(json_item)
-                    .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
-                doc.add(FieldValue::new(field, value));
-            },
+            JsonValue::Array(ref json_items) => {
+                for json_item in json_items {
+                    let value = field_type.value_from_json(json_item).map_err(|e| {
+                        DocParsingError::ValueError(field_name.clone(), e)
+                    })?;
+                    doc.add(FieldValue::new(field, value));
+                }
+            }
            _ => {
                let value = field_type
                    .value_from_json(json_value)

@@ -318,7 +325,7 @@ mod tests {

    #[test]
    pub fn is_indexed_test() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        let field_str = schema_builder.add_text_field("field_str", STRING);
        let schema = schema_builder.build();
        assert!(schema.get_field_entry(field_str).is_indexed());

@@ -326,7 +333,7 @@ mod tests {

    #[test]
    pub fn test_schema_serialization() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        let count_options = IntOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);

@@ -395,7 +402,7 @@ mod tests {

    #[test]
    pub fn test_document_to_json() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        let count_options = IntOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);

@@ -416,7 +423,7 @@ mod tests {

    #[test]
    pub fn test_parse_document() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        let count_options = IntOptions::default()
            .set_stored()
            .set_fast(Cardinality::SingleValue);

@@ -441,7 +448,8 @@ mod tests {
            "count": 4,
            "popularity": 10
        }"#,
-        ).unwrap();
+        )
+        .unwrap();
        assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title"));
        assert_eq!(
            doc.get_first(author_field).unwrap().text(),
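The release replaces `SchemaBuilder::default()` with the new `Schema::builder()` constructor throughout the examples and tests. The new idiom, using only calls shown in the diff above:

use tantivy::schema::{Schema, STORED, TEXT};

// Schema::builder() is a thin wrapper around SchemaBuilder::default(), as added above.
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
assert!(schema.get_field("title").is_some());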
@@ -32,7 +32,7 @@ impl Term {

    /// Creates a `Term` given a facet.
    pub fn from_facet(field: Field, facet: &Facet) -> Term {
-        let bytes = facet.encoded_bytes();
+        let bytes = facet.encoded_str().as_bytes();
        let buffer = Vec::with_capacity(4 + bytes.len());
        let mut term = Term(buffer);
        term.set_field(field);

@@ -68,12 +68,7 @@ impl Term {
        term
    }

-    /// Creates a new Term with an empty buffer,
-    /// but with a given capacity.
-    ///
-    /// It is declared unsafe, as the term content
-    /// is not initialized, and a call to `.field()`
-    /// would panic.
+    /// Creates a new Term for a given field.
    pub(crate) fn for_field(field: Field) -> Term {
        let mut term = Term(Vec::with_capacity(100));
        term.set_field(field);

@@ -201,7 +196,7 @@ mod tests {

    #[test]
    pub fn test_term() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("text", STRING);
        let title_field = schema_builder.add_text_field("title", STRING);
        let count_field = schema_builder.add_text_field("count", STRING);

@@ -141,7 +141,7 @@ mod tests {
        assert!(field_options.get_indexing_options().is_some());
    }
    {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("body", TEXT);
        let schema = schema_builder.build();
        let field = schema.get_field("body").unwrap();
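A rough sketch of `Term::from_facet` with the new string-backed encoding. The facet field and `add_facet_field` are assumptions for illustration only; they are not part of the diff above.

use tantivy::schema::{Facet, Schema};
use tantivy::Term;

// Hypothetical facet field; add_facet_field is assumed from the schema API.
let mut schema_builder = Schema::builder();
let category = schema_builder.add_facet_field("category");
let _schema = schema_builder.build();

let facet = Facet::from("/electronics/phones");
// from_facet now copies facet.encoded_str().as_bytes() into the term buffer.
let _term = Term::from_facet(category, &facet);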
@@ -22,6 +22,11 @@ impl HighlightSection {
    fn new(start: usize, stop: usize) -> HighlightSection {
        HighlightSection { start, stop }
    }
+
+    /// Returns the bounds of the `HighlightSection`.
+    pub fn bounds(&self) -> (usize, usize) {
+        (self.start, self.stop)
+    }
}

#[derive(Debug)]

@@ -65,6 +70,8 @@ impl FragmentCandidate {
    }
}

+/// `Snippet`
+/// Contains a fragment of a document, and some highlighed parts inside it.
#[derive(Debug)]
pub struct Snippet {
    fragments: String,

@@ -75,6 +82,7 @@ const HIGHLIGHTEN_PREFIX: &str = "<b>";
const HIGHLIGHTEN_POSTFIX: &str = "</b>";

impl Snippet {
+    /// Create a new, empty, `Snippet`
    pub fn empty() -> Snippet {
        Snippet {
            fragments: String::new(),

@@ -99,6 +107,16 @@ impl Snippet {
        ));
        html
    }
+
+    /// Returns a fragment from the `Snippet`.
+    pub fn fragments(&self) -> &str {
+        &self.fragments
+    }
+
+    /// Returns a list of higlighted positions from the `Snippet`.
+    pub fn highlighted(&self) -> &[HighlightSection] {
+        &self.highlighted
+    }
}

/// Returns a non-empty list of "good" fragments.

@@ -174,7 +192,8 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
                item.start - fragment.start_offset,
                item.stop - fragment.start_offset,
            )
-        }).collect();
+        })
+        .collect();
    Snippet {
        fragments: fragment_text.to_string(),
        highlighted,

@@ -197,12 +216,12 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// # #[macro_use]
/// # extern crate tantivy;
/// # use tantivy::Index;
-/// # use tantivy::schema::{SchemaBuilder, TEXT};
+/// # use tantivy::schema::{Schema, TEXT};
/// # use tantivy::query::QueryParser;
/// use tantivy::SnippetGenerator;
///
/// # fn main() -> tantivy::Result<()> {
-/// # let mut schema_builder = SchemaBuilder::default();
+/// # let mut schema_builder = Schema::builder();
/// # let text_field = schema_builder.add_text_field("text", TEXT);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);

@@ -224,7 +243,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
/// let query = query_parser.parse_query("haleurs flamands").unwrap();
/// # index.load_searchers()?;
/// # let searcher = index.searcher();
-/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?;
+/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
/// let snippet = snippet_generator.snippet_from_doc(&doc);
/// let snippet_html: String = snippet.to_html();

@@ -241,7 +260,7 @@ pub struct SnippetGenerator {

impl SnippetGenerator {
    /// Creates a new snippet generator
-    pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
+    pub fn create(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> {
        let mut terms = BTreeSet::new();
        query.query_terms(&mut terms);
        let terms_text: BTreeMap<String, f32> = terms

@@ -306,7 +325,7 @@ impl SnippetGenerator {
mod tests {
    use super::{search_fragments, select_best_fragment_combination};
    use query::QueryParser;
-    use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions, TEXT};
+    use schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
    use std::collections::BTreeMap;
    use std::iter::Iterator;
    use tokenizer::{box_tokenizer, SimpleTokenizer};

@@ -328,8 +347,6 @@ to the project are from community members.[15]
Rust won first place for "most loved programming language" in the Stack Overflow Developer
Survey in 2016, 2017, and 2018."#;

    #[test]
    fn test_snippet() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

@@ -345,13 +362,18 @@ Survey in 2016, 2017, and 2018."#;
            assert_eq!(first.stop_offset, 89);
        }
        let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
-        assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by \
-                   Mozilla which\ndescribes it as a \"safe");
-        assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems programming <b>language</b> \
-                   sponsored by Mozilla which\ndescribes it as a &quot;safe")
+        assert_eq!(
+            snippet.fragments,
+            "Rust is a systems programming language sponsored by \
+             Mozilla which\ndescribes it as a \"safe"
+        );
+        assert_eq!(
+            snippet.to_html(),
+            "<b>Rust</b> is a systems programming <b>language</b> \
+             sponsored by Mozilla which\ndescribes it as a &quot;safe"
+        )
    }

    #[test]
    fn test_snippet_scored_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

@@ -385,10 +407,8 @@ Survey in 2016, 2017, and 2018."#;
            let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
            assert_eq!(snippet.to_html(), "programming <b>language</b>")
        }
    }

    #[test]
    fn test_snippet_in_second_fragment() {
        let boxed_tokenizer = box_tokenizer(SimpleTokenizer);

@@ -495,10 +515,9 @@ Survey in 2016, 2017, and 2018."#;
        assert_eq!(snippet.to_html(), "");
    }

    #[test]
    fn test_snippet_generator_term_score() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

@@ -515,29 +534,42 @@ Survey in 2016, 2017, and 2018."#;
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        {
            let query = query_parser.parse_query("e").unwrap();
-            let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
+            let snippet_generator =
+                SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
            assert!(snippet_generator.terms_text().is_empty());
        }
        {
            let query = query_parser.parse_query("a").unwrap();
-            let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
-            assert_eq!(&btreemap!("a".to_string() => 0.25f32), snippet_generator.terms_text());
+            let snippet_generator =
+                SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
+            assert_eq!(
+                &btreemap!("a".to_string() => 0.25f32),
+                snippet_generator.terms_text()
+            );
        }
        {
            let query = query_parser.parse_query("a b").unwrap();
-            let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
-            assert_eq!(&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5), snippet_generator.terms_text());
+            let snippet_generator =
+                SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
+            assert_eq!(
+                &btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
+                snippet_generator.terms_text()
+            );
        }
        {
            let query = query_parser.parse_query("a b c").unwrap();
-            let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
-            assert_eq!(&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5), snippet_generator.terms_text());
+            let snippet_generator =
+                SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
+            assert_eq!(
+                &btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5),
+                snippet_generator.terms_text()
+            );
        }
    }

    #[test]
    fn test_snippet_generator() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
        let text_options = TextOptions::default().set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("en_stem")

@@ -559,7 +591,8 @@ Survey in 2016, 2017, and 2018."#;
        let searcher = index.searcher();
        let query_parser = QueryParser::for_index(&index, vec![text_field]);
        let query = query_parser.parse_query("rust design").unwrap();
-        let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap();
+        let mut snippet_generator =
+            SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
        {
            let snippet = snippet_generator.snippet(TEST_TEXT);
            assert_eq!(snippet.to_html(), "imperative-procedural paradigms. <b>Rust</b> is syntactically similar to C++[according to whom?],\nbut its <b>designers</b> intend it to provide better memory safety");
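The accessors added above (`bounds`, `fragments`, `highlighted`) allow rendering a snippet without going through `to_html`. A minimal sketch, assuming `Snippet` is re-exported at the crate root alongside `SnippetGenerator` and that the snippet was produced by `snippet_generator.snippet_from_doc(&doc)` as in the doc-comment example:

use tantivy::Snippet;

fn print_highlights(snippet: &Snippet) {
    let text = snippet.fragments();
    for section in snippet.highlighted() {
        // Each HighlightSection is a byte range into the fragment text.
        let (start, stop) = section.bounds();
        println!("match: {}", &text[start..stop]);
    }
}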
src/space_usage/mod.rs (new file, 497 lines)
@@ -0,0 +1,497 @@
/*!
Representations for the space usage of various parts of a Tantivy index.

This can be used programmatically, and will also be exposed in a human readable fashion in
tantivy-cli.

One important caveat for all of this functionality is that none of it currently takes storage-level
details into consideration. For example, if your file system block size is 4096 bytes, we can
under-count actual resultant space usage by up to 4095 bytes per file.
*/

use schema::Field;
use std::collections::HashMap;
use SegmentComponent;

/// Indicates space usage in bytes
pub type ByteCount = usize;

/// Enum containing any of the possible space usage results for segment components.
pub enum ComponentSpaceUsage {
    /// Data is stored per field in a uniform way
    PerField(PerFieldSpaceUsage),
    /// Data is stored in separate pieces in the store
    Store(StoreSpaceUsage),
    /// Some sort of raw byte count
    Basic(ByteCount),
}

/// Represents combined space usage of an entire searcher and its component segments.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
    segments: Vec<SegmentSpaceUsage>,
    total: ByteCount,
}

impl SearcherSpaceUsage {
    pub(crate) fn new() -> SearcherSpaceUsage {
        SearcherSpaceUsage {
            segments: Vec::new(),
            total: 0,
        }
    }

    /// Add a segment, to `self`.
    /// Performs no deduplication or other intelligence.
    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
        self.total += segment.total();
        self.segments.push(segment);
    }

    /// Per segment space usage
    pub fn segments(&self) -> &[SegmentSpaceUsage] {
        &self.segments[..]
    }

    /// Returns total byte usage of this searcher, including all large subcomponents.
    /// Does not account for smaller things like `meta.json`.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents combined space usage for all of the large components comprising a segment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
    num_docs: u32,

    termdict: PerFieldSpaceUsage,
    postings: PerFieldSpaceUsage,
    positions: PerFieldSpaceUsage,
    positions_idx: PerFieldSpaceUsage,
    fast_fields: PerFieldSpaceUsage,
    fieldnorms: PerFieldSpaceUsage,

    store: StoreSpaceUsage,

    deletes: ByteCount,

    total: ByteCount,
}

impl SegmentSpaceUsage {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        num_docs: u32,
        termdict: PerFieldSpaceUsage,
        postings: PerFieldSpaceUsage,
        positions: PerFieldSpaceUsage,
        positions_idx: PerFieldSpaceUsage,
        fast_fields: PerFieldSpaceUsage,
        fieldnorms: PerFieldSpaceUsage,
        store: StoreSpaceUsage,
        deletes: ByteCount,
    ) -> SegmentSpaceUsage {
        let total = termdict.total()
            + postings.total()
            + positions.total()
            + fast_fields.total()
            + fieldnorms.total()
            + store.total()
            + deletes;
        SegmentSpaceUsage {
            num_docs,
            termdict,
            postings,
            positions,
            positions_idx,
            fast_fields,
            fieldnorms,
            store,
            deletes,
            total,
        }
    }

    /// Space usage for the given component
    ///
    /// Clones the underlying data.
    /// Use the components directly if this is somehow in performance critical code.
    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
        use self::ComponentSpaceUsage::*;
        use SegmentComponent::*;
        match component {
            POSTINGS => PerField(self.postings().clone()),
            POSITIONS => PerField(self.positions().clone()),
            POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
            FASTFIELDS => PerField(self.fast_fields().clone()),
            FIELDNORMS => PerField(self.fieldnorms().clone()),
            TERMS => PerField(self.termdict().clone()),
            STORE => Store(self.store().clone()),
            DELETE => Basic(self.deletes()),
        }
    }

    /// Num docs in segment
    pub fn num_docs(&self) -> u32 {
        self.num_docs
    }

    /// Space usage for term dictionary
    pub fn termdict(&self) -> &PerFieldSpaceUsage {
        &self.termdict
    }

    /// Space usage for postings list
    pub fn postings(&self) -> &PerFieldSpaceUsage {
        &self.postings
    }

    /// Space usage for positions
    pub fn positions(&self) -> &PerFieldSpaceUsage {
        &self.positions
    }

    /// Space usage for positions skip idx
    pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
        &self.positions_idx
    }

    /// Space usage for fast fields
    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
        &self.fast_fields
    }

    /// Space usage for field norms
    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
        &self.fieldnorms
    }

    /// Space usage for stored documents
    pub fn store(&self) -> &StoreSpaceUsage {
        &self.store
    }

    /// Space usage for document deletions
    pub fn deletes(&self) -> ByteCount {
        self.deletes
    }

    /// Total space usage in bytes for this segment.
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents space usage for the Store for this segment.
///
/// This is composed of two parts.
/// `data` represents the compressed data itself.
/// `offsets` represents a lookup to find the start of a block
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
    data: ByteCount,
    offsets: ByteCount,
}

impl StoreSpaceUsage {
    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
        StoreSpaceUsage { data, offsets }
    }

    /// Space usage for the data part of the store
    pub fn data_usage(&self) -> ByteCount {
        self.data
    }

    /// Space usage for the offsets part of the store (doc ID -> offset)
    pub fn offsets_usage(&self) -> ByteCount {
        self.offsets
    }

    /// Total space usage in bytes for this Store
    pub fn total(&self) -> ByteCount {
        self.data + self.offsets
    }
}

/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
///
/// A field can appear with a single index (typically 0) or with multiple indexes.
/// Multiple indexes are used to handle variable length things, where
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
    fields: HashMap<Field, FieldUsage>,
    total: ByteCount,
}

impl PerFieldSpaceUsage {
    pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
        let total = fields.values().map(|x| x.total()).sum();
        PerFieldSpaceUsage { fields, total }
    }

    /// Per field space usage
    pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
        self.fields.iter()
    }

    /// Bytes used by the represented file
    pub fn total(&self) -> ByteCount {
        self.total
    }
}

/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
/// comprise it.
///
/// See documentation for PerFieldSpaceUsage for slightly more information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
    field: Field,
    num_bytes: ByteCount,
    /// A field can be composed of more than one piece.
    /// These pieces are indexed by arbitrary numbers starting at zero.
    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
    sub_num_bytes: Vec<Option<ByteCount>>,
}

impl FieldUsage {
    pub(crate) fn empty(field: Field) -> FieldUsage {
        FieldUsage {
            field,
            num_bytes: 0,
            sub_num_bytes: Vec::new(),
        }
    }

    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
        if self.sub_num_bytes.len() < idx + 1 {
            self.sub_num_bytes.resize(idx + 1, None);
        }
        assert!(self.sub_num_bytes[idx].is_none());
        self.sub_num_bytes[idx] = Some(size);
        self.num_bytes += size
    }

    /// Field
    pub fn field(&self) -> Field {
        self.field
    }

    /// Space usage for each index
    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
        &self.sub_num_bytes[..]
    }

    /// Total bytes used for this field in this context
    pub fn total(&self) -> ByteCount {
        self.num_bytes
    }
}

#[cfg(test)]
mod test {
    use core::Index;
    use schema::Field;
    use schema::Schema;
    use schema::STORED;
    use schema::{FAST, INT_INDEXED, TEXT};
    use space_usage::ByteCount;
    use space_usage::PerFieldSpaceUsage;
    use Term;

    #[test]
    fn test_empty() {
        let schema = Schema::builder().build();
        let index = Index::create_in_ram(schema.clone());

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert_eq!(0, searcher_space_usage.total());
    }

    fn expect_single_field(
        field_space: &PerFieldSpaceUsage,
        field: &Field,
        min_size: ByteCount,
        max_size: ByteCount,
    ) {
        assert!(field_space.total() >= min_size);
        assert!(field_space.total() <= max_size);
        assert_eq!(
            vec![(field, field_space.total())],
            field_space
                .fields()
                .map(|(x, y)| (x, y.total()))
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_fast_indexed() {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => 1u64));
            index_writer.add_document(doc!(name => 2u64));
            index_writer.add_document(doc!(name => 10u64));
            index_writer.add_document(doc!(name => 20u64));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        expect_single_field(segment.fast_fields(), &name, 1, 512);
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_text() {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_text_field("name", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => "hi"));
            index_writer.add_document(doc!(name => "this is a test"));
            index_writer.add_document(
                doc!(name => "some more documents with some word overlap with the other test"),
            );
            index_writer.add_document(doc!(name => "hello hi goodbye"));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        expect_single_field(segment.positions(), &name, 1, 512);
        expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_store() {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_text_field("name", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => "hi"));
            index_writer.add_document(doc!(name => "this is a test"));
            index_writer.add_document(
                doc!(name => "some more documents with some word overlap with the other test"),
            );
            index_writer.add_document(doc!(name => "hello hi goodbye"));
            index_writer.commit().unwrap();
        }

        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        assert_eq!(0, segment.termdict().total());
        assert_eq!(0, segment.postings().total());
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        assert_eq!(0, segment.fast_fields().total());
        assert_eq!(0, segment.fieldnorms().total());
        assert!(segment.store().total() > 0);
        assert!(segment.store().total() < 512);
        assert_eq!(0, segment.deletes());
    }

    #[test]
    fn test_deletes() {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_u64_field("name", INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());

        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(name => 1u64));
            index_writer.add_document(doc!(name => 2u64));
            index_writer.add_document(doc!(name => 3u64));
            index_writer.add_document(doc!(name => 4u64));
            index_writer.commit().unwrap();
        }

        {
            let mut index_writer2 = index.writer(50_000_000).unwrap();
            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
            index_writer2.delete_term(Term::from_field_u64(name, 3u64));

            // ok, now we should have a deleted doc
            index_writer2.commit().unwrap();
        }

        index.load_searchers().unwrap();

        let searcher = index.searcher();
        let searcher_space_usage = searcher.space_usage();
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(2, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.positions_skip_idx().total());
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        // assert_eq!(0, segment.store().total());
        assert!(segment.deletes() > 0);
    }
}
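The space usage API above is reached through the searcher. A minimal end-to-end sketch, using only calls that appear in the new module and its tests (the field name and document text are illustrative):

#[macro_use]
extern crate tantivy;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
        index_writer.add_document(doc!(body => "hello space usage"));
        index_writer.commit()?;
    }
    index.load_searchers()?;
    let searcher = index.searcher();
    let usage = searcher.space_usage();
    println!("total: {} bytes over {} segment(s)", usage.total(), usage.segments().len());
    for segment in usage.segments() {
        // Per-segment breakdown: doc count plus per-component byte counts.
        println!("  {} docs, postings: {} bytes", segment.num_docs(), segment.postings().total());
    }
    Ok(())
}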
@@ -56,12 +56,12 @@ pub mod tests {
|
|||||||
use directory::{Directory, RAMDirectory, WritePtr};
|
use directory::{Directory, RAMDirectory, WritePtr};
|
||||||
use schema::Document;
|
use schema::Document;
|
||||||
use schema::FieldValue;
|
use schema::FieldValue;
|
||||||
|
use schema::Schema;
|
||||||
use schema::TextOptions;
|
use schema::TextOptions;
|
||||||
use schema::{Schema, SchemaBuilder};
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
|
pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
|
||||||
let mut schema_builder = SchemaBuilder::default();
|
let mut schema_builder = Schema::builder();
|
||||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||||
let field_title =
|
let field_title =
|
||||||
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ use common::BinarySerializable;
|
|||||||
use common::VInt;
|
use common::VInt;
|
||||||
use directory::ReadOnlySource;
|
use directory::ReadOnlySource;
|
||||||
use schema::Document;
|
use schema::Document;
|
||||||
|
use space_usage::StoreSpaceUsage;
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
@@ -87,12 +88,14 @@ impl StoreReader {
|
|||||||
cursor = &cursor[..doc_length];
|
cursor = &cursor[..doc_length];
|
||||||
Ok(Document::deserialize(&mut cursor)?)
|
Ok(Document::deserialize(&mut cursor)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Summarize total space usage of this store reader.
|
||||||
|
pub fn space_usage(&self) -> StoreSpaceUsage {
|
||||||
|
StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg_attr(
|
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
|
||||||
feature = "cargo-clippy",
|
|
||||||
allow(clippy::needless_pass_by_value)
|
|
||||||
)]
|
|
||||||
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
|
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
|
||||||
let data_len = data.len();
|
let data_len = data.len();
|
||||||
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
|
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
|
||||||
|
|||||||
@@ -53,7 +53,8 @@ impl<'a> TermMerger<'a> {
|
|||||||
.map(|(ord, streamer)| HeapItem {
|
.map(|(ord, streamer)| HeapItem {
|
||||||
streamer,
|
streamer,
|
||||||
segment_ord: ord,
|
segment_ord: ord,
|
||||||
}).collect(),
|
})
|
||||||
|
.collect(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -122,10 +123,7 @@ impl<'a> TermMerger<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Iterates through terms
|
/// Iterates through terms
|
||||||
#[cfg_attr(
|
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
|
||||||
feature = "cargo-clippy",
|
|
||||||
allow(clippy::should_implement_trait)
|
|
||||||
)]
|
|
||||||
pub fn next(&mut self) -> Option<Term<&[u8]>> {
|
pub fn next(&mut self) -> Option<Term<&[u8]>> {
|
||||||
if self.advance() {
|
if self.advance() {
|
||||||
Some(Term::wrap(self.current_streamers[0].streamer.key()))
|
Some(Term::wrap(self.current_streamers[0].streamer.key()))
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ mod tests {
|
|||||||
use core::Index;
|
use core::Index;
|
||||||
use directory::{Directory, RAMDirectory, ReadOnlySource};
|
use directory::{Directory, RAMDirectory, ReadOnlySource};
|
||||||
use postings::TermInfo;
|
use postings::TermInfo;
|
||||||
use schema::{Document, FieldType, SchemaBuilder, TEXT};
|
use schema::{Document, FieldType, Schema, TEXT};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str;
|
use std::str;
|
||||||
|
|
||||||
@@ -66,7 +66,7 @@ mod tests {
|
|||||||
let write = directory.open_write(&path).unwrap();
|
let write = directory.open_write(&path).unwrap();
|
||||||
let field_type = FieldType::Str(TEXT);
|
let field_type = FieldType::Str(TEXT);
|
||||||
let mut term_dictionary_builder =
|
let mut term_dictionary_builder =
|
||||||
TermDictionaryBuilder::new(write, &field_type).unwrap();
|
TermDictionaryBuilder::create(write, &field_type).unwrap();
|
||||||
for term in COUNTRIES.iter() {
|
for term in COUNTRIES.iter() {
|
||||||
term_dictionary_builder
|
term_dictionary_builder
|
||||||
.insert(term.as_bytes(), &make_term_info(0u64))
|
.insert(term.as_bytes(), &make_term_info(0u64))
|
||||||
@@ -92,7 +92,7 @@ mod tests {
|
|||||||
let write = directory.open_write(&path).unwrap();
|
let write = directory.open_write(&path).unwrap();
|
||||||
let field_type = FieldType::Str(TEXT);
|
let field_type = FieldType::Str(TEXT);
|
||||||
let mut term_dictionary_builder =
|
let mut term_dictionary_builder =
|
||||||
-            TermDictionaryBuilder::new(write, &field_type).unwrap();
+            TermDictionaryBuilder::create(write, &field_type).unwrap();
         term_dictionary_builder
             .insert("abc".as_bytes(), &make_term_info(34u64))
             .unwrap();
@@ -129,7 +129,7 @@ mod tests {

     #[test]
     fn test_term_iterator() {
-        let mut schema_builder = SchemaBuilder::default();
+        let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         {
@@ -167,7 +167,7 @@ mod tests {
         let mut term_string = String::new();
         while term_it.advance() {
             //let term = Term::from_bytes(term_it.key());
-            term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test
+            term_string.push_str(str::from_utf8(term_it.key()).expect("test"));
         }
         assert_eq!(&*term_string, "abcdef");
     }
@@ -180,7 +180,7 @@ mod tests {
         let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
             let mut term_dictionary_builder =
-                TermDictionaryBuilder::new(vec![], &field_type).unwrap();
+                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
             for &(ref id, ref i) in &ids {
                 term_dictionary_builder
                     .insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -210,7 +210,7 @@ mod tests {
         let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
             let mut term_dictionary_builder =
-                TermDictionaryBuilder::new(vec![], &field_type).unwrap();
+                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
             // term requires more than 16bits
             term_dictionary_builder
                 .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
@@ -245,7 +245,7 @@ mod tests {
         let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
             let mut term_dictionary_builder =
-                TermDictionaryBuilder::new(vec![], &field_type).unwrap();
+                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
             for &(ref id, ref i) in &ids {
                 term_dictionary_builder
                     .insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -314,7 +314,7 @@ mod tests {
         let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
             let mut term_dictionary_builder =
-                TermDictionaryBuilder::new(vec![], &field_type).unwrap();
+                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
             term_dictionary_builder
                 .insert(&[], &make_term_info(1 as u64))
                 .unwrap();
@@ -338,7 +338,7 @@ mod tests {
         let field_type = FieldType::Str(TEXT);
         let buffer: Vec<u8> = {
             let mut term_dictionary_builder =
-                TermDictionaryBuilder::new(vec![], &field_type).unwrap();
+                TermDictionaryBuilder::create(vec![], &field_type).unwrap();
             for i in 0u8..10u8 {
                 let number_arr = [i; 1];
                 term_dictionary_builder
@@ -408,7 +408,7 @@ mod tests {
         let write = directory.open_write(&path).unwrap();
         let field_type = FieldType::Str(TEXT);
         let mut term_dictionary_builder =
-            TermDictionaryBuilder::new(write, &field_type).unwrap();
+            TermDictionaryBuilder::create(write, &field_type).unwrap();
         for term in COUNTRIES.iter() {
             term_dictionary_builder
                 .insert(term.as_bytes(), &make_term_info(0u64))
@@ -132,10 +132,7 @@ where
     }

     /// Return the next `(key, value)` pair.
-    #[cfg_attr(
-        feature = "cargo-clippy",
-        allow(clippy::should_implement_trait)
-    )]
+    #[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
     pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
         if self.advance() {
             Some((self.key(), self.value()))
@@ -29,7 +29,7 @@ where
     W: Write,
 {
     /// Creates a new `TermDictionaryBuilder`
-    pub fn new(w: W, _field_type: &FieldType) -> io::Result<Self> {
+    pub fn create(w: W, _field_type: &FieldType) -> io::Result<Self> {
         let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
         Ok(TermDictionaryBuilder {
             fst_builder,
@@ -132,7 +132,7 @@ impl TermDictionary {
     /// Creates an empty term dictionary which contains no terms.
     pub fn empty(field_type: &FieldType) -> Self {
         let term_dictionary_data: Vec<u8> =
-            TermDictionaryBuilder::new(Vec::<u8>::new(), &field_type)
+            TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
                 .expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
                 .finish()
                 .expect("Writing in a Vec<u8> should never fail");
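The rename from `TermDictionaryBuilder::new` to `TermDictionaryBuilder::create` in the hunks above is purely a naming change; the build-then-finish flow stays the same. A minimal sketch of that flow, mirroring the body of `TermDictionary::empty` above. The `use` paths are the crate-internal ones assumed from this module and are not part of tantivy's documented public API:

```rust
// Sketch only: serialize an (empty) term dictionary into an in-memory buffer
// with the renamed constructor. Import paths are assumed crate-internal.
use schema::{FieldType, TEXT};
use termdict::TermDictionaryBuilder;

fn empty_dictionary_bytes() -> Vec<u8> {
    let field_type = FieldType::Str(TEXT);
    TermDictionaryBuilder::create(Vec::<u8>::new(), &field_type)
        .expect("creating a builder over a Vec<u8> should not fail")
        .finish()
        .expect("writing to a Vec<u8> should not fail")
}
```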
@@ -1,6 +1,5 @@
 use super::{Token, TokenStream, Tokenizer};
 use schema::FACET_SEP_BYTE;
-use std::str;

 /// The `FacetTokenizer` process a `Facet` binary representation
 /// and emits a token for all of its parent.
@@ -57,12 +56,11 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
                 .position(|b| b == FACET_SEP_BYTE)
                 .map(|pos| cursor + 1 + pos)
             {
-                let facet_part =
-                    unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
+                let facet_part = &self.text[cursor..next_sep_pos];
                 self.token.text.push_str(facet_part);
                 self.state = State::UpToPosition(next_sep_pos);
             } else {
-                let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..]) };
+                let facet_part = &self.text[cursor..];
                 self.token.text.push_str(facet_part);
                 self.state = State::Terminated;
             }
@@ -86,7 +84,6 @@ mod tests {

     use super::FacetTokenizer;
     use schema::Facet;
-    use std::str;
     use tokenizer::{Token, TokenStream, Tokenizer};

     #[test]
@@ -95,11 +92,11 @@ mod tests {
         let mut tokens = vec![];
         {
             let mut add_token = |token: &Token| {
-                let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
+                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
                 tokens.push(format!("{}", facet));
             };
             FacetTokenizer
-                .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) })
+                .token_stream(facet.encoded_str())
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
@@ -115,11 +112,11 @@ mod tests {
         let mut tokens = vec![];
         {
             let mut add_token = |token: &Token| {
-                let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test
+                let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap(); // ok test
                 tokens.push(format!("{}", facet));
             };
             FacetTokenizer
-                .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test
+                .token_stream(facet.encoded_str()) // ok test
                 .process(&mut add_token);
         }
         assert_eq!(tokens.len(), 1);
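With `Facet::from_encoded` now returning a `Result` and the tokenizer consuming `facet.encoded_str()` instead of an unsafe UTF-8 cast, expanding a facet into one token per path step is entirely safe code. A rough sketch of that usage, assuming the re-exports `tantivy::tokenizer::FacetTokenizer` and the `Facet::from("/...")` conversion from tantivy's public schema API:

```rust
extern crate tantivy;

use tantivy::schema::Facet;
use tantivy::tokenizer::{FacetTokenizer, Token, TokenStream, Tokenizer};

fn main() {
    // A facet path; the tokenizer emits a token for the facet and its ancestors.
    let facet = Facet::from("/category/electronics");
    let mut paths = vec![];
    {
        let mut add_token = |token: &Token| {
            // The token text carries the encoded facet; decode it back for display.
            let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
            paths.push(format!("{}", facet));
        };
        FacetTokenizer
            .token_stream(facet.encoded_str())
            .process(&mut add_token);
    }
    // One entry per step of the facet path.
    println!("{:?}", paths);
}
```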
@@ -9,7 +9,7 @@
 //! use tantivy::schema::*;
 //!
 //! # fn main() {
-//! let mut schema_builder = SchemaBuilder::new();
+//! let mut schema_builder = Schema::builder();
 //!
 //! let text_options = TextOptions::default()
 //!     .set_indexing_options(
@@ -82,12 +82,12 @@
 //!
 //! ```
 //! # extern crate tantivy;
-//! # use tantivy::schema::SchemaBuilder;
+//! # use tantivy::schema::Schema;
 //! # use tantivy::tokenizer::*;
 //! # use tantivy::Index;
 //! # fn main() {
 //! # let custom_en_tokenizer = SimpleTokenizer;
-//! # let schema = SchemaBuilder::new().build();
+//! # let schema = Schema::builder().build();
 //! let index = Index::create_in_ram(schema);
 //! index.tokenizers()
 //!     .register("custom_en", custom_en_tokenizer);
@@ -101,12 +101,12 @@
 //!
 //! ```
 //! extern crate tantivy;
-//! use tantivy::schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
+//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
 //! use tantivy::tokenizer::*;
 //! use tantivy::Index;
 //!
 //! # fn main() {
-//! let mut schema_builder = SchemaBuilder::new();
+//! let mut schema_builder = Schema::builder();
 //! let text_field_indexing = TextFieldIndexing::default()
 //!     .set_tokenizer("custom_en")
 //!     .set_index_option(IndexRecordOption::WithFreqsAndPositions);
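The module docs above now go through `Schema::builder()` rather than constructing a `SchemaBuilder` directly. A minimal end-to-end sketch of that pattern, using only calls that appear in the hunks above:

```rust
extern crate tantivy;

use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() {
    // `Schema::builder()` replaces `SchemaBuilder::new()` / `SchemaBuilder::default()`.
    let mut schema_builder = Schema::builder();
    let _text_field = schema_builder.add_text_field("text", TEXT);
    let schema = schema_builder.build();
    let _index = Index::create_in_ram(schema);
}
```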
@@ -157,35 +157,32 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;

-/// This is a function that can be used in tests and doc tests
-/// to assert a token's correctness.
-/// TODO: can this be wrapped in #[cfg(test)] so as not to be in the
-/// public api?
-pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
-    assert_eq!(
-        token.position, position,
-        "expected position {} but {:?}",
-        position, token
-    );
-    assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
-    assert_eq!(
-        token.offset_from, from,
-        "expected offset_from {} but {:?}",
-        from, token
-    );
-    assert_eq!(
-        token.offset_to, to,
-        "expected offset_to {} but {:?}",
-        to, token
-    );
-}
-
 #[cfg(test)]
-pub mod test {
-    use super::assert_token;
+pub mod tests {
     use super::Token;
     use super::TokenizerManager;

+    /// This is a function that can be used in tests and doc tests
+    /// to assert a token's correctness.
+    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
+        assert_eq!(
+            token.position, position,
+            "expected position {} but {:?}",
+            position, token
+        );
+        assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
+        assert_eq!(
+            token.offset_from, from,
+            "expected offset_from {} but {:?}",
+            from, token
+        );
+        assert_eq!(
+            token.offset_to, to,
+            "expected offset_to {} but {:?}",
+            to, token
+        );
+    }
+
     #[test]
     fn test_raw_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
@@ -224,72 +221,6 @@ pub mod test {
         assert_token(&tokens[3], 3, "payer", 17, 22);
     }

-    #[test]
-    fn test_ngram_tokenizer() {
-        use super::{LowerCaser, NgramTokenizer};
-        use tokenizer::tokenizer::TokenStream;
-        use tokenizer::tokenizer::Tokenizer;
-
-        let tokenizer_manager = TokenizerManager::default();
-        tokenizer_manager.register("ngram12", NgramTokenizer::new(1, 2, false));
-        tokenizer_manager.register(
-            "ngram3",
-            NgramTokenizer::new(3, 3, false).filter(LowerCaser),
-        );
-        tokenizer_manager.register(
-            "edgegram5",
-            NgramTokenizer::new(2, 5, true).filter(LowerCaser),
-        );
-
-        let tokenizer = NgramTokenizer::new(1, 2, false);
-        let mut tokens: Vec<Token> = vec![];
-        {
-            let mut add_token = |token: &Token| {
-                tokens.push(token.clone());
-            };
-            tokenizer.token_stream("hello").process(&mut add_token);
-        }
-        assert_eq!(tokens.len(), 9);
-        assert_token(&tokens[0], 0, "h", 0, 1);
-        assert_token(&tokens[1], 0, "he", 0, 2);
-        assert_token(&tokens[2], 1, "e", 1, 2);
-        assert_token(&tokens[3], 1, "el", 1, 3);
-        assert_token(&tokens[4], 2, "l", 2, 3);
-        assert_token(&tokens[5], 2, "ll", 2, 4);
-        assert_token(&tokens[6], 3, "l", 3, 4);
-        assert_token(&tokens[7], 3, "lo", 3, 5);
-        assert_token(&tokens[8], 4, "o", 4, 5);
-
-        let tokenizer = tokenizer_manager.get("ngram3").unwrap();
-        let mut tokens: Vec<Token> = vec![];
-        {
-            let mut add_token = |token: &Token| {
-                tokens.push(token.clone());
-            };
-            tokenizer.token_stream("Hello").process(&mut add_token);
-        }
-        assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hel", 0, 3);
-        assert_token(&tokens[1], 1, "ell", 1, 4);
-        assert_token(&tokens[2], 2, "llo", 2, 5);
-
-        let tokenizer = tokenizer_manager.get("edgegram5").unwrap();
-        let mut tokens: Vec<Token> = vec![];
-        {
-            let mut add_token = |token: &Token| {
-                tokens.push(token.clone());
-            };
-            tokenizer
-                .token_stream("Frankenstein")
-                .process(&mut add_token);
-        }
-        assert_eq!(tokens.len(), 4);
-        assert_token(&tokens[0], 0, "fr", 0, 2);
-        assert_token(&tokens[1], 0, "fra", 0, 3);
-        assert_token(&tokens[2], 0, "fran", 0, 4);
-        assert_token(&tokens[3], 0, "frank", 0, 5);
-    }
-
     #[test]
     fn test_tokenizer_empty() {
         let tokenizer_manager = TokenizerManager::default();
@@ -2,14 +2,15 @@ use super::{Token, TokenStream, Tokenizer};

 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
-/// With this tokenizer, the `position` field expresses the starting offset of the ngram
-/// rather than the `token` offset.
+/// With this tokenizer, the `position` is always 0.
+/// Beware however, in presence of multiple value for the same field,
+/// the position will be `POSITION_GAP * index of value`.
 ///
 /// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
 ///
 /// | Term     | he  | hel | el  | ell | ll  | llo | lo |
 /// |----------|-----|-----|-----|-----|-----|-----|----|
-/// | Position | 0   | 0   | 1   | 1   | 2   | 2   | 3  |
+/// | Position | 0   | 0   | 0   | 0   | 0   | 0   | 0  |
 /// | Offsets  | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
 ///
 /// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
@@ -19,24 +20,63 @@ use super::{Token, TokenStream, Tokenizer};
 /// | Position | 0   | 0   | 0   | 0   |
 /// | Offsets  | 0,2 | 0,3 | 0,4 | 0,5 |
 ///
+/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
+///
+/// | Term     | hε  | hεl | hεll | hεllo |
+/// |----------|-----|-----|------|-------|
+/// | Position | 0   | 0   | 0    | 0     |
+/// | Offsets  | 0,3 | 0,4 | 0,5  | 0,6   |
+///
 /// # Example
 ///
 /// ```
-/// extern crate tantivy;
+/// # extern crate tantivy;
 /// use tantivy::tokenizer::*;
-/// use tantivy::tokenizer::assert_token;
-///
 /// # fn main() {
 /// let tokenizer = NgramTokenizer::new(2, 3, false);
 /// let mut stream = tokenizer.token_stream("hello");
-///
-/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
-/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
-/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
-/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
-/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
-/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
-/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "he");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 2);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "hel");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 3);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "el");
+///     assert_eq!(token.offset_from, 1);
+///     assert_eq!(token.offset_to, 3);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "ell");
+///     assert_eq!(token.offset_from, 1);
+///     assert_eq!(token.offset_to, 4);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "ll");
+///     assert_eq!(token.offset_from, 2);
+///     assert_eq!(token.offset_to, 4);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "llo");
+///     assert_eq!(token.offset_from, 2);
+///     assert_eq!(token.offset_to, 5);
+/// }
+/// {
+///     let token = stream.next().unwrap();
+///     assert_eq!(token.text, "lo");
+///     assert_eq!(token.offset_from, 3);
+///     assert_eq!(token.offset_to, 5);
+/// }
 /// assert!(stream.next().is_none());
 /// # }
 /// ```
@@ -58,23 +98,37 @@ impl NgramTokenizer {
             min_gram <= max_gram,
             "min_gram must not be greater than max_gram"
         );

         NgramTokenizer {
             min_gram,
             max_gram,
             prefix_only,
         }
     }

+    /// Create a `NGramTokenizer` which generates tokens for all inner ngrams.
+    ///
+    /// This is as opposed to only prefix ngrams.
+    pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+        Self::new(min_gram, max_gram, false)
+    }
+
+    /// Create a `NGramTokenizer` which only generates tokens for the
+    /// prefix ngrams.
+    pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer {
+        Self::new(min_gram, max_gram, true)
+    }
 }

+/// TokenStream associate to the `NgramTokenizer`
 pub struct NgramTokenStream<'a> {
-    text: &'a str,
-    position: usize,
-    text_length: usize,
-    token: Token,
-    min_gram: usize,
-    max_gram: usize,
-    gram_size: usize,
+    /// parameters
+    ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
+    /// true if the NgramTokenStream is in prefix mode.
     prefix_only: bool,
+    /// input
+    text: &'a str,
+    /// output
+    token: Token,
 }

 impl<'a> Tokenizer<'a> for NgramTokenizer {
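The two new constructors simply wrap `NgramTokenizer::new` with the `prefix_only` flag pre-set. A short sketch of how they are meant to be called (mirroring the new tests further down), printing each token's text and byte offsets:

```rust
extern crate tantivy;

use tantivy::tokenizer::{NgramTokenizer, TokenStream, Tokenizer};

fn main() {
    // All inner 2- and 3-grams of "hello": he, hel, el, ell, ll, llo, lo.
    let all = NgramTokenizer::all_ngrams(2, 3);
    let mut stream = all.token_stream("hello");
    while stream.advance() {
        let token = stream.token();
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }

    // Only the prefix ngrams of "hello": he, hel, hell, hello.
    let prefixes = NgramTokenizer::prefix_only(2, 5);
    let mut stream = prefixes.token_stream("hello");
    while stream.advance() {
        let token = stream.token();
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}
```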
@@ -82,65 +136,29 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {

     fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
         NgramTokenStream {
-            text,
-            position: 0,
-            text_length: text.len(),
-            token: Token::default(),
-            min_gram: self.min_gram,
-            max_gram: self.max_gram,
+            ngram_charidx_iterator: StutteringIterator::new(
+                CodepointFrontiers::for_str(text),
+                self.min_gram,
+                self.max_gram,
+            ),
             prefix_only: self.prefix_only,
-            gram_size: self.min_gram,
+            text,
+            token: Token::default(),
         }
     }
 }

-impl<'a> NgramTokenStream<'a> {
-    /// Get the next set of token options
-    /// cycle through 1,2 (min..=max)
-    /// returning None if processing should stop
-    fn chomp(&mut self) -> Option<(usize, usize)> {
-        // Have we exceeded the bounds of the text we are indexing?
-        if self.gram_size > self.max_gram {
-            if self.prefix_only {
-                return None;
-            }
-
-            // since we aren't just processing edges
-            // we need to reset the gram size
-            self.gram_size = self.min_gram;
-
-            // and move down the chain of letters
-            self.position += 1;
-        }
-
-        let result = if (self.position + self.gram_size) <= self.text_length {
-            Some((self.position, self.gram_size))
-        } else {
-            None
-        };
-
-        // increase the gram size for the next pass
-        self.gram_size += 1;
-
-        result
-    }
-}

 impl<'a> TokenStream for NgramTokenStream<'a> {
     fn advance(&mut self) -> bool {
-        // clear out working token text
-        self.token.text.clear();
-
-        if let Some((position, size)) = self.chomp() {
-            self.token.position = position;
-            let offset_from = position;
-            let offset_to = offset_from + size;
-
+        if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
+            if self.prefix_only && offset_from > 0 {
+                return false;
+            }
+            self.token.position = 0;
             self.token.offset_from = offset_from;
             self.token.offset_to = offset_to;
+            self.token.text.clear();
             self.token.text.push_str(&self.text[offset_from..offset_to]);

             true
         } else {
             false
@@ -150,8 +168,298 @@ impl<'a> TokenStream for NgramTokenStream<'a> {
     fn token(&self) -> &Token {
         &self.token
     }

     fn token_mut(&mut self) -> &mut Token {
         &mut self.token
     }
 }

+/// This iterator takes an underlying Iterator
+/// and emits all of the pairs `(a,b)` such that
+/// a and b are items emitted by the iterator at
+/// an interval between `min_gram` and `max_gram`.
+///
+/// The elements are emitted in the order of appearance
+/// of `a` first, `b` then.
+///
+/// See `test_stutterring_iterator` for an example of its
+/// output.
+struct StutteringIterator<T> {
+    underlying: T,
+    min_gram: usize,
+    max_gram: usize,
+
+    memory: Vec<usize>,
+    cursor: usize,
+    gram_len: usize,
+}
+
+impl<T> StutteringIterator<T>
+where
+    T: Iterator<Item = usize>,
+{
+    pub fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
+        assert!(min_gram > 0);
+        let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
+        if memory.len() <= min_gram {
+            // returns an empty iterator
+            StutteringIterator {
+                underlying,
+                min_gram: 1,
+                max_gram: 0,
+                memory,
+                cursor: 0,
+                gram_len: 0,
+            }
+        } else {
+            StutteringIterator {
+                underlying,
+                min_gram,
+                max_gram: memory.len() - 1,
+                memory,
+                cursor: 0,
+                gram_len: min_gram,
+            }
+        }
+    }
+}
+
+impl<T> Iterator for StutteringIterator<T>
+where
+    T: Iterator<Item = usize>,
+{
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<(usize, usize)> {
+        if self.gram_len > self.max_gram {
+            // we have exhausted all options
+            // starting at `self.memory[self.cursor]`.
+            //
+            // Time to advance.
+            self.gram_len = self.min_gram;
+            if let Some(next_val) = self.underlying.next() {
+                self.memory[self.cursor] = next_val;
+            } else {
+                self.max_gram -= 1;
+            }
+            self.cursor += 1;
+            if self.cursor >= self.memory.len() {
+                self.cursor = 0;
+            }
+        }
+        if self.max_gram < self.min_gram {
+            return None;
+        }
+        let start = self.memory[self.cursor % self.memory.len()];
+        let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
+        self.gram_len += 1;
+        Some((start, stop))
+    }
+}
+
+/// Emits all of the offsets where a codepoint starts
+/// or a codepoint ends.
+///
+/// By convention, we emit [0] for the empty string.
+struct CodepointFrontiers<'a> {
+    s: &'a str,
+    next_el: Option<usize>,
+}
+
+impl<'a> CodepointFrontiers<'a> {
+    fn for_str(s: &'a str) -> Self {
+        CodepointFrontiers {
+            s,
+            next_el: Some(0),
+        }
+    }
+}
+
+impl<'a> Iterator for CodepointFrontiers<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<usize> {
+        self.next_el.map(|offset| {
+            if self.s.is_empty() {
+                self.next_el = None;
+            } else {
+                let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
+                self.s = &self.s[first_codepoint_width..];
+                self.next_el = Some(offset + first_codepoint_width);
+            }
+            offset
+        })
+    }
+}
+
+const CODEPOINT_UTF8_WIDTH: [u8; 16] = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4];
+
+// Number of bytes to encode a codepoint in UTF-8 given
+// the first byte.
+//
+// To do that we count the number of higher significant bits set to `1`.
+fn utf8_codepoint_width(b: u8) -> usize {
+    let higher_4_bits = (b as usize) >> 4;
+    CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::utf8_codepoint_width;
+    use super::CodepointFrontiers;
+    use super::NgramTokenizer;
+    use super::StutteringIterator;
+    use tokenizer::tests::assert_token;
+    use tokenizer::tokenizer::{TokenStream, Tokenizer};
+    use tokenizer::Token;
+
+    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
+        let mut tokens: Vec<Token> = vec![];
+        tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
+        tokens
+    }
+
+    #[test]
+    fn test_utf8_codepoint_width() {
+        // 0xxx
+        for i in 0..128 {
+            assert_eq!(utf8_codepoint_width(i), 1);
+        }
+        // 110xx
+        for i in (128 | 64)..(128 | 64 | 32) {
+            assert_eq!(utf8_codepoint_width(i), 2);
+        }
+        // 1110xx
+        for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
+            assert_eq!(utf8_codepoint_width(i), 3);
+        }
+        // 1111xx
+        for i in (128 | 64 | 32 | 16)..256 {
+            assert_eq!(utf8_codepoint_width(i as u8), 4);
+        }
+    }
+
+    #[test]
+    fn test_codepoint_frontiers() {
+        assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<_>>(), vec![0]);
+        assert_eq!(
+            CodepointFrontiers::for_str("abcd").collect::<Vec<_>>(),
+            vec![0, 1, 2, 3, 4]
+        );
+        assert_eq!(
+            CodepointFrontiers::for_str("aあ").collect::<Vec<_>>(),
+            vec![0, 1, 4]
+        );
+    }
+
+    #[test]
+    fn test_ngram_tokenizer_1_2_false() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "he", 0, 2);
+        assert_token(&tokens[2], 0, "e", 1, 2);
+        assert_token(&tokens[3], 0, "el", 1, 3);
+        assert_token(&tokens[4], 0, "l", 2, 3);
+        assert_token(&tokens[5], 0, "ll", 2, 4);
+        assert_token(&tokens[6], 0, "l", 3, 4);
+        assert_token(&tokens[7], 0, "lo", 3, 5);
+        assert_token(&tokens[8], 0, "o", 4, 5);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer_min_max_equal() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
+        assert_eq!(tokens.len(), 3);
+        assert_token(&tokens[0], 0, "hel", 0, 3);
+        assert_token(&tokens[1], 0, "ell", 1, 4);
+        assert_token(&tokens[2], 0, "llo", 2, 5);
+    }
+
+    #[test]
+    fn test_ngram_tokenizer_2_5_prefix() {
+        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "fr", 0, 2);
+        assert_token(&tokens[1], 0, "fra", 0, 3);
+        assert_token(&tokens[2], 0, "fran", 0, 4);
+        assert_token(&tokens[3], 0, "frank", 0, 5);
+    }
+
+    #[test]
+    fn test_ngram_non_ascii_1_2() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
+        assert_eq!(tokens.len(), 9);
+        assert_token(&tokens[0], 0, "h", 0, 1);
+        assert_token(&tokens[1], 0, "hε", 0, 3);
+        assert_token(&tokens[2], 0, "ε", 1, 3);
+        assert_token(&tokens[3], 0, "εl", 1, 4);
+        assert_token(&tokens[4], 0, "l", 3, 4);
+        assert_token(&tokens[5], 0, "ll", 3, 5);
+        assert_token(&tokens[6], 0, "l", 4, 5);
+        assert_token(&tokens[7], 0, "lo", 4, 6);
+        assert_token(&tokens[8], 0, "o", 5, 6);
+    }
+
+    #[test]
+    fn test_ngram_non_ascii_2_5_prefix() {
+        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "hε", 0, 3);
+        assert_token(&tokens[1], 0, "hεl", 0, 4);
+        assert_token(&tokens[2], 0, "hεll", 0, 5);
+        assert_token(&tokens[3], 0, "hεllo", 0, 6);
+    }
+
+    #[test]
+    fn test_ngram_empty() {
+        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
+        assert!(tokens.is_empty());
+        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
+        assert!(tokens.is_empty());
+    }
+
+    #[test]
+    #[should_panic(expected = "min_gram must be greater than 0")]
+    fn test_ngram_min_max_interval_empty() {
+        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
+    }
+
+    #[test]
+    #[should_panic(expected = "min_gram must not be greater than max_gram")]
+    fn test_invalid_interval_should_panic_if_smaller() {
+        NgramTokenizer::all_ngrams(2, 1);
+    }
+
+    #[test]
+    fn test_stutterring_iterator_empty() {
+        let rg: Vec<usize> = vec![0];
+        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
+        assert_eq!(it.next(), None);
+    }
+
+    #[test]
+    fn test_stutterring_iterator() {
+        let rg: Vec<usize> = (0..10).collect();
+        let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
+        assert_eq!(it.next(), Some((0, 1)));
+        assert_eq!(it.next(), Some((0, 2)));
+        assert_eq!(it.next(), Some((1, 2)));
+        assert_eq!(it.next(), Some((1, 3)));
+        assert_eq!(it.next(), Some((2, 3)));
+        assert_eq!(it.next(), Some((2, 4)));
+        assert_eq!(it.next(), Some((3, 4)));
+        assert_eq!(it.next(), Some((3, 5)));
+        assert_eq!(it.next(), Some((4, 5)));
+        assert_eq!(it.next(), Some((4, 6)));
+        assert_eq!(it.next(), Some((5, 6)));
+        assert_eq!(it.next(), Some((5, 7)));
+        assert_eq!(it.next(), Some((6, 7)));
+        assert_eq!(it.next(), Some((6, 8)));
+        assert_eq!(it.next(), Some((7, 8)));
+        assert_eq!(it.next(), Some((7, 9)));
+        assert_eq!(it.next(), Some((8, 9)));
+        assert_eq!(it.next(), None);
+    }
+}
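The rewritten tokenizer replaces byte-position arithmetic with codepoint-aware offsets: `CodepointFrontiers` yields the byte index of every codepoint boundary, and `StutteringIterator` pairs those boundaries into (start, end) spans of `min_gram..=max_gram` codepoints. A standalone sketch of the same enumeration using only the standard library, handy for checking the expected byte offsets in the tests above; the function name is illustrative, not part of tantivy:

```rust
// Illustrative only: reproduce the (start, end) byte-offset enumeration of the
// StutteringIterator/CodepointFrontiers pipeline with std iterators.
fn ngram_byte_offsets(text: &str, min_gram: usize, max_gram: usize) -> Vec<(usize, usize)> {
    assert!(min_gram > 0 && min_gram <= max_gram);
    // Byte offsets where a codepoint starts, plus the end of the string.
    let mut frontiers: Vec<usize> = text.char_indices().map(|(i, _)| i).collect();
    frontiers.push(text.len());
    let mut spans = Vec::new();
    for start in 0..frontiers.len().saturating_sub(min_gram) {
        // Shorter grams are emitted before longer ones at each starting position.
        for len in min_gram..=max_gram {
            if start + len < frontiers.len() {
                spans.push((frontiers[start], frontiers[start + len]));
            }
        }
    }
    spans
}

fn main() {
    // "hεllo": 'ε' takes two bytes, so the bigram offsets are 0..3, 1..4, 3..5, 4..6,
    // matching test_ngram_non_ascii_1_2 above.
    assert_eq!(
        ngram_byte_offsets("hεllo", 2, 2),
        vec![(0, 3), (1, 4), (3, 5), (4, 6)]
    );
    // Nine 1- and 2-grams for "hello", as in test_ngram_tokenizer_1_2_false.
    assert_eq!(ngram_byte_offsets("hello", 1, 2).len(), 9);
    println!("ok");
}
```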