mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 01:52:54 +00:00
Compare commits
46 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e14701e9cd | ||
|
|
45e62d4329 | ||
|
|
76d2b4dab6 | ||
|
|
04e9606638 | ||
|
|
a5c57ebbd9 | ||
|
|
96eaa5bc63 | ||
|
|
f1d30ab196 | ||
|
|
4507df9255 | ||
|
|
e8625548b7 | ||
|
|
50ed6fb534 | ||
|
|
76609deadf | ||
|
|
749e62c40b | ||
|
|
259ce567d1 | ||
|
|
4c93b096eb | ||
|
|
6a547b0b5f | ||
|
|
e99d1a2355 | ||
|
|
c7bddc5fe3 | ||
|
|
7b97dde335 | ||
|
|
644b4bd0a1 | ||
|
|
bf94fd77db | ||
|
|
097eaf4aa6 | ||
|
|
1fd46c1e9b | ||
|
|
2fb219d017 | ||
|
|
63b593bd0a | ||
|
|
286bb75a0c | ||
|
|
222b7f2580 | ||
|
|
5292e78860 | ||
|
|
c0cc6aac83 | ||
|
|
0b0bf59a32 | ||
|
|
74f70a5c2c | ||
|
|
1acfb2ebb5 | ||
|
|
4dfd091e67 | ||
|
|
8eba4ab807 | ||
|
|
5e8e03882b | ||
|
|
7df3260a15 | ||
|
|
176f67a266 | ||
|
|
19babff849 | ||
|
|
bf2576adf9 | ||
|
|
0e8fcd5727 | ||
|
|
f745c83bb7 | ||
|
|
ffb16d9103 | ||
|
|
98ca703daa | ||
|
|
b9d25cda5d | ||
|
|
beb4289ec2 | ||
|
|
bdd72e4683 | ||
|
|
45c3cd19be |
17
CHANGELOG.md
17
CHANGELOG.md
@@ -1,3 +1,20 @@
|
|||||||
|
Tantivy 0.9.0
|
||||||
|
=====================
|
||||||
|
*0.9.0 index format is not compatible with the
|
||||||
|
previous index format.*
|
||||||
|
- Removed most unsafe (@fulmicoton)
|
||||||
|
- Indexer memory footprint improved. (VInt comp, inlining the first block.) (@fulmicoton)
|
||||||
|
- Stemming in other languages possible (@pentlander)
|
||||||
|
- Segments with no docs are deleted earlier (@barrotsteindev)
|
||||||
|
|
||||||
|
Tantivy 0.8.1
|
||||||
|
=====================
|
||||||
|
Hotfix of #476.
|
||||||
|
|
||||||
|
Merge was reflecting deletes before commit was passed.
|
||||||
|
Thanks @barrotsteindev for reporting the bug.
|
||||||
|
|
||||||
|
|
||||||
Tantivy 0.8.0
|
Tantivy 0.8.0
|
||||||
=====================
|
=====================
|
||||||
*No change in the index format*
|
*No change in the index format*
|
||||||
|
|||||||
13
Cargo.toml
13
Cargo.toml
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "tantivy"
|
name = "tantivy"
|
||||||
version = "0.8.0"
|
version = "0.9.0-dev"
|
||||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
categories = ["database-implementations", "data-structures"]
|
categories = ["database-implementations", "data-structures"]
|
||||||
@@ -29,6 +29,7 @@ serde = "1.0"
|
|||||||
serde_derive = "1.0"
|
serde_derive = "1.0"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
num_cpus = "1.2"
|
num_cpus = "1.2"
|
||||||
|
fs2={version="0.4", optional=true}
|
||||||
itertools = "0.8"
|
itertools = "0.8"
|
||||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||||
bit-set = "0.5"
|
bit-set = "0.5"
|
||||||
@@ -38,11 +39,11 @@ futures = "0.1"
|
|||||||
futures-cpupool = "0.1"
|
futures-cpupool = "0.1"
|
||||||
owning_ref = "0.4"
|
owning_ref = "0.4"
|
||||||
stable_deref_trait = "1.0.0"
|
stable_deref_trait = "1.0.0"
|
||||||
rust-stemmers = "1"
|
rust-stemmers = "1.1"
|
||||||
downcast = { version="0.9" }
|
downcast-rs = { version="1.0" }
|
||||||
matches = "0.1"
|
matches = "0.1"
|
||||||
bitpacking = "0.5"
|
bitpacking = "0.6"
|
||||||
census = "0.1"
|
census = "0.2"
|
||||||
fnv = "1.0.6"
|
fnv = "1.0.6"
|
||||||
owned-read = "0.4"
|
owned-read = "0.4"
|
||||||
failure = "0.1"
|
failure = "0.1"
|
||||||
@@ -70,7 +71,7 @@ overflow-checks = true
|
|||||||
[features]
|
[features]
|
||||||
# by default no-fail is disabled. We manually enable it when running test.
|
# by default no-fail is disabled. We manually enable it when running test.
|
||||||
default = ["mmap", "no_fail"]
|
default = ["mmap", "no_fail"]
|
||||||
mmap = ["fst/mmap", "atomicwrites"]
|
mmap = ["fst/mmap", "atomicwrites", "fs2"]
|
||||||
lz4-compression = ["lz4"]
|
lz4-compression = ["lz4"]
|
||||||
no_fail = ["fail/no_fail"]
|
no_fail = ["fail/no_fail"]
|
||||||
unstable = [] # useful for benches.
|
unstable = [] # useful for benches.
|
||||||
|
|||||||
@@ -21,7 +21,7 @@
|
|||||||
|
|
||||||
**Tantivy** is a **full text search engine library** written in rust.
|
**Tantivy** is a **full text search engine library** written in rust.
|
||||||
|
|
||||||
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
||||||
an off-the-shelf search engine server, but rather a crate that can be used
|
an off-the-shelf search engine server, but rather a crate that can be used
|
||||||
to build such a search engine.
|
to build such a search engine.
|
||||||
|
|
||||||
@@ -76,7 +76,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
|||||||
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
||||||
To check out and run tests, you can simply run :
|
To check out and run tests, you can simply run :
|
||||||
|
|
||||||
git clone git@github.com:tantivy-search/tantivy.git
|
git clone https://github.com/tantivy-search/tantivy.git
|
||||||
cd tantivy
|
cd tantivy
|
||||||
cargo build
|
cargo build
|
||||||
|
|
||||||
|
|||||||
41
examples/integer_range_search.rs
Normal file
41
examples/integer_range_search.rs
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
// # Searching a range on an indexed int field.
|
||||||
|
//
|
||||||
|
// Below is an example of creating an indexed integer field in your schema
|
||||||
|
// You can use RangeQuery to get a Count of all occurrences in a given range.
|
||||||
|
|
||||||
|
#[macro_use]
|
||||||
|
extern crate tantivy;
|
||||||
|
use tantivy::collector::Count;
|
||||||
|
use tantivy::query::RangeQuery;
|
||||||
|
use tantivy::schema::{Schema, INT_INDEXED};
|
||||||
|
use tantivy::Index;
|
||||||
|
use tantivy::Result;
|
||||||
|
|
||||||
|
fn run() -> Result<()> {
|
||||||
|
// For the sake of simplicity, this schema will only have 1 field
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
// INT_INDEXED is shorthand for such fields
|
||||||
|
let year_field = schema_builder.add_u64_field("year", INT_INDEXED);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
{
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
|
||||||
|
for year in 1950u64..2019u64 {
|
||||||
|
index_writer.add_document(doc!(year_field => year));
|
||||||
|
}
|
||||||
|
index_writer.commit()?;
|
||||||
|
// The index will be a range of years
|
||||||
|
}
|
||||||
|
index.load_searchers()?;
|
||||||
|
let searcher = index.searcher();
|
||||||
|
// The end is excluded i.e. here we are searching up to 1969
|
||||||
|
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
|
||||||
|
// Uses a Count collector to sum the total number of docs in the range
|
||||||
|
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||||
|
assert_eq!(num_60s_books, 10);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
run().unwrap()
|
||||||
|
}
|
||||||
@@ -88,7 +88,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema.clone());
|
let index = Index::create_in_ram(schema.clone());
|
||||||
|
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
for i in 0u64..10u64 {
|
for i in 0u64..10u64 {
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ See the `custom_collector` example.
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use downcast;
|
use downcast_rs;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
@@ -111,9 +111,9 @@ pub use self::facet_collector::FacetCollector;
|
|||||||
|
|
||||||
/// `Fruit` is the type for the result of our collection.
|
/// `Fruit` is the type for the result of our collection.
|
||||||
/// e.g. `usize` for the `Count` collector.
|
/// e.g. `usize` for the `Count` collector.
|
||||||
pub trait Fruit: Send + downcast::Any {}
|
pub trait Fruit: Send + downcast_rs::Downcast {}
|
||||||
|
|
||||||
impl<T> Fruit for T where T: Send + downcast::Any {}
|
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
|
||||||
|
|
||||||
/// Collectors are in charge of collecting and retaining relevant
|
/// Collectors are in charge of collecting and retaining relevant
|
||||||
/// information from the document found and scored by the query.
|
/// information from the document found and scored by the query.
|
||||||
@@ -358,10 +358,7 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(missing_docs)]
|
impl_downcast!(Fruit);
|
||||||
mod downcast_impl {
|
|
||||||
downcast!(super::Fruit);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests;
|
pub mod tests;
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
use super::Collector;
|
use super::Collector;
|
||||||
use super::SegmentCollector;
|
use super::SegmentCollector;
|
||||||
use collector::Fruit;
|
use collector::Fruit;
|
||||||
use downcast::Downcast;
|
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
@@ -37,11 +36,11 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
|||||||
let typed_fruit: Vec<TCollector::Fruit> = children
|
let typed_fruit: Vec<TCollector::Fruit> = children
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|untyped_fruit| {
|
.map(|untyped_fruit| {
|
||||||
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
|
untyped_fruit
|
||||||
|
.downcast::<TCollector::Fruit>()
|
||||||
.map(|boxed_but_typed| *boxed_but_typed)
|
.map(|boxed_but_typed| *boxed_but_typed)
|
||||||
.map_err(|e| {
|
.map_err(|_| {
|
||||||
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
|
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
|
||||||
TantivyError::InvalidArgument(err_msg)
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.collect::<Result<_>>()?;
|
.collect::<Result<_>>()?;
|
||||||
@@ -89,14 +88,20 @@ pub struct FruitHandle<TFruit: Fruit> {
|
|||||||
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||||
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
||||||
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
||||||
*Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
|
*boxed_fruit
|
||||||
|
.downcast::<TFruit>()
|
||||||
|
.map_err(|_| ())
|
||||||
|
.expect("Failed to downcast collector fruit.")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multicollector makes it possible to collect on more than one collector.
|
/// Multicollector makes it possible to collect on more than one collector.
|
||||||
/// It should only be used for use cases where the Collector types is unknown
|
/// It should only be used for use cases where the Collector types is unknown
|
||||||
/// at compile time.
|
/// at compile time.
|
||||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
///
|
||||||
|
/// If the type of the collectors is known, you can just group your collectors
|
||||||
|
/// in a tuple. See the
|
||||||
|
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
|
||||||
///
|
///
|
||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
|
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
|
||||||
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
|
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
|
||||||
index_writer.add_document(doc!(text_field=>"I like Droopy"));
|
index_writer.add_document(doc!(text_field=>"I like Droopy"));
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ pub struct BitUnpacker<Data>
|
|||||||
where
|
where
|
||||||
Data: Deref<Target = [u8]>,
|
Data: Deref<Target = [u8]>,
|
||||||
{
|
{
|
||||||
num_bits: usize,
|
num_bits: u64,
|
||||||
mask: u64,
|
mask: u64,
|
||||||
data: Data,
|
data: Data,
|
||||||
}
|
}
|
||||||
@@ -80,13 +80,13 @@ where
|
|||||||
(1u64 << num_bits) - 1u64
|
(1u64 << num_bits) - 1u64
|
||||||
};
|
};
|
||||||
BitUnpacker {
|
BitUnpacker {
|
||||||
num_bits: num_bits as usize,
|
num_bits: num_bits as u64,
|
||||||
mask,
|
mask,
|
||||||
data,
|
data,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get(&self, idx: usize) -> u64 {
|
pub fn get(&self, idx: u64) -> u64 {
|
||||||
if self.num_bits == 0 {
|
if self.num_bits == 0 {
|
||||||
return 0u64;
|
return 0u64;
|
||||||
}
|
}
|
||||||
@@ -97,38 +97,13 @@ where
|
|||||||
let addr = addr_in_bits >> 3;
|
let addr = addr_in_bits >> 3;
|
||||||
let bit_shift = addr_in_bits & 7;
|
let bit_shift = addr_in_bits & 7;
|
||||||
debug_assert!(
|
debug_assert!(
|
||||||
addr + 8 <= data.len(),
|
addr + 8 <= data.len() as u64,
|
||||||
"The fast field field should have been padded with 7 bytes."
|
"The fast field field should have been padded with 7 bytes."
|
||||||
);
|
);
|
||||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
|
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
|
||||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||||
val_shifted & mask
|
val_shifted & mask
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Reads a range of values from the fast field.
|
|
||||||
///
|
|
||||||
/// The range of values read is from
|
|
||||||
/// `[start..start + output.len()[`
|
|
||||||
pub fn get_range(&self, start: u32, output: &mut [u64]) {
|
|
||||||
if self.num_bits == 0 {
|
|
||||||
for val in output.iter_mut() {
|
|
||||||
*val = 0u64;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let data: &[u8] = &*self.data;
|
|
||||||
let num_bits = self.num_bits;
|
|
||||||
let mask = self.mask;
|
|
||||||
let mut addr_in_bits = (start as usize) * num_bits;
|
|
||||||
for output_val in output.iter_mut() {
|
|
||||||
let addr = addr_in_bits >> 3;
|
|
||||||
let bit_shift = addr_in_bits & 7;
|
|
||||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
|
|
||||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
|
||||||
*output_val = val_shifted & mask;
|
|
||||||
addr_in_bits += num_bits;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -154,7 +129,7 @@ mod test {
|
|||||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||||
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
||||||
for (i, val) in vals.iter().enumerate() {
|
for (i, val) in vals.iter().enumerate() {
|
||||||
assert_eq!(bitunpacker.get(i), *val);
|
assert_eq!(bitunpacker.get(i as u64), *val);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -166,17 +141,4 @@ mod test {
|
|||||||
test_bitpacker_util(6, 14);
|
test_bitpacker_util(6, 14);
|
||||||
test_bitpacker_util(1000, 14);
|
test_bitpacker_util(1000, 14);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_bitpacker_range() {
|
|
||||||
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
|
|
||||||
let buffer_len = 100;
|
|
||||||
let mut buffer = vec![0u64; buffer_len];
|
|
||||||
for start in vec![0, 10, 20, 100, 1_000] {
|
|
||||||
bitunpacker.get_range(start as u32, &mut buffer[..]);
|
|
||||||
for i in 0..buffer_len {
|
|
||||||
assert_eq!(buffer[i], vals[start + i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ impl BinarySerializable for FileAddr {
|
|||||||
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||||
pub struct CompositeWrite<W = WritePtr> {
|
pub struct CompositeWrite<W = WritePtr> {
|
||||||
write: CountingWriter<W>,
|
write: CountingWriter<W>,
|
||||||
offsets: HashMap<FileAddr, usize>,
|
offsets: HashMap<FileAddr, u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: Write> CompositeWrite<W> {
|
impl<W: Write> CompositeWrite<W> {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use std::io::Write;
|
|||||||
|
|
||||||
pub struct CountingWriter<W> {
|
pub struct CountingWriter<W> {
|
||||||
underlying: W,
|
underlying: W,
|
||||||
written_bytes: usize,
|
written_bytes: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: Write> CountingWriter<W> {
|
impl<W: Write> CountingWriter<W> {
|
||||||
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn written_bytes(&self) -> usize {
|
pub fn written_bytes(&self) -> u64 {
|
||||||
self.written_bytes
|
self.written_bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finish(mut self) -> io::Result<(W, usize)> {
|
pub fn finish(mut self) -> io::Result<(W, u64)> {
|
||||||
self.flush()?;
|
self.flush()?;
|
||||||
Ok((self.underlying, self.written_bytes))
|
Ok((self.underlying, self.written_bytes))
|
||||||
}
|
}
|
||||||
@@ -27,10 +27,16 @@ impl<W: Write> CountingWriter<W> {
|
|||||||
impl<W: Write> Write for CountingWriter<W> {
|
impl<W: Write> Write for CountingWriter<W> {
|
||||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||||
let written_size = self.underlying.write(buf)?;
|
let written_size = self.underlying.write(buf)?;
|
||||||
self.written_bytes += written_size;
|
self.written_bytes += written_size as u64;
|
||||||
Ok(written_size)
|
Ok(written_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||||
|
self.underlying.write_all(buf)?;
|
||||||
|
self.written_bytes += buf.len() as u64;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn flush(&mut self) -> io::Result<()> {
|
fn flush(&mut self) -> io::Result<()> {
|
||||||
self.underlying.flush()
|
self.underlying.flush()
|
||||||
}
|
}
|
||||||
@@ -48,8 +54,8 @@ mod test {
|
|||||||
let mut counting_writer = CountingWriter::wrap(buffer);
|
let mut counting_writer = CountingWriter::wrap(buffer);
|
||||||
let bytes = (0u8..10u8).collect::<Vec<u8>>();
|
let bytes = (0u8..10u8).collect::<Vec<u8>>();
|
||||||
counting_writer.write_all(&bytes).unwrap();
|
counting_writer.write_all(&bytes).unwrap();
|
||||||
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
|
let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
|
||||||
assert_eq!(len, 10);
|
assert_eq!(len, 10u64);
|
||||||
assert_eq!(w.len(), 10);
|
assert_eq!(w.len(), 10);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ pub(crate) use self::bitset::TinySet;
|
|||||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||||
pub use self::counting_writer::CountingWriter;
|
pub use self::counting_writer::CountingWriter;
|
||||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||||
pub use self::vint::VInt;
|
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
|
||||||
pub use byteorder::LittleEndian as Endianness;
|
pub use byteorder::LittleEndian as Endianness;
|
||||||
|
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use super::BinarySerializable;
|
use super::BinarySerializable;
|
||||||
|
use byteorder::{ByteOrder, LittleEndian};
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
@@ -9,6 +10,100 @@ pub struct VInt(pub u64);
|
|||||||
|
|
||||||
const STOP_BIT: u8 = 128;
|
const STOP_BIT: u8 = 128;
|
||||||
|
|
||||||
|
pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
|
||||||
|
const START_2: u64 = 1 << 7;
|
||||||
|
const START_3: u64 = 1 << 14;
|
||||||
|
const START_4: u64 = 1 << 21;
|
||||||
|
const START_5: u64 = 1 << 28;
|
||||||
|
|
||||||
|
const STOP_1: u64 = START_2 - 1;
|
||||||
|
const STOP_2: u64 = START_3 - 1;
|
||||||
|
const STOP_3: u64 = START_4 - 1;
|
||||||
|
const STOP_4: u64 = START_5 - 1;
|
||||||
|
|
||||||
|
const MASK_1: u64 = 127;
|
||||||
|
const MASK_2: u64 = MASK_1 << 7;
|
||||||
|
const MASK_3: u64 = MASK_2 << 7;
|
||||||
|
const MASK_4: u64 = MASK_3 << 7;
|
||||||
|
const MASK_5: u64 = MASK_4 << 7;
|
||||||
|
|
||||||
|
let val = u64::from(val);
|
||||||
|
const STOP_BIT: u64 = 128u64;
|
||||||
|
match val {
|
||||||
|
0...STOP_1 => (val | STOP_BIT, 1),
|
||||||
|
START_2...STOP_2 => (
|
||||||
|
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
|
||||||
|
2,
|
||||||
|
),
|
||||||
|
START_3...STOP_3 => (
|
||||||
|
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
|
||||||
|
3,
|
||||||
|
),
|
||||||
|
START_4...STOP_4 => (
|
||||||
|
(val & MASK_1)
|
||||||
|
| ((val & MASK_2) << 1)
|
||||||
|
| ((val & MASK_3) << 2)
|
||||||
|
| ((val & MASK_4) << 3)
|
||||||
|
| (STOP_BIT << (8 * 3)),
|
||||||
|
4,
|
||||||
|
),
|
||||||
|
_ => (
|
||||||
|
(val & MASK_1)
|
||||||
|
| ((val & MASK_2) << 1)
|
||||||
|
| ((val & MASK_3) << 2)
|
||||||
|
| ((val & MASK_4) << 3)
|
||||||
|
| ((val & MASK_5) << 4)
|
||||||
|
| (STOP_BIT << (8 * 4)),
|
||||||
|
5,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of bytes covered by a
|
||||||
|
/// serialized vint `u32`.
|
||||||
|
///
|
||||||
|
/// Expects a buffer data that starts
|
||||||
|
/// with the serialized `vint`, scans at most 5 bytes ahead until
|
||||||
|
/// it finds the vint final byte.
|
||||||
|
///
|
||||||
|
/// # May Panic
|
||||||
|
/// If the payload does not start with a valid `vint`
|
||||||
|
fn vint_len(data: &[u8]) -> usize {
|
||||||
|
for (i, &val) in data.iter().enumerate().take(5) {
|
||||||
|
if val >= STOP_BIT {
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
panic!("Corrupted data. Invalid VInt 32");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads a vint `u32` from a buffer, and
|
||||||
|
/// consumes its payload data.
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// If the buffer does not start with a valid
|
||||||
|
/// vint payload
|
||||||
|
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
|
||||||
|
let vlen = vint_len(*data);
|
||||||
|
let mut result = 0u32;
|
||||||
|
let mut shift = 0u64;
|
||||||
|
for &b in &data[..vlen] {
|
||||||
|
result |= u32::from(b & 127u8) << shift;
|
||||||
|
shift += 7;
|
||||||
|
}
|
||||||
|
*data = &data[vlen..];
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write a `u32` as a vint payload.
|
||||||
|
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
|
||||||
|
let (val, num_bytes) = serialize_vint_u32(val);
|
||||||
|
let mut buffer = [0u8; 8];
|
||||||
|
LittleEndian::write_u64(&mut buffer, val);
|
||||||
|
writer.write_all(&buffer[..num_bytes])
|
||||||
|
}
|
||||||
|
|
||||||
impl VInt {
|
impl VInt {
|
||||||
pub fn val(&self) -> u64 {
|
pub fn val(&self) -> u64 {
|
||||||
self.0
|
self.0
|
||||||
@@ -24,7 +119,7 @@ impl VInt {
|
|||||||
output.extend(&buffer[0..num_bytes]);
|
output.extend(&buffer[0..num_bytes]);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
|
pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
|
||||||
let mut remaining = self.0;
|
let mut remaining = self.0;
|
||||||
for (i, b) in buffer.iter_mut().enumerate() {
|
for (i, b) in buffer.iter_mut().enumerate() {
|
||||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||||
@@ -64,7 +159,7 @@ impl BinarySerializable for VInt {
|
|||||||
return Err(io::Error::new(
|
return Err(io::Error::new(
|
||||||
io::ErrorKind::InvalidData,
|
io::ErrorKind::InvalidData,
|
||||||
"Reach end of buffer while reading VInt",
|
"Reach end of buffer while reading VInt",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -74,7 +169,9 @@ impl BinarySerializable for VInt {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
|
use super::serialize_vint_u32;
|
||||||
use super::VInt;
|
use super::VInt;
|
||||||
|
use byteorder::{ByteOrder, LittleEndian};
|
||||||
use common::BinarySerializable;
|
use common::BinarySerializable;
|
||||||
|
|
||||||
fn aux_test_vint(val: u64) {
|
fn aux_test_vint(val: u64) {
|
||||||
@@ -108,4 +205,28 @@ mod tests {
|
|||||||
}
|
}
|
||||||
aux_test_vint(10);
|
aux_test_vint(10);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn aux_test_serialize_vint_u32(val: u32) {
|
||||||
|
let mut buffer = [0u8; 10];
|
||||||
|
let mut buffer2 = [0u8; 10];
|
||||||
|
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
|
||||||
|
let (vint, len) = serialize_vint_u32(val);
|
||||||
|
assert_eq!(len, len_vint, "len wrong for val {}", val);
|
||||||
|
LittleEndian::write_u64(&mut buffer2, vint);
|
||||||
|
assert_eq!(&buffer[..len], &buffer2[..len], "array wrong for {}", val);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_vint_u32() {
|
||||||
|
aux_test_serialize_vint_u32(0);
|
||||||
|
aux_test_serialize_vint_u32(1);
|
||||||
|
aux_test_serialize_vint_u32(5);
|
||||||
|
for i in 1..3 {
|
||||||
|
let power_of_128 = 1u32 << (7 * i);
|
||||||
|
aux_test_serialize_vint_u32(power_of_128 - 1u32);
|
||||||
|
aux_test_serialize_vint_u32(power_of_128);
|
||||||
|
aux_test_serialize_vint_u32(power_of_128 + 1u32);
|
||||||
|
}
|
||||||
|
aux_test_serialize_vint_u32(u32::max_value());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ impl Executor {
|
|||||||
// This is important as it makes it possible for the fruit_receiver iteration to
|
// This is important as it makes it possible for the fruit_receiver iteration to
|
||||||
// terminate.
|
// terminate.
|
||||||
};
|
};
|
||||||
// This is lame, but it does not use unsafe code.
|
// This is lame, but safe.
|
||||||
let mut results_with_position = Vec::with_capacity(num_fruits);
|
let mut results_with_position = Vec::with_capacity(num_fruits);
|
||||||
for (pos, fruit_res) in fruit_receiver {
|
for (pos, fruit_res) in fruit_receiver {
|
||||||
let fruit = fruit_res?;
|
let fruit = fruit_res?;
|
||||||
|
|||||||
@@ -12,13 +12,14 @@ use core::META_FILEPATH;
|
|||||||
use directory::ManagedDirectory;
|
use directory::ManagedDirectory;
|
||||||
#[cfg(feature = "mmap")]
|
#[cfg(feature = "mmap")]
|
||||||
use directory::MmapDirectory;
|
use directory::MmapDirectory;
|
||||||
|
use directory::INDEX_WRITER_LOCK;
|
||||||
|
use directory::META_LOCK;
|
||||||
use directory::{Directory, RAMDirectory};
|
use directory::{Directory, RAMDirectory};
|
||||||
use error::DataCorruption;
|
use error::DataCorruption;
|
||||||
use error::TantivyError;
|
use error::TantivyError;
|
||||||
use indexer::index_writer::open_index_writer;
|
use indexer::index_writer::open_index_writer;
|
||||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||||
use indexer::segment_updater::save_new_metas;
|
use indexer::segment_updater::save_new_metas;
|
||||||
use indexer::LockType;
|
|
||||||
use num_cpus;
|
use num_cpus;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use schema::FieldType;
|
use schema::FieldType;
|
||||||
@@ -110,7 +111,6 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Opens or creates a new index in the provided directory
|
/// Opens or creates a new index in the provided directory
|
||||||
#[cfg(feature = "mmap")]
|
|
||||||
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||||
if Index::exists(&dir) {
|
if Index::exists(&dir) {
|
||||||
let index = Index::open(dir)?;
|
let index = Index::open(dir)?;
|
||||||
@@ -150,7 +150,7 @@ impl Index {
|
|||||||
///
|
///
|
||||||
/// This will overwrite existing meta.json
|
/// This will overwrite existing meta.json
|
||||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
save_new_metas(schema.clone(), directory.borrow_mut())?;
|
||||||
let metas = IndexMeta::with_schema(schema);
|
let metas = IndexMeta::with_schema(schema);
|
||||||
Index::create_from_metas(directory, &metas)
|
Index::create_from_metas(directory, &metas)
|
||||||
}
|
}
|
||||||
@@ -232,7 +232,8 @@ impl Index {
|
|||||||
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
|
||||||
///
|
///
|
||||||
/// # Errors
|
/// # Errors
|
||||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
|
||||||
|
///
|
||||||
/// # Panics
|
/// # Panics
|
||||||
/// If the heap size per thread is too small, panics.
|
/// If the heap size per thread is too small, panics.
|
||||||
pub fn writer_with_num_threads(
|
pub fn writer_with_num_threads(
|
||||||
@@ -240,7 +241,21 @@ impl Index {
|
|||||||
num_threads: usize,
|
num_threads: usize,
|
||||||
overall_heap_size_in_bytes: usize,
|
overall_heap_size_in_bytes: usize,
|
||||||
) -> Result<IndexWriter> {
|
) -> Result<IndexWriter> {
|
||||||
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
|
let directory_lock = self
|
||||||
|
.directory
|
||||||
|
.acquire_lock(&INDEX_WRITER_LOCK)
|
||||||
|
.map_err(|err| {
|
||||||
|
TantivyError::LockFailure(
|
||||||
|
err,
|
||||||
|
Some(
|
||||||
|
"Failed to acquire index lock. If you are using\
|
||||||
|
a regular directory, this means there is already an \
|
||||||
|
`IndexWriter` working on this `Directory`, in this process \
|
||||||
|
or in a different process."
|
||||||
|
.to_string(),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||||
open_index_writer(
|
open_index_writer(
|
||||||
self,
|
self,
|
||||||
@@ -339,7 +354,7 @@ impl Index {
|
|||||||
/// get the freshest `index` at all time, is to watch `meta.json` and
|
/// get the freshest `index` at all time, is to watch `meta.json` and
|
||||||
/// call `load_searchers` whenever a change happens.
|
/// call `load_searchers` whenever a change happens.
|
||||||
pub fn load_searchers(&self) -> Result<()> {
|
pub fn load_searchers(&self) -> Result<()> {
|
||||||
let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
|
let _meta_lock = self.directory().acquire_lock(&META_LOCK)?;
|
||||||
let searchable_segments = self.searchable_segments()?;
|
let searchable_segments = self.searchable_segments()?;
|
||||||
let segment_readers: Vec<SegmentReader> = searchable_segments
|
let segment_readers: Vec<SegmentReader> = searchable_segments
|
||||||
.iter()
|
.iter()
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use crossbeam::queue::MsQueue;
|
use crossbeam::queue::MsQueue;
|
||||||
use std::mem;
|
|
||||||
use std::ops::{Deref, DerefMut};
|
use std::ops::{Deref, DerefMut};
|
||||||
use std::sync::atomic::AtomicUsize;
|
use std::sync::atomic::AtomicUsize;
|
||||||
use std::sync::atomic::Ordering;
|
use std::sync::atomic::Ordering;
|
||||||
@@ -10,6 +9,12 @@ pub struct GenerationItem<T> {
|
|||||||
item: T,
|
item: T,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// An object pool
|
||||||
|
///
|
||||||
|
/// This is used in tantivy to create a pool of `Searcher`.
|
||||||
|
/// Objects are wrapped in a `LeasedItem` wrapper and are
|
||||||
|
/// released automatically back into the pool on `Drop`.
|
||||||
pub struct Pool<T> {
|
pub struct Pool<T> {
|
||||||
queue: Arc<MsQueue<GenerationItem<T>>>,
|
queue: Arc<MsQueue<GenerationItem<T>>>,
|
||||||
freshest_generation: AtomicUsize,
|
freshest_generation: AtomicUsize,
|
||||||
@@ -26,6 +31,10 @@ impl<T> Pool<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Publishes a new generation of `Searcher`.
|
||||||
|
///
|
||||||
|
/// After publish, all new `Searcher` acquired will be
|
||||||
|
/// of the new generation.
|
||||||
pub fn publish_new_generation(&self, items: Vec<T>) {
|
pub fn publish_new_generation(&self, items: Vec<T>) {
|
||||||
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
|
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
|
||||||
for item in items {
|
for item in items {
|
||||||
@@ -61,6 +70,10 @@ impl<T> Pool<T> {
|
|||||||
self.freshest_generation.load(Ordering::Acquire)
|
self.freshest_generation.load(Ordering::Acquire)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Acquires a new searcher.
|
||||||
|
///
|
||||||
|
/// If no searcher is available, this method blocks until
|
||||||
|
/// a searcher is released.
|
||||||
pub fn acquire(&self) -> LeasedItem<T> {
|
pub fn acquire(&self) -> LeasedItem<T> {
|
||||||
let generation = self.generation();
|
let generation = self.generation();
|
||||||
loop {
|
loop {
|
||||||
@@ -107,9 +120,9 @@ impl<T> DerefMut for LeasedItem<T> {
|
|||||||
|
|
||||||
impl<T> Drop for LeasedItem<T> {
|
impl<T> Drop for LeasedItem<T> {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
|
if let Some(gen_item) = self.gen_item.take() {
|
||||||
.expect("Unwrapping a leased item should never fail");
|
self.recycle_queue.push(gen_item);
|
||||||
self.recycle_queue.push(gen_item);
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -41,6 +41,6 @@ impl SegmentComponent {
|
|||||||
SegmentComponent::STORE,
|
SegmentComponent::STORE,
|
||||||
SegmentComponent::DELETE,
|
SegmentComponent::DELETE,
|
||||||
];
|
];
|
||||||
SEGMENT_COMPONENTS.into_iter()
|
SEGMENT_COMPONENTS.iter()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,11 +1,102 @@
|
|||||||
|
use directory::directory_lock::Lock;
|
||||||
|
use directory::error::LockError;
|
||||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||||
use directory::{ReadOnlySource, WritePtr};
|
use directory::{ReadOnlySource, WritePtr};
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
use std::io::Write;
|
||||||
use std::marker::Send;
|
use std::marker::Send;
|
||||||
use std::marker::Sync;
|
use std::marker::Sync;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
use std::path::PathBuf;
|
||||||
use std::result;
|
use std::result;
|
||||||
|
use std::thread;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// The retry logic for acquiring locks is pretty simple.
|
||||||
|
/// We just retry `n` times after a given `duration`, both
|
||||||
|
/// depending on the type of lock.
|
||||||
|
struct RetryPolicy {
|
||||||
|
num_retries: usize,
|
||||||
|
wait_in_ms: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RetryPolicy {
|
||||||
|
fn no_retry() -> RetryPolicy {
|
||||||
|
RetryPolicy {
|
||||||
|
num_retries: 0,
|
||||||
|
wait_in_ms: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn wait_and_retry(&mut self) -> bool {
|
||||||
|
if self.num_retries == 0 {
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
self.num_retries -= 1;
|
||||||
|
let wait_duration = Duration::from_millis(self.wait_in_ms);
|
||||||
|
thread::sleep(wait_duration);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The `DirectoryLock` is an object that represents a file lock.
|
||||||
|
/// See [`LockType`](struct.LockType.html)
|
||||||
|
///
|
||||||
|
/// It is transparently associated to a lock file, that gets deleted
|
||||||
|
/// on `Drop`. The lock is released automatically on `Drop`.
|
||||||
|
pub struct DirectoryLock(Box<Drop + Send + 'static>);
|
||||||
|
|
||||||
|
struct DirectoryLockGuard {
|
||||||
|
directory: Box<Directory>,
|
||||||
|
path: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
|
||||||
|
fn from(underlying: Box<T>) -> Self {
|
||||||
|
DirectoryLock(underlying)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for DirectoryLockGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if let Err(e) = self.directory.delete(&*self.path) {
|
||||||
|
error!("Failed to remove the lock file. {:?}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
enum TryAcquireLockError {
|
||||||
|
FileExists,
|
||||||
|
IOError(io::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_acquire_lock(
|
||||||
|
filepath: &Path,
|
||||||
|
directory: &mut Directory,
|
||||||
|
) -> Result<DirectoryLock, TryAcquireLockError> {
|
||||||
|
let mut write = directory.open_write(filepath).map_err(|e| match e {
|
||||||
|
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
|
||||||
|
OpenWriteError::IOError(io_error) => TryAcquireLockError::IOError(io_error.into()),
|
||||||
|
})?;
|
||||||
|
write.flush().map_err(TryAcquireLockError::IOError)?;
|
||||||
|
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
|
||||||
|
directory: directory.box_clone(),
|
||||||
|
path: filepath.to_owned(),
|
||||||
|
})))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn retry_policy(is_blocking: bool) -> RetryPolicy {
|
||||||
|
if is_blocking {
|
||||||
|
RetryPolicy {
|
||||||
|
num_retries: 100,
|
||||||
|
wait_in_ms: 100,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
RetryPolicy::no_retry()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Write-once read many (WORM) abstraction for where
|
/// Write-once read many (WORM) abstraction for where
|
||||||
/// tantivy's data should be stored.
|
/// tantivy's data should be stored.
|
||||||
@@ -73,6 +164,29 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
|||||||
///
|
///
|
||||||
/// The file may or may not previously exist.
|
/// The file may or may not previously exist.
|
||||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
|
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
|
||||||
|
|
||||||
|
/// Acquire a lock in the given directory.
|
||||||
|
///
|
||||||
|
/// The method is blocking or not depending on the `Lock` object.
|
||||||
|
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
|
||||||
|
let mut box_directory = self.box_clone();
|
||||||
|
let mut retry_policy = retry_policy(lock.is_blocking);
|
||||||
|
loop {
|
||||||
|
match try_acquire_lock(&lock.filepath, &mut *box_directory) {
|
||||||
|
Ok(result) => {
|
||||||
|
return Ok(result);
|
||||||
|
}
|
||||||
|
Err(TryAcquireLockError::FileExists) => {
|
||||||
|
if !retry_policy.wait_and_retry() {
|
||||||
|
return Err(LockError::LockBusy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(TryAcquireLockError::IOError(io_error)) => {
|
||||||
|
return Err(LockError::IOError(io_error));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// DirectoryClone
|
/// DirectoryClone
|
||||||
|
|||||||
56
src/directory/directory_lock.rs
Normal file
56
src/directory/directory_lock.rs
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
/// A directory lock.
|
||||||
|
///
|
||||||
|
/// A lock is associated to a specific path and some
|
||||||
|
/// [`LockParams`](./enum.LockParams.html).
|
||||||
|
/// Tantivy itself uses only two locks but client application
|
||||||
|
/// can use the directory facility to define their own locks.
|
||||||
|
/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
|
||||||
|
/// - [META_LOCK](./struct.META_LOCK.html)
|
||||||
|
///
|
||||||
|
/// Check out these locks documentation for more information.
|
||||||
|
///
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Lock {
|
||||||
|
/// The lock needs to be associated with its own file `path`.
|
||||||
|
/// Depending on the platform, the lock might rely on the creation
|
||||||
|
/// and deletion of this filepath.
|
||||||
|
pub filepath: PathBuf,
|
||||||
|
/// `is_blocking` describes whether acquiring the lock is meant
|
||||||
|
/// to be a blocking operation or a non-blocking.
|
||||||
|
///
|
||||||
|
/// Acquiring a blocking lock blocks until the lock is
|
||||||
|
/// available.
|
||||||
|
/// Acquiring a non-blocking lock returns rapidly, either successfully
|
||||||
|
/// or with an error signifying that someone is already holding
|
||||||
|
/// the lock.
|
||||||
|
pub is_blocking: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
/// Only one process should be able to write tantivy's index at a time.
|
||||||
|
/// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
|
||||||
|
///
|
||||||
|
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||||
|
///
|
||||||
|
/// Failing to acquire this lock usually means a misuse of tantivy's API,
|
||||||
|
/// (creating more than one instance of the `IndexWriter`), or a spurious
|
||||||
|
/// lock file remaining after a crash. In the latter case, removing the file after
|
||||||
|
/// checking no process running tantivy is running is safe.
|
||||||
|
pub static ref INDEX_WRITER_LOCK: Lock = Lock {
|
||||||
|
filepath: PathBuf::from(".tantivy-writer.lock"),
|
||||||
|
is_blocking: false
|
||||||
|
};
|
||||||
|
/// The meta lock file is here to protect the segment files being opened by
|
||||||
|
/// `.load_searchers()` from being garbage collected.
|
||||||
|
/// It makes it possible for another process to safely consume
|
||||||
|
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
|
||||||
|
/// here, but it is difficult to achieve on Windows.
|
||||||
|
///
|
||||||
|
/// Opening segment readers is a very fast process.
|
||||||
|
pub static ref META_LOCK: Lock = Lock {
|
||||||
|
filepath: PathBuf::from(".tantivy-meta.lock"),
|
||||||
|
is_blocking: true
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -3,6 +3,22 @@ use std::fmt;
|
|||||||
use std::io;
|
use std::io;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
/// Error while trying to acquire a directory lock.
|
||||||
|
#[derive(Debug, Fail)]
|
||||||
|
pub enum LockError {
|
||||||
|
/// Failed to acquire a lock as it is already held by another
|
||||||
|
/// client.
|
||||||
|
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
|
||||||
|
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||||
|
#[fail(
|
||||||
|
display = "Could not acquire lock as it is already held, possibly by a different process."
|
||||||
|
)]
|
||||||
|
LockBusy,
|
||||||
|
/// Trying to acquire a lock failed with an `IOError`
|
||||||
|
#[fail(display = "Failed to acquire the lock due to an io:Error.")]
|
||||||
|
IOError(io::Error),
|
||||||
|
}
|
||||||
|
|
||||||
/// General IO error with an optional path to the offending file.
|
/// General IO error with an optional path to the offending file.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct IOError {
|
pub struct IOError {
|
||||||
@@ -10,6 +26,12 @@ pub struct IOError {
|
|||||||
err: io::Error,
|
err: io::Error,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Into<io::Error> for IOError {
|
||||||
|
fn into(self) -> io::Error {
|
||||||
|
self.err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl fmt::Display for IOError {
|
impl fmt::Display for IOError {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
match self.path {
|
match self.path {
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
use core::MANAGED_FILEPATH;
|
use core::MANAGED_FILEPATH;
|
||||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
use directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
|
||||||
|
use directory::DirectoryLock;
|
||||||
|
use directory::Lock;
|
||||||
|
use directory::META_LOCK;
|
||||||
use directory::{ReadOnlySource, WritePtr};
|
use directory::{ReadOnlySource, WritePtr};
|
||||||
use error::DataCorruption;
|
use error::DataCorruption;
|
||||||
use indexer::LockType;
|
|
||||||
use serde_json;
|
use serde_json;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::io;
|
use std::io;
|
||||||
@@ -92,6 +94,9 @@ impl ManagedDirectory {
|
|||||||
///
|
///
|
||||||
/// * `living_files` - List of files that are still used by the index.
|
/// * `living_files` - List of files that are still used by the index.
|
||||||
///
|
///
|
||||||
|
/// The use of a callback ensures that the list of living_files is computed
|
||||||
|
/// while we hold the lock on meta.
|
||||||
|
///
|
||||||
/// This method neither panics nor returns errors.
|
/// This method neither panics nor returns errors.
|
||||||
/// If a file cannot be deleted (for permission reasons for instance)
|
/// If a file cannot be deleted (for permission reasons for instance)
|
||||||
/// an error is simply logged, and the file remains in the list of managed
|
/// an error is simply logged, and the file remains in the list of managed
|
||||||
@@ -122,7 +127,7 @@ impl ManagedDirectory {
|
|||||||
// 2) writer change meta.json (for instance after a merge or a commit)
|
// 2) writer change meta.json (for instance after a merge or a commit)
|
||||||
// 3) gc kicks in.
|
// 3) gc kicks in.
|
||||||
// 4) gc removes a file that was useful for process B, before process B opened it.
|
// 4) gc removes a file that was useful for process B, before process B opened it.
|
||||||
if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
|
if let Ok(_meta_lock) = self.acquire_lock(&META_LOCK) {
|
||||||
let living_files = get_living_files();
|
let living_files = get_living_files();
|
||||||
for managed_path in &meta_informations_rlock.managed_paths {
|
for managed_path in &meta_informations_rlock.managed_paths {
|
||||||
if !living_files.contains(managed_path) {
|
if !living_files.contains(managed_path) {
|
||||||
@@ -232,6 +237,10 @@ impl Directory for ManagedDirectory {
|
|||||||
fn exists(&self, path: &Path) -> bool {
|
fn exists(&self, path: &Path) -> bool {
|
||||||
self.directory.exists(path)
|
self.directory.exists(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn acquire_lock(&self, lock: &Lock) -> result::Result<DirectoryLock, LockError> {
|
||||||
|
self.directory.acquire_lock(lock)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for ManagedDirectory {
|
impl Clone for ManagedDirectory {
|
||||||
|
|||||||
@@ -1,8 +1,14 @@
|
|||||||
|
extern crate fs2;
|
||||||
|
|
||||||
|
use self::fs2::FileExt;
|
||||||
use atomicwrites;
|
use atomicwrites;
|
||||||
use common::make_io_err;
|
use common::make_io_err;
|
||||||
|
use directory::error::LockError;
|
||||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||||
use directory::shared_vec_slice::SharedVecSlice;
|
use directory::shared_vec_slice::SharedVecSlice;
|
||||||
use directory::Directory;
|
use directory::Directory;
|
||||||
|
use directory::DirectoryLock;
|
||||||
|
use directory::Lock;
|
||||||
use directory::ReadOnlySource;
|
use directory::ReadOnlySource;
|
||||||
use directory::WritePtr;
|
use directory::WritePtr;
|
||||||
use fst::raw::MmapReadOnly;
|
use fst::raw::MmapReadOnly;
|
||||||
@@ -115,6 +121,14 @@ impl MmapCache {
|
|||||||
///
|
///
|
||||||
/// The Mmap object are cached to limit the
|
/// The Mmap object are cached to limit the
|
||||||
/// system calls.
|
/// system calls.
|
||||||
|
///
|
||||||
|
/// In the `MmapDirectory`, locks are implemented using the `fs2` crate definition of locks.
|
||||||
|
///
|
||||||
|
/// On MacOS & linux, it relies on `flock` (aka `BSD Lock`). These locks solve most of the
|
||||||
|
/// problems related to POSIX Locks, but their contract may not be respected on `NFS`
|
||||||
|
/// depending on the implementation.
|
||||||
|
///
|
||||||
|
/// On Windows the semantics are again different.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct MmapDirectory {
|
pub struct MmapDirectory {
|
||||||
root_path: PathBuf,
|
root_path: PathBuf,
|
||||||
@@ -213,6 +227,21 @@ impl MmapDirectory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// We rely on fs2 for file locking. On Linux & MacOS this
|
||||||
|
/// uses BSD locks (`flock`). The lock is actually released when
|
||||||
|
/// the `File` object is dropped and its associated file descriptor
|
||||||
|
/// is closed.
|
||||||
|
struct ReleaseLockFile {
|
||||||
|
_file: File,
|
||||||
|
path: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for ReleaseLockFile {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
debug!("Releasing lock {:?}", self.path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// This Write wraps a File, but has the specificity of
|
/// This Write wraps a File, but has the specificity of
|
||||||
/// call `sync_all` on flush.
|
/// call `sync_all` on flush.
|
||||||
struct SafeFileWriter(File);
|
struct SafeFileWriter(File);
|
||||||
@@ -354,6 +383,26 @@ impl Directory for MmapDirectory {
|
|||||||
meta_file.write(|f| f.write_all(data))?;
|
meta_file.write(|f| f.write_all(data))?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
|
||||||
|
let full_path = self.resolve_path(&lock.filepath);
|
||||||
|
// We make sure that the file exists.
|
||||||
|
let file: File = OpenOptions::new()
|
||||||
|
.write(true)
|
||||||
|
.create(true) //< if the file does not exist yet, create it.
|
||||||
|
.open(&full_path)
|
||||||
|
.map_err(LockError::IOError)?;
|
||||||
|
if lock.is_blocking {
|
||||||
|
file.lock_exclusive().map_err(LockError::IOError)?;
|
||||||
|
} else {
|
||||||
|
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
|
||||||
|
}
|
||||||
|
// dropping the file handle will release the lock.
|
||||||
|
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
|
||||||
|
path: lock.filepath.clone(),
|
||||||
|
_file: file,
|
||||||
|
})))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ WORM directory abstraction.
|
|||||||
mod mmap_directory;
|
mod mmap_directory;
|
||||||
|
|
||||||
mod directory;
|
mod directory;
|
||||||
|
mod directory_lock;
|
||||||
mod managed_directory;
|
mod managed_directory;
|
||||||
mod ram_directory;
|
mod ram_directory;
|
||||||
mod read_only_source;
|
mod read_only_source;
|
||||||
@@ -16,11 +17,12 @@ mod shared_vec_slice;
|
|||||||
/// Errors specific to the directory module.
|
/// Errors specific to the directory module.
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
|
||||||
use std::io::{BufWriter, Seek, Write};
|
pub use self::directory::DirectoryLock;
|
||||||
|
|
||||||
pub use self::directory::{Directory, DirectoryClone};
|
pub use self::directory::{Directory, DirectoryClone};
|
||||||
|
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||||
pub use self::ram_directory::RAMDirectory;
|
pub use self::ram_directory::RAMDirectory;
|
||||||
pub use self::read_only_source::ReadOnlySource;
|
pub use self::read_only_source::ReadOnlySource;
|
||||||
|
use std::io::{BufWriter, Seek, Write};
|
||||||
|
|
||||||
#[cfg(feature = "mmap")]
|
#[cfg(feature = "mmap")]
|
||||||
pub use self::mmap_directory::MmapDirectory;
|
pub use self::mmap_directory::MmapDirectory;
|
||||||
@@ -38,128 +40,4 @@ impl<T: Seek + Write> SeekableWrite for T {}
|
|||||||
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests;
|
||||||
|
|
||||||
use super::*;
|
|
||||||
use std::io::{Seek, SeekFrom, Write};
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
lazy_static! {
|
|
||||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ram_directory() {
|
|
||||||
let mut ram_directory = RAMDirectory::create();
|
|
||||||
test_directory(&mut ram_directory);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[cfg(feature = "mmap")]
|
|
||||||
fn test_mmap_directory() {
|
|
||||||
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
|
||||||
test_directory(&mut mmap_directory);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[should_panic]
|
|
||||||
fn ram_directory_panics_if_flush_forgotten() {
|
|
||||||
let mut ram_directory = RAMDirectory::create();
|
|
||||||
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
assert!(write_file.write_all(&[4]).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_simple(directory: &mut Directory) {
|
|
||||||
{
|
|
||||||
{
|
|
||||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
assert!(directory.exists(*TEST_PATH));
|
|
||||||
write_file.write_all(&[4]).unwrap();
|
|
||||||
write_file.write_all(&[3]).unwrap();
|
|
||||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
|
||||||
write_file.flush().unwrap();
|
|
||||||
}
|
|
||||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
|
||||||
let data: &[u8] = &*read_file;
|
|
||||||
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
assert!(!directory.exists(*TEST_PATH));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_seek(directory: &mut Directory) {
|
|
||||||
{
|
|
||||||
{
|
|
||||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
|
||||||
write_file.seek(SeekFrom::Start(0)).unwrap();
|
|
||||||
write_file.write_all(&[3, 1]).unwrap();
|
|
||||||
write_file.flush().unwrap();
|
|
||||||
}
|
|
||||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
|
||||||
let data: &[u8] = &*read_file;
|
|
||||||
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_rewrite_forbidden(directory: &mut Directory) {
|
|
||||||
{
|
|
||||||
directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
assert!(directory.exists(*TEST_PATH));
|
|
||||||
}
|
|
||||||
{
|
|
||||||
assert!(directory.open_write(*TEST_PATH).is_err());
|
|
||||||
}
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_write_create_the_file(directory: &mut Directory) {
|
|
||||||
{
|
|
||||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
|
||||||
let _w = directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
assert!(directory.exists(*TEST_PATH));
|
|
||||||
assert!(directory.open_read(*TEST_PATH).is_ok());
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_directory_delete(directory: &mut Directory) {
|
|
||||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
|
||||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
write_file.write_all(&[1, 2, 3, 4]).unwrap();
|
|
||||||
write_file.flush().unwrap();
|
|
||||||
{
|
|
||||||
let read_handle = directory.open_read(*TEST_PATH).unwrap();
|
|
||||||
{
|
|
||||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
|
||||||
|
|
||||||
// Mapped files can't be deleted on Windows
|
|
||||||
if !cfg!(windows) {
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg!(windows) {
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_err());
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_directory(directory: &mut Directory) {
|
|
||||||
test_simple(directory);
|
|
||||||
test_seek(directory);
|
|
||||||
test_rewrite_forbidden(directory);
|
|
||||||
test_write_create_the_file(directory);
|
|
||||||
test_directory_delete(directory);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|||||||
182
src/directory/tests.rs
Normal file
182
src/directory/tests.rs
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
use super::*;
|
||||||
|
use std::io::{Seek, SeekFrom, Write};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::time;
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_ram_directory() {
|
||||||
|
let mut ram_directory = RAMDirectory::create();
|
||||||
|
test_directory(&mut ram_directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[cfg(feature = "mmap")]
|
||||||
|
fn test_mmap_directory() {
|
||||||
|
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||||
|
test_directory(&mut mmap_directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn ram_directory_panics_if_flush_forgotten() {
|
||||||
|
let mut ram_directory = RAMDirectory::create();
|
||||||
|
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(write_file.write_all(&[4]).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_simple(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
{
|
||||||
|
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(directory.exists(*TEST_PATH));
|
||||||
|
write_file.write_all(&[4]).unwrap();
|
||||||
|
write_file.write_all(&[3]).unwrap();
|
||||||
|
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||||
|
write_file.flush().unwrap();
|
||||||
|
}
|
||||||
|
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||||
|
let data: &[u8] = &*read_file;
|
||||||
|
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
assert!(!directory.exists(*TEST_PATH));
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_seek(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
{
|
||||||
|
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
||||||
|
write_file.seek(SeekFrom::Start(0)).unwrap();
|
||||||
|
write_file.write_all(&[3, 1]).unwrap();
|
||||||
|
write_file.flush().unwrap();
|
||||||
|
}
|
||||||
|
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||||
|
let data: &[u8] = &*read_file;
|
||||||
|
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_rewrite_forbidden(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(directory.exists(*TEST_PATH));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
assert!(directory.open_write(*TEST_PATH).is_err());
|
||||||
|
}
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_write_create_the_file(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||||
|
let _w = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(directory.exists(*TEST_PATH));
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_ok());
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_directory_delete(directory: &mut Directory) {
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||||
|
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
write_file.write_all(&[1, 2, 3, 4]).unwrap();
|
||||||
|
write_file.flush().unwrap();
|
||||||
|
{
|
||||||
|
let read_handle = directory.open_read(*TEST_PATH).unwrap();
|
||||||
|
{
|
||||||
|
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||||
|
|
||||||
|
// Mapped files can't be deleted on Windows
|
||||||
|
if !cfg!(windows) {
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg!(windows) {
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_directory(directory: &mut Directory) {
|
||||||
|
test_simple(directory);
|
||||||
|
test_seek(directory);
|
||||||
|
test_rewrite_forbidden(directory);
|
||||||
|
test_write_create_the_file(directory);
|
||||||
|
test_directory_delete(directory);
|
||||||
|
test_lock_non_blocking(directory);
|
||||||
|
test_lock_blocking(directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_lock_non_blocking(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
let lock_a_res = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("a.lock"),
|
||||||
|
is_blocking: false,
|
||||||
|
});
|
||||||
|
assert!(lock_a_res.is_ok());
|
||||||
|
let lock_b_res = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("b.lock"),
|
||||||
|
is_blocking: false,
|
||||||
|
});
|
||||||
|
assert!(lock_b_res.is_ok());
|
||||||
|
let lock_a_res2 = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("a.lock"),
|
||||||
|
is_blocking: false,
|
||||||
|
});
|
||||||
|
assert!(lock_a_res2.is_err());
|
||||||
|
}
|
||||||
|
let lock_a_res = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("a.lock"),
|
||||||
|
is_blocking: false,
|
||||||
|
});
|
||||||
|
assert!(lock_a_res.is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_lock_blocking(directory: &mut Directory) {
|
||||||
|
let lock_a_res = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("a.lock"),
|
||||||
|
is_blocking: true,
|
||||||
|
});
|
||||||
|
assert!(lock_a_res.is_ok());
|
||||||
|
std::thread::spawn(move || {
|
||||||
|
//< lock_a_res is sent to the thread.
|
||||||
|
std::thread::sleep(time::Duration::from_millis(10));
|
||||||
|
// explicitely droping lock_a_res. It would have been sufficient to just force it
|
||||||
|
// to be part of the move, but the intent seems clearer that way.
|
||||||
|
drop(lock_a_res);
|
||||||
|
});
|
||||||
|
{
|
||||||
|
// A non-blocking call should fail, as the thread is running and holding the lock.
|
||||||
|
let lock_a_res = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("a.lock"),
|
||||||
|
is_blocking: false,
|
||||||
|
});
|
||||||
|
assert!(lock_a_res.is_err());
|
||||||
|
}
|
||||||
|
{
|
||||||
|
// the blocking call should wait for at least 10ms.
|
||||||
|
let start = time::Instant::now();
|
||||||
|
let lock_a_res = directory.acquire_lock(&Lock {
|
||||||
|
filepath: PathBuf::from("a.lock"),
|
||||||
|
is_blocking: true,
|
||||||
|
});
|
||||||
|
assert!(lock_a_res.is_ok());
|
||||||
|
assert!(start.elapsed().subsec_millis() >= 10);
|
||||||
|
}
|
||||||
|
}
|
||||||
15
src/error.rs
15
src/error.rs
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
use directory::error::LockError;
|
||||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||||
use fastfield::FastFieldNotAvailableError;
|
use fastfield::FastFieldNotAvailableError;
|
||||||
use indexer::LockType;
|
|
||||||
use query;
|
use query;
|
||||||
use schema;
|
use schema;
|
||||||
use serde_json;
|
use serde_json;
|
||||||
@@ -57,11 +57,8 @@ pub enum TantivyError {
|
|||||||
#[fail(display = "Index already exists")]
|
#[fail(display = "Index already exists")]
|
||||||
IndexAlreadyExists,
|
IndexAlreadyExists,
|
||||||
/// Failed to acquire file lock
|
/// Failed to acquire file lock
|
||||||
#[fail(
|
#[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
|
||||||
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
|
LockFailure(LockError, Option<String>),
|
||||||
_0
|
|
||||||
)]
|
|
||||||
LockFailure(LockType),
|
|
||||||
/// IO Error.
|
/// IO Error.
|
||||||
#[fail(display = "An IO error occurred: '{}'", _0)]
|
#[fail(display = "An IO error occurred: '{}'", _0)]
|
||||||
IOError(#[cause] IOError),
|
IOError(#[cause] IOError),
|
||||||
@@ -100,6 +97,12 @@ impl From<FastFieldNotAvailableError> for TantivyError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<LockError> for TantivyError {
|
||||||
|
fn from(lock_error: LockError) -> TantivyError {
|
||||||
|
TantivyError::LockFailure(lock_error, None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<IOError> for TantivyError {
|
impl From<IOError> for TantivyError {
|
||||||
fn from(io_error: IOError) -> TantivyError {
|
fn from(io_error: IOError) -> TantivyError {
|
||||||
TantivyError::IOError(io_error)
|
TantivyError::IOError(io_error)
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
|||||||
let (start, stop) = self.range(doc);
|
let (start, stop) = self.range(doc);
|
||||||
let len = (stop - start) as usize;
|
let len = (stop - start) as usize;
|
||||||
vals.resize(len, Item::default());
|
vals.resize(len, Item::default());
|
||||||
self.vals_reader.get_range(start as u32, &mut vals[..]);
|
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,29 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
|||||||
/// May panic if `doc` is greater than the segment
|
/// May panic if `doc` is greater than the segment
|
||||||
// `maxdoc`.
|
// `maxdoc`.
|
||||||
pub fn get(&self, doc: DocId) -> Item {
|
pub fn get(&self, doc: DocId) -> Item {
|
||||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
|
self.get_u64(doc as u64)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||||
|
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internally `multivalued` also uses SingleValue Fast fields.
|
||||||
|
/// It works as follows... A first column contains the list of start index
|
||||||
|
/// for each document, a second column contains the actual values.
|
||||||
|
///
|
||||||
|
/// The values associated to a given doc, are then
|
||||||
|
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
|
||||||
|
///
|
||||||
|
/// Which means single value fast field reader can be indexed internally with
|
||||||
|
/// something different from a `DocId`. For this use case, we want to use `u64`
|
||||||
|
/// values.
|
||||||
|
///
|
||||||
|
/// See `get_range` for an actual documentation about this method.
|
||||||
|
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
|
||||||
|
for (i, out) in output.iter_mut().enumerate() {
|
||||||
|
*out = self.get_u64(start + (i as u64));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fills an output buffer with the fast field values
|
/// Fills an output buffer with the fast field values
|
||||||
@@ -75,16 +97,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
|||||||
///
|
///
|
||||||
/// May panic if `start + output.len()` is greater than
|
/// May panic if `start + output.len()` is greater than
|
||||||
/// the segment's `maxdoc`.
|
/// the segment's `maxdoc`.
|
||||||
///
|
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
|
||||||
// TODO change start to `u64`.
|
self.get_range_u64(start as u64, output);
|
||||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
|
||||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
|
||||||
// ok: Item is either `u64` or `i64`
|
|
||||||
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
|
|
||||||
self.bit_unpacker.get_range(start, output_u64);
|
|
||||||
for out in output_u64.iter_mut() {
|
|
||||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the minimum value for this fast field.
|
/// Returns the minimum value for this fast field.
|
||||||
|
|||||||
@@ -1,131 +0,0 @@
|
|||||||
use directory::error::OpenWriteError;
|
|
||||||
use std::io::Write;
|
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
use std::thread;
|
|
||||||
use std::time::Duration;
|
|
||||||
use Directory;
|
|
||||||
use TantivyError;
|
|
||||||
|
|
||||||
/// The kinds of lock file tantivy places in an index directory.
#[derive(Debug, Clone, Copy)]
pub enum LockType {
    /// Guards index writing: only one process should write to a tantivy
    /// index at a time, and this lock file, while present, prevents other
    /// processes from opening an `IndexWriter`.
    ///
    /// If the process is killed and the file is left behind, it is safe to
    /// remove it manually.
    ///
    /// Failing to acquire this lock usually means either a misuse of
    /// tantivy's API (creating more than one `IndexWriter`), or a spurious
    /// lock file remaining after a crash. In the latter case, removing the
    /// file is safe once you have checked that no process using tantivy is
    /// still running.
    IndexWriterLock,
    /// Protects the segment files being opened by `.load_searchers()` from
    /// being garbage collected, which makes it possible for another process
    /// to safely consume an index that is being written.
    /// `RWLock` semantics would have been preferable here, but they are
    /// difficult to achieve on Windows.
    ///
    /// Opening segment readers is a very fast process. When the lock cannot
    /// be acquired on the first attempt, the logic is very simplistic: the
    /// acquisition is retried every `100ms`.
    /// This lock should not see much contention in normal usage.
    MetaLock,
}
|
|
||||||
|
|
||||||
/// How often, and after which pause, a failed lock acquisition is retried.
///
/// The logic is deliberately simple: retry at most `num_retries` times,
/// sleeping `wait_in_ms` milliseconds before each retry. Both values depend
/// on the type of lock.
struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    /// A policy that never allows a retry.
    fn no_retry() -> RetryPolicy {
        RetryPolicy {
            num_retries: 0,
            wait_in_ms: 0,
        }
    }

    /// Consumes one retry, if any is left.
    ///
    /// Returns `false` immediately when no retry remains. Otherwise the
    /// counter is decremented, the current thread sleeps for `wait_in_ms`
    /// milliseconds, and `true` is returned.
    fn wait_and_retry(&mut self) -> bool {
        match self.num_retries.checked_sub(1) {
            None => false,
            Some(remaining) => {
                self.num_retries = remaining;
                thread::sleep(Duration::from_millis(self.wait_in_ms));
                true
            }
        }
    }
}
|
|
||||||
|
|
||||||
impl LockType {
|
|
||||||
fn retry_policy(self) -> RetryPolicy {
|
|
||||||
match self {
|
|
||||||
LockType::IndexWriterLock => RetryPolicy::no_retry(),
|
|
||||||
LockType::MetaLock => RetryPolicy {
|
|
||||||
num_retries: 100,
|
|
||||||
wait_in_ms: 100,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
|
|
||||||
let path = self.filename();
|
|
||||||
let mut write = directory.open_write(path).map_err(|e| match e {
|
|
||||||
OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
|
|
||||||
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
|
|
||||||
})?;
|
|
||||||
write.flush()?;
|
|
||||||
Ok(DirectoryLock {
|
|
||||||
directory: directory.box_clone(),
|
|
||||||
path: path.to_owned(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Acquire a lock in the given directory.
|
|
||||||
pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
|
|
||||||
let mut box_directory = directory.box_clone();
|
|
||||||
let mut retry_policy = self.retry_policy();
|
|
||||||
loop {
|
|
||||||
let lock_result = self.try_acquire_lock(&mut *box_directory);
|
|
||||||
match lock_result {
|
|
||||||
Ok(result) => {
|
|
||||||
return Ok(result);
|
|
||||||
}
|
|
||||||
Err(TantivyError::LockFailure(ref filepath)) => {
|
|
||||||
if !retry_policy.wait_and_retry() {
|
|
||||||
return Err(TantivyError::LockFailure(filepath.to_owned()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_) => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn filename(&self) -> &Path {
|
|
||||||
match *self {
|
|
||||||
LockType::MetaLock => Path::new(".tantivy-meta.lock"),
|
|
||||||
LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The `DirectoryLock` is an object that represents a file lock.
|
|
||||||
/// See [`LockType`](struct.LockType.html)
|
|
||||||
///
|
|
||||||
/// It is transparently associated to a lock file, that gets deleted
|
|
||||||
/// on `Drop.` The lock is release automatically on `Drop`.
|
|
||||||
pub struct DirectoryLock {
|
|
||||||
directory: Box<Directory>,
|
|
||||||
path: PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for DirectoryLock {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
if let Err(e) = self.directory.delete(&*self.path) {
|
|
||||||
error!("Failed to remove the lock file. {:?}", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
use super::operation::AddOperation;
|
use super::operation::{AddOperation, UserOperation};
|
||||||
use super::segment_updater::SegmentUpdater;
|
use super::segment_updater::SegmentUpdater;
|
||||||
use super::PreparedCommit;
|
use super::PreparedCommit;
|
||||||
use bit_set::BitSet;
|
use bit_set::BitSet;
|
||||||
@@ -9,15 +9,15 @@ use core::SegmentId;
|
|||||||
use core::SegmentMeta;
|
use core::SegmentMeta;
|
||||||
use core::SegmentReader;
|
use core::SegmentReader;
|
||||||
use crossbeam::channel;
|
use crossbeam::channel;
|
||||||
|
use directory::DirectoryLock;
|
||||||
use docset::DocSet;
|
use docset::DocSet;
|
||||||
use error::TantivyError;
|
use error::TantivyError;
|
||||||
use fastfield::write_delete_bitset;
|
use fastfield::write_delete_bitset;
|
||||||
use futures::sync::oneshot::Receiver;
|
use futures::{Canceled, Future};
|
||||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||||
use indexer::operation::DeleteOperation;
|
use indexer::operation::DeleteOperation;
|
||||||
use indexer::stamper::Stamper;
|
use indexer::stamper::Stamper;
|
||||||
use indexer::DirectoryLock;
|
|
||||||
use indexer::MergePolicy;
|
use indexer::MergePolicy;
|
||||||
use indexer::SegmentEntry;
|
use indexer::SegmentEntry;
|
||||||
use indexer::SegmentWriter;
|
use indexer::SegmentWriter;
|
||||||
@@ -26,7 +26,8 @@ use schema::Document;
|
|||||||
use schema::IndexRecordOption;
|
use schema::IndexRecordOption;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::mem::swap;
|
use std::ops::Range;
|
||||||
|
use std::sync::Arc;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::thread::JoinHandle;
|
use std::thread::JoinHandle;
|
||||||
use Result;
|
use Result;
|
||||||
@@ -43,8 +44,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
|||||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||||
|
|
||||||
type DocumentSender = channel::Sender<AddOperation>;
|
type DocumentSender = channel::Sender<Vec<AddOperation>>;
|
||||||
type DocumentReceiver = channel::Receiver<AddOperation>;
|
type DocumentReceiver = channel::Receiver<Vec<AddOperation>>;
|
||||||
|
|
||||||
/// Split the thread memory budget into
|
/// Split the thread memory budget into
|
||||||
/// - the heap size
|
/// - the heap size
|
||||||
@@ -52,17 +53,19 @@ type DocumentReceiver = channel::Receiver<AddOperation>;
|
|||||||
///
|
///
|
||||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||||
|
assert!(per_thread_memory_budget > 1_000);
|
||||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||||
(1..)
|
if let Some(limit) = (1..)
|
||||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||||
.last()
|
.last()
|
||||||
.unwrap_or_else(|| {
|
{
|
||||||
panic!(
|
limit.min(19) // we cap it at 2^19 = 512K.
|
||||||
"Per thread memory is too small: {}",
|
} else {
|
||||||
per_thread_memory_budget
|
unreachable!(
|
||||||
)
|
"Per thread memory is too small: {}",
|
||||||
})
|
per_thread_memory_budget
|
||||||
.min(19) // we cap it at 512K
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||||
@@ -256,7 +259,7 @@ pub fn advance_deletes(
|
|||||||
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
segment_entry.set_meta((*segment.meta()).clone());
|
segment_entry.set_meta(segment.meta().clone());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -264,7 +267,7 @@ fn index_documents(
|
|||||||
memory_budget: usize,
|
memory_budget: usize,
|
||||||
segment: &Segment,
|
segment: &Segment,
|
||||||
generation: usize,
|
generation: usize,
|
||||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
|
||||||
segment_updater: &mut SegmentUpdater,
|
segment_updater: &mut SegmentUpdater,
|
||||||
mut delete_cursor: DeleteCursor,
|
mut delete_cursor: DeleteCursor,
|
||||||
) -> Result<bool> {
|
) -> Result<bool> {
|
||||||
@@ -272,11 +275,11 @@ fn index_documents(
|
|||||||
let segment_id = segment.id();
|
let segment_id = segment.id();
|
||||||
let table_size = initial_table_size(memory_budget);
|
let table_size = initial_table_size(memory_budget);
|
||||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||||
for doc in document_iterator {
|
for documents in document_iterator {
|
||||||
segment_writer.add_document(doc, &schema)?;
|
for doc in documents {
|
||||||
|
segment_writer.add_document(doc, &schema)?;
|
||||||
|
}
|
||||||
let mem_usage = segment_writer.mem_usage();
|
let mem_usage = segment_writer.mem_usage();
|
||||||
|
|
||||||
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
||||||
info!(
|
info!(
|
||||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||||
@@ -302,7 +305,7 @@ fn index_documents(
|
|||||||
|
|
||||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||||
|
|
||||||
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
|
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||||
let segment_reader = SegmentReader::open(segment)?;
|
let segment_reader = SegmentReader::open(segment)?;
|
||||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||||
@@ -313,18 +316,17 @@ fn index_documents(
|
|||||||
&doc_to_opstamps,
|
&doc_to_opstamps,
|
||||||
last_docstamp,
|
last_docstamp,
|
||||||
)?;
|
)?;
|
||||||
SegmentEntry::new(segment_meta, delete_cursor, {
|
if may_have_deletes {
|
||||||
if may_have_deletes {
|
Some(deleted_bitset)
|
||||||
Some(deleted_bitset)
|
} else {
|
||||||
} else {
|
None
|
||||||
None
|
}
|
||||||
}
|
|
||||||
})
|
|
||||||
} else {
|
} else {
|
||||||
// if there are no delete operation in the queue, no need
|
// if there are no delete operation in the queue, no need
|
||||||
// to even open the segment.
|
// to even open the segment.
|
||||||
SegmentEntry::new(segment_meta, delete_cursor, None)
|
None
|
||||||
};
|
};
|
||||||
|
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -366,13 +368,16 @@ impl IndexWriter {
|
|||||||
.add_segment(self.generation, segment_entry);
|
.add_segment(self.generation, segment_entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// *Experimental & Advanced API* Creates a new segment.
|
/// Creates a new segment.
|
||||||
/// and marks it as currently in write.
|
|
||||||
///
|
///
|
||||||
/// This method is useful only for users trying to do complex
|
/// This method is useful only for users trying to do complex
|
||||||
/// operations, like converting an index format to another.
|
/// operations, like converting an index format to another.
|
||||||
|
///
|
||||||
|
/// It is safe to start writing file associated to the new `Segment`.
|
||||||
|
/// These will not be garbage collected as long as an instance object of
|
||||||
|
/// `SegmentMeta` object associated to the new `Segment` is "alive".
|
||||||
pub fn new_segment(&self) -> Segment {
|
pub fn new_segment(&self) -> Segment {
|
||||||
self.segment_updater.new_segment()
|
self.index.new_segment()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Spawns a new worker thread for indexing.
|
/// Spawns a new worker thread for indexing.
|
||||||
@@ -387,6 +392,7 @@ impl IndexWriter {
|
|||||||
let mut delete_cursor = self.delete_queue.cursor();
|
let mut delete_cursor = self.delete_queue.cursor();
|
||||||
|
|
||||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||||
|
let index = self.index.clone();
|
||||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||||
.name(format!(
|
.name(format!(
|
||||||
"thrd-tantivy-index{}-gen{}",
|
"thrd-tantivy-index{}-gen{}",
|
||||||
@@ -404,15 +410,19 @@ impl IndexWriter {
|
|||||||
// this is a valid guarantee as the
|
// this is a valid guarantee as the
|
||||||
// peeked document now belongs to
|
// peeked document now belongs to
|
||||||
// our local iterator.
|
// our local iterator.
|
||||||
if let Some(operation) = document_iterator.peek() {
|
if let Some(operations) = document_iterator.peek() {
|
||||||
delete_cursor.skip_to(operation.opstamp);
|
if let Some(first) = operations.first() {
|
||||||
|
delete_cursor.skip_to(first.opstamp);
|
||||||
|
} else {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// No more documents.
|
// No more documents.
|
||||||
// Happens when there is a commit, or if the `IndexWriter`
|
// Happens when there is a commit, or if the `IndexWriter`
|
||||||
// was dropped.
|
// was dropped.
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let segment = segment_updater.new_segment();
|
let segment = index.new_segment();
|
||||||
index_documents(
|
index_documents(
|
||||||
mem_budget,
|
mem_budget,
|
||||||
&segment,
|
&segment,
|
||||||
@@ -429,7 +439,7 @@ impl IndexWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Accessor to the merge policy.
|
/// Accessor to the merge policy.
|
||||||
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
|
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
|
||||||
self.segment_updater.get_merge_policy()
|
self.segment_updater.get_merge_policy()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -454,7 +464,10 @@ impl IndexWriter {
|
|||||||
/// Merges a given list of segments
|
/// Merges a given list of segments
|
||||||
///
|
///
|
||||||
/// `segment_ids` is required to be non-empty.
|
/// `segment_ids` is required to be non-empty.
|
||||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
pub fn merge(
|
||||||
|
&mut self,
|
||||||
|
segment_ids: &[SegmentId],
|
||||||
|
) -> Result<impl Future<Item = SegmentMeta, Error = Canceled>> {
|
||||||
self.segment_updater.start_merge(segment_ids)
|
self.segment_updater.start_merge(segment_ids)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -467,11 +480,10 @@ impl IndexWriter {
|
|||||||
///
|
///
|
||||||
/// Returns the former segment_ready channel.
|
/// Returns the former segment_ready channel.
|
||||||
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
||||||
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
|
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||||
swap(&mut self.document_sender, &mut document_sender);
|
mem::replace(&mut self.document_sender, document_sender);
|
||||||
swap(&mut self.document_receiver, &mut document_receiver);
|
mem::replace(&mut self.document_receiver, document_receiver)
|
||||||
document_receiver
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Rollback to the last commit
|
/// Rollback to the last commit
|
||||||
@@ -558,17 +570,12 @@ impl IndexWriter {
|
|||||||
// and recreate a new one channels.
|
// and recreate a new one channels.
|
||||||
self.recreate_document_channel();
|
self.recreate_document_channel();
|
||||||
|
|
||||||
let mut former_workers_join_handle = Vec::new();
|
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||||
swap(
|
|
||||||
&mut former_workers_join_handle,
|
|
||||||
&mut self.workers_join_handle,
|
|
||||||
);
|
|
||||||
|
|
||||||
for worker_handle in former_workers_join_handle {
|
for worker_handle in former_workers_join_handle {
|
||||||
let indexing_worker_result = worker_handle
|
let indexing_worker_result = worker_handle
|
||||||
.join()
|
.join()
|
||||||
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
|
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
|
||||||
|
|
||||||
indexing_worker_result?;
|
indexing_worker_result?;
|
||||||
// add a new worker for the next generation.
|
// add a new worker for the next generation.
|
||||||
self.add_indexing_worker()?;
|
self.add_indexing_worker()?;
|
||||||
@@ -641,32 +648,176 @@ impl IndexWriter {
|
|||||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
pub fn add_document(&mut self, document: Document) -> u64 {
|
||||||
let opstamp = self.stamper.stamp();
|
let opstamp = self.stamper.stamp();
|
||||||
let add_operation = AddOperation { opstamp, document };
|
let add_operation = AddOperation { opstamp, document };
|
||||||
let send_result = self.document_sender.send(add_operation);
|
let send_result = self.document_sender.send(vec![add_operation]);
|
||||||
if let Err(e) = send_result {
|
if let Err(e) = send_result {
|
||||||
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||||
}
|
}
|
||||||
opstamp
|
opstamp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Gets a range of stamps from the stamper and "pops" the last stamp
|
||||||
|
/// from the range returning a tuple of the last optstamp and the popped
|
||||||
|
/// range.
|
||||||
|
///
|
||||||
|
/// The total number of stamps generated by this method is `count + 1`;
|
||||||
|
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
||||||
|
/// is for the batch itself.
|
||||||
|
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
|
||||||
|
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||||
|
let last_opstamp = end - 1;
|
||||||
|
let stamps = Range {
|
||||||
|
start: start,
|
||||||
|
end: last_opstamp,
|
||||||
|
};
|
||||||
|
(last_opstamp, stamps)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Runs a group of document operations ensuring that the operations are
|
||||||
|
/// assigned contigous u64 opstamps and that add operations of the same
|
||||||
|
/// group are flushed into the same segment.
|
||||||
|
///
|
||||||
|
/// If the indexing pipeline is full, this call may block.
|
||||||
|
///
|
||||||
|
/// Each operation of the given `user_operations` will receive an in-order,
|
||||||
|
/// contiguous u64 opstamp. The entire batch itself is also given an
|
||||||
|
/// opstamp that is 1 greater than the last given operation. This
|
||||||
|
/// `batch_opstamp` is the return value of `run`. An empty group of
|
||||||
|
/// `user_operations`, an empty `Vec<UserOperation>`, still receives
|
||||||
|
/// a valid opstamp even though no changes were _actually_ made to the index.
|
||||||
|
///
|
||||||
|
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||||
|
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||||
|
/// visible to readers only after calling `commit()`.
|
||||||
|
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
|
||||||
|
let count = user_operations.len() as u64;
|
||||||
|
if count == 0 {
|
||||||
|
return self.stamper.stamp();
|
||||||
|
}
|
||||||
|
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
|
||||||
|
|
||||||
|
let mut adds: Vec<AddOperation> = Vec::new();
|
||||||
|
|
||||||
|
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
|
||||||
|
match user_op {
|
||||||
|
UserOperation::Delete(term) => {
|
||||||
|
let delete_operation = DeleteOperation {
|
||||||
|
opstamp: opstamp,
|
||||||
|
term: term,
|
||||||
|
};
|
||||||
|
self.delete_queue.push(delete_operation);
|
||||||
|
}
|
||||||
|
UserOperation::Add(doc) => {
|
||||||
|
let add_operation = AddOperation {
|
||||||
|
opstamp: opstamp,
|
||||||
|
document: doc,
|
||||||
|
};
|
||||||
|
adds.push(add_operation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let send_result = self.document_sender.send(adds);
|
||||||
|
if let Err(e) = send_result {
|
||||||
|
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||||
|
};
|
||||||
|
|
||||||
|
batch_opstamp
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
|
use super::super::operation::UserOperation;
|
||||||
use super::initial_table_size;
|
use super::initial_table_size;
|
||||||
|
use directory::error::LockError;
|
||||||
use error::*;
|
use error::*;
|
||||||
use indexer::NoMergePolicy;
|
use indexer::NoMergePolicy;
|
||||||
use schema::{self, Document};
|
use schema::{self, Document, IndexRecordOption};
|
||||||
|
use query::{TermQuery};
|
||||||
|
use collector::TopDocs;
|
||||||
use Index;
|
use Index;
|
||||||
use Term;
|
use Term;
|
||||||
|
|
||||||
|
#[test]
fn test_operations_group() {
    // A group of 2 operations consumes 3 opstamps: 0 and 1 for the
    // operations, 2 for the batch itself.
    let mut builder = schema::Schema::builder();
    let text_field = builder.add_text_field("text", schema::TEXT);
    let index = Index::create_in_ram(builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    let batch_opstamp = writer.run(vec![
        UserOperation::Add(doc!(text_field=>"a")),
        UserOperation::Add(doc!(text_field=>"b")),
    ]);
    assert_eq!(batch_opstamp, 2u64);
}
|
||||||
|
|
||||||
|
#[test]
fn test_ordered_batched_operations() {
    // The batch contains, in this order:
    // * one delete for `doc!(field=>"a")`
    // * one add for `doc!(field=>"a")`
    // * one add for `doc!(field=>"b")`
    // * one delete for `doc!(field=>"b")`
    // After the commit there is one doc with "a" and no doc with "b".
    let mut builder = schema::Schema::builder();
    let text_field = builder.add_text_field("text", schema::TEXT);
    let index = Index::create_in_ram(builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    let term_for = |text: &str| Term::from_field_text(text_field, text);

    writer.run(vec![
        UserOperation::Delete(term_for("a")),
        UserOperation::Add(doc!(text_field=>"a")),
        UserOperation::Add(doc!(text_field=>"b")),
        UserOperation::Delete(term_for("b")),
    ]);
    writer.commit().expect("failed to commit");
    index.load_searchers().expect("failed to load searchers");

    let a_query = TermQuery::new(term_for("a"), IndexRecordOption::Basic);
    let b_query = TermQuery::new(term_for("b"), IndexRecordOption::Basic);

    let searcher = index.searcher();
    let a_docs = searcher
        .search(&a_query, &TopDocs::with_limit(1))
        .expect("search for a failed");
    let b_docs = searcher
        .search(&b_query, &TopDocs::with_limit(1))
        .expect("search for b failed");

    assert_eq!(a_docs.len(), 1);
    assert_eq!(b_docs.len(), 0);
}
|
||||||
|
|
||||||
|
#[test]
fn test_empty_operations_group() {
    let builder = schema::Schema::builder();
    let index = Index::create_in_ram(builder.build());
    let mut writer = index.writer(3_000_000).unwrap();

    // Two consecutive empty batches: the first is expected to return
    // opstamp 0 and the second opstamp 1.
    for expected_opstamp in 0u64..2u64 {
        let empty_batch = vec![];
        let batch_opstamp = writer.run(empty_batch);
        assert_eq!(batch_opstamp, expected_opstamp);
    }
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_lockfile_stops_duplicates() {
|
fn test_lockfile_stops_duplicates() {
|
||||||
let schema_builder = schema::Schema::builder();
|
let schema_builder = schema::Schema::builder();
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let _index_writer = index.writer(40_000_000).unwrap();
|
let _index_writer = index.writer(3_000_000).unwrap();
|
||||||
match index.writer(40_000_000) {
|
match index.writer(3_000_000) {
|
||||||
Err(TantivyError::LockFailure(_)) => {}
|
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
|
||||||
_ => panic!("Expected FileAlreadyExists error"),
|
_ => panic!("Expected a `LockFailure` error"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -678,8 +829,7 @@ mod tests {
|
|||||||
match index.writer_with_num_threads(1, 3_000_000) {
|
match index.writer_with_num_threads(1, 3_000_000) {
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
let err_msg = err.to_string();
|
let err_msg = err.to_string();
|
||||||
assert!(err_msg.contains("Lockfile"));
|
assert!(err_msg.contains("already an `IndexWriter`"));
|
||||||
assert!(err_msg.contains("Possible causes:"))
|
|
||||||
}
|
}
|
||||||
_ => panic!("Expected LockfileAlreadyExists error"),
|
_ => panic!("Expected LockfileAlreadyExists error"),
|
||||||
}
|
}
|
||||||
@@ -689,7 +839,7 @@ mod tests {
|
|||||||
fn test_set_merge_policy() {
|
fn test_set_merge_policy() {
|
||||||
let schema_builder = schema::Schema::builder();
|
let schema_builder = schema::Schema::builder();
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let index_writer = index.writer(40_000_000).unwrap();
|
let index_writer = index.writer(3_000_000).unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
format!("{:?}", index_writer.get_merge_policy()),
|
format!("{:?}", index_writer.get_merge_policy()),
|
||||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||||
@@ -708,11 +858,11 @@ mod tests {
|
|||||||
let schema_builder = schema::Schema::builder();
|
let schema_builder = schema::Schema::builder();
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
{
|
{
|
||||||
let _index_writer = index.writer(40_000_000).unwrap();
|
let _index_writer = index.writer(3_000_000).unwrap();
|
||||||
// the lock should be released when the
|
// the lock should be released when the
|
||||||
// index_writer leaves the scope.
|
// index_writer leaves the scope.
|
||||||
}
|
}
|
||||||
let _index_writer_two = index.writer(40_000_000).unwrap();
|
let _index_writer_two = index.writer(3_000_000).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -739,7 +889,7 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(text_field=>"b"));
|
index_writer.add_document(doc!(text_field=>"b"));
|
||||||
index_writer.add_document(doc!(text_field=>"c"));
|
index_writer.add_document(doc!(text_field=>"c"));
|
||||||
}
|
}
|
||||||
assert_eq!(index_writer.commit().unwrap(), 2u64);
|
assert!(index_writer.commit().is_ok());
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
assert_eq!(num_docs_containing("a"), 0);
|
assert_eq!(num_docs_containing("a"), 0);
|
||||||
assert_eq!(num_docs_containing("b"), 1);
|
assert_eq!(num_docs_containing("b"), 1);
|
||||||
@@ -802,7 +952,6 @@ mod tests {
|
|||||||
{
|
{
|
||||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||||
prepared_commit.set_payload("first commit");
|
prepared_commit.set_payload("first commit");
|
||||||
assert_eq!(prepared_commit.opstamp(), 100);
|
|
||||||
prepared_commit.commit().expect("commit failed");
|
prepared_commit.commit().expect("commit failed");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -836,7 +985,6 @@ mod tests {
|
|||||||
{
|
{
|
||||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||||
prepared_commit.set_payload("first commit");
|
prepared_commit.set_payload("first commit");
|
||||||
assert_eq!(prepared_commit.opstamp(), 100);
|
|
||||||
prepared_commit.abort().expect("commit failed");
|
prepared_commit.abort().expect("commit failed");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
|
|||||||
64
src/indexer/merge_operation.rs
Normal file
64
src/indexer/merge_operation.rs
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
use census::{Inventory, TrackedObject};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use SegmentId;
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
||||||
|
|
||||||
|
impl MergeOperationInventory {
|
||||||
|
pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
|
||||||
|
let mut segment_in_merge = HashSet::default();
|
||||||
|
for merge_op in self.0.list() {
|
||||||
|
for &segment_id in &merge_op.segment_ids {
|
||||||
|
segment_in_merge.insert(segment_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
segment_in_merge
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A `MergeOperation` has two role.
|
||||||
|
/// It carries all of the information required to describe a merge :
|
||||||
|
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
||||||
|
/// delete queue and reflect their deletes.
|
||||||
|
/// - `segment_ids` is the list of segment to be merged.
|
||||||
|
///
|
||||||
|
/// The second role is to ensure keep track of the fact that these
|
||||||
|
/// segments are in merge and avoid starting a merge operation that
|
||||||
|
/// may conflict with this one.
|
||||||
|
///
|
||||||
|
/// This works by tracking merge operations. When considering computing
|
||||||
|
/// merge candidates, we simply list tracked merge operations and remove
|
||||||
|
/// their segments from possible merge candidates.
|
||||||
|
pub struct MergeOperation {
|
||||||
|
inner: TrackedObject<InnerMergeOperation>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct InnerMergeOperation {
|
||||||
|
target_opstamp: u64,
|
||||||
|
segment_ids: Vec<SegmentId>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MergeOperation {
|
||||||
|
pub fn new(
|
||||||
|
inventory: &MergeOperationInventory,
|
||||||
|
target_opstamp: u64,
|
||||||
|
segment_ids: Vec<SegmentId>,
|
||||||
|
) -> MergeOperation {
|
||||||
|
let inner_merge_operation = InnerMergeOperation {
|
||||||
|
target_opstamp,
|
||||||
|
segment_ids,
|
||||||
|
};
|
||||||
|
MergeOperation {
|
||||||
|
inner: inventory.0.track(inner_merge_operation),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn target_opstamp(&self) -> u64 {
|
||||||
|
self.inner.target_opstamp
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn segment_ids(&self) -> &[SegmentId] {
|
||||||
|
&self.inner.segment_ids[..]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -11,7 +11,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
|
|||||||
///
|
///
|
||||||
/// Every time a the list of segments changes, the segment updater
|
/// Every time a the list of segments changes, the segment updater
|
||||||
/// asks the merge policy if some segments should be merged.
|
/// asks the merge policy if some segments should be merged.
|
||||||
pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
|
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
|
||||||
/// Given the list of segment metas, returns the list of merge candidates.
|
/// Given the list of segment metas, returns the list of merge candidates.
|
||||||
///
|
///
|
||||||
/// This call happens on the segment updater thread, and will block
|
/// This call happens on the segment updater thread, and will block
|
||||||
@@ -19,21 +19,6 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
|
|||||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
|
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// MergePolicyClone
|
|
||||||
pub trait MergePolicyClone {
|
|
||||||
/// Returns a boxed clone of the MergePolicy.
|
|
||||||
fn box_clone(&self) -> Box<MergePolicy>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> MergePolicyClone for T
|
|
||||||
where
|
|
||||||
T: 'static + MergePolicy + Clone,
|
|
||||||
{
|
|
||||||
fn box_clone(&self) -> Box<MergePolicy> {
|
|
||||||
Box::new(self.clone())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Never merge segments.
|
/// Never merge segments.
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct NoMergePolicy;
|
pub struct NoMergePolicy;
|
||||||
|
|||||||
@@ -654,6 +654,7 @@ mod tests {
|
|||||||
use schema::IntOptions;
|
use schema::IntOptions;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
use schema::TextFieldIndexing;
|
use schema::TextFieldIndexing;
|
||||||
|
use schema::INT_INDEXED;
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
use DocAddress;
|
use DocAddress;
|
||||||
use IndexWriter;
|
use IndexWriter;
|
||||||
@@ -835,7 +836,7 @@ mod tests {
|
|||||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||||
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
|
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
|
|
||||||
let search_term = |searcher: &Searcher, term: Term| {
|
let search_term = |searcher: &Searcher, term: Term| {
|
||||||
let collector = FastFieldTestCollector::for_field(score_field);
|
let collector = FastFieldTestCollector::for_field(score_field);
|
||||||
@@ -983,7 +984,7 @@ mod tests {
|
|||||||
.wait()
|
.wait()
|
||||||
.expect("Merging failed");
|
.expect("Merging failed");
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let ref searcher = *index.searcher();
|
let searcher = index.searcher();
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
assert_eq!(searcher.segment_readers().len(), 1);
|
||||||
assert_eq!(searcher.num_docs(), 3);
|
assert_eq!(searcher.num_docs(), 3);
|
||||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
||||||
@@ -1029,7 +1030,7 @@ mod tests {
|
|||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let ref searcher = *index.searcher();
|
let searcher = index.searcher();
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
assert_eq!(searcher.segment_readers().len(), 1);
|
||||||
assert_eq!(searcher.num_docs(), 2);
|
assert_eq!(searcher.num_docs(), 2);
|
||||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||||
@@ -1125,18 +1126,15 @@ mod tests {
|
|||||||
{
|
{
|
||||||
// Test removing all docs
|
// Test removing all docs
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "g"));
|
index_writer.delete_term(Term::from_field_text(text_field, "g"));
|
||||||
|
index_writer.commit().unwrap();
|
||||||
let segment_ids = index
|
let segment_ids = index
|
||||||
.searchable_segment_ids()
|
.searchable_segment_ids()
|
||||||
.expect("Searchable segments failed.");
|
.expect("Searchable segments failed.");
|
||||||
index_writer
|
|
||||||
.merge(&segment_ids)
|
|
||||||
.expect("Failed to initiate merge")
|
|
||||||
.wait()
|
|
||||||
.expect("Merging failed");
|
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
|
|
||||||
let ref searcher = *index.searcher();
|
let ref searcher = *index.searcher();
|
||||||
assert_eq!(searcher.segment_readers().len(), 1);
|
assert!(segment_ids.is_empty());
|
||||||
|
assert!(searcher.segment_readers().is_empty());
|
||||||
assert_eq!(searcher.num_docs(), 0);
|
assert_eq!(searcher.num_docs(), 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1147,7 +1145,7 @@ mod tests {
|
|||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
for facet in doc_facets {
|
for facet in doc_facets {
|
||||||
@@ -1212,7 +1210,7 @@ mod tests {
|
|||||||
let segment_ids = index
|
let segment_ids = index
|
||||||
.searchable_segment_ids()
|
.searchable_segment_ids()
|
||||||
.expect("Searchable segments failed.");
|
.expect("Searchable segments failed.");
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer
|
index_writer
|
||||||
.merge(&segment_ids)
|
.merge(&segment_ids)
|
||||||
.expect("Failed to initiate merge")
|
.expect("Failed to initiate merge")
|
||||||
@@ -1235,7 +1233,7 @@ mod tests {
|
|||||||
|
|
||||||
// Deleting one term
|
// Deleting one term
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||||
let facet_term = Term::from_facet(facet_field, &facet);
|
let facet_term = Term::from_facet(facet_field, &facet);
|
||||||
index_writer.delete_term(facet_term);
|
index_writer.delete_term(facet_term);
|
||||||
@@ -1255,6 +1253,34 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_bug_merge() {
    let mut builder = schema::Schema::builder();
    let int_field = builder.add_u64_field("intvals", INT_INDEXED);
    let index = Index::create_in_ram(builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

    // Two separate commits, each adding one document with the same value.
    for _ in 0..2 {
        writer.add_document(doc!(int_field => 1u64));
        writer.commit().expect("commit failed");
    }
    index.load_searchers().unwrap();
    let searcher = index.searcher();
    assert_eq!(searcher.num_docs(), 2);

    // Queue a delete for that value, then merge *without* committing.
    writer.delete_term(Term::from_field_u64(int_field, 1));
    let searchable_ids = index
        .searchable_segment_ids()
        .expect("Searchable segments failed.");
    let pending_merge = writer
        .merge(&searchable_ids)
        .expect("Failed to initiate merge");
    pending_merge.wait().expect("Merging failed");
    index.load_searchers().unwrap();

    // commit has not been called yet. The documents should still be
    // there.
    assert_eq!(index.searcher().num_docs(), 2);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_multivalued_int_fields_all_deleted() {
|
fn test_merge_multivalued_int_fields_all_deleted() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
@@ -1265,7 +1291,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
doc.add_u64(int_field, 1);
|
doc.add_u64(int_field, 1);
|
||||||
index_writer.add_document(doc.clone());
|
index_writer.add_document(doc.clone());
|
||||||
@@ -1273,24 +1299,26 @@ mod tests {
|
|||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
index_writer.commit().expect("commit failed");
|
index_writer.commit().expect("commit failed");
|
||||||
index_writer.delete_term(Term::from_field_u64(int_field, 1));
|
index_writer.delete_term(Term::from_field_u64(int_field, 1));
|
||||||
index_writer.commit().expect("commit failed");
|
|
||||||
}
|
|
||||||
index.load_searchers().unwrap();
|
|
||||||
let searcher = index.searcher();
|
|
||||||
assert_eq!(searcher.num_docs(), 0);
|
|
||||||
// Merging the segments
|
|
||||||
{
|
|
||||||
let segment_ids = index
|
let segment_ids = index
|
||||||
.searchable_segment_ids()
|
.searchable_segment_ids()
|
||||||
.expect("Searchable segments failed.");
|
.expect("Searchable segments failed.");
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
|
||||||
index_writer
|
index_writer
|
||||||
.merge(&segment_ids)
|
.merge(&segment_ids)
|
||||||
.expect("Failed to initiate merge")
|
.expect("Failed to initiate merge")
|
||||||
.wait()
|
.wait()
|
||||||
.expect("Merging failed");
|
.expect("Merging failed");
|
||||||
|
|
||||||
|
// assert delete has not been committed
|
||||||
|
index.load_searchers().unwrap();
|
||||||
|
let searcher = index.searcher();
|
||||||
|
assert_eq!(searcher.num_docs(), 2);
|
||||||
|
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
index_writer.wait_merging_threads().unwrap();
|
index_writer.wait_merging_threads().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
assert_eq!(searcher.num_docs(), 0);
|
assert_eq!(searcher.num_docs(), 0);
|
||||||
@@ -1306,7 +1334,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
|
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
for &val in int_vals {
|
for &val in int_vals {
|
||||||
@@ -1395,7 +1423,7 @@ mod tests {
|
|||||||
let segment_ids = index
|
let segment_ids = index
|
||||||
.searchable_segment_ids()
|
.searchable_segment_ids()
|
||||||
.expect("Searchable segments failed.");
|
.expect("Searchable segments failed.");
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer
|
index_writer
|
||||||
.merge(&segment_ids)
|
.merge(&segment_ids)
|
||||||
.expect("Failed to initiate merge")
|
.expect("Failed to initiate merge")
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
pub mod delete_queue;
|
pub mod delete_queue;
|
||||||
mod directory_lock;
|
|
||||||
mod doc_opstamp_mapping;
|
mod doc_opstamp_mapping;
|
||||||
pub mod index_writer;
|
pub mod index_writer;
|
||||||
mod log_merge_policy;
|
mod log_merge_policy;
|
||||||
|
mod merge_operation;
|
||||||
pub mod merge_policy;
|
pub mod merge_policy;
|
||||||
pub mod merger;
|
pub mod merger;
|
||||||
pub mod operation;
|
pub mod operation;
|
||||||
@@ -15,14 +16,12 @@ pub mod segment_updater;
|
|||||||
mod segment_writer;
|
mod segment_writer;
|
||||||
mod stamper;
|
mod stamper;
|
||||||
|
|
||||||
pub(crate) use self::directory_lock::DirectoryLock;
|
|
||||||
pub use self::directory_lock::LockType;
|
|
||||||
|
|
||||||
pub use self::index_writer::IndexWriter;
|
pub use self::index_writer::IndexWriter;
|
||||||
pub use self::log_merge_policy::LogMergePolicy;
|
pub use self::log_merge_policy::LogMergePolicy;
|
||||||
|
pub use self::merge_operation::{MergeOperation, MergeOperationInventory};
|
||||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||||
pub use self::prepared_commit::PreparedCommit;
|
pub use self::prepared_commit::PreparedCommit;
|
||||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
pub use self::segment_entry::SegmentEntry;
|
||||||
pub use self::segment_manager::SegmentManager;
|
pub use self::segment_manager::SegmentManager;
|
||||||
pub use self::segment_serializer::SegmentSerializer;
|
pub use self::segment_serializer::SegmentSerializer;
|
||||||
pub use self::segment_writer::SegmentWriter;
|
pub use self::segment_writer::SegmentWriter;
|
||||||
|
|||||||
@@ -14,3 +14,10 @@ pub struct AddOperation {
|
|||||||
pub opstamp: u64,
|
pub opstamp: u64,
|
||||||
pub document: Document,
|
pub document: Document,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// UserOperation is an enum type that encapsulates other operation types.
|
||||||
|
#[derive(Eq, PartialEq, Debug)]
|
||||||
|
pub enum UserOperation {
|
||||||
|
Add(Document),
|
||||||
|
Delete(Term),
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,21 +4,6 @@ use core::SegmentMeta;
|
|||||||
use indexer::delete_queue::DeleteCursor;
|
use indexer::delete_queue::DeleteCursor;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
|
||||||
pub enum SegmentState {
|
|
||||||
Ready,
|
|
||||||
InMerge,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentState {
|
|
||||||
pub fn letter_code(self) -> char {
|
|
||||||
match self {
|
|
||||||
SegmentState::InMerge => 'M',
|
|
||||||
SegmentState::Ready => 'R',
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A segment entry describes the state of
|
/// A segment entry describes the state of
|
||||||
/// a given segment, at a given instant.
|
/// a given segment, at a given instant.
|
||||||
///
|
///
|
||||||
@@ -35,7 +20,6 @@ impl SegmentState {
|
|||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct SegmentEntry {
|
pub struct SegmentEntry {
|
||||||
meta: SegmentMeta,
|
meta: SegmentMeta,
|
||||||
state: SegmentState,
|
|
||||||
delete_bitset: Option<BitSet>,
|
delete_bitset: Option<BitSet>,
|
||||||
delete_cursor: DeleteCursor,
|
delete_cursor: DeleteCursor,
|
||||||
}
|
}
|
||||||
@@ -49,7 +33,6 @@ impl SegmentEntry {
|
|||||||
) -> SegmentEntry {
|
) -> SegmentEntry {
|
||||||
SegmentEntry {
|
SegmentEntry {
|
||||||
meta: segment_meta,
|
meta: segment_meta,
|
||||||
state: SegmentState::Ready,
|
|
||||||
delete_bitset,
|
delete_bitset,
|
||||||
delete_cursor,
|
delete_cursor,
|
||||||
}
|
}
|
||||||
@@ -72,14 +55,6 @@ impl SegmentEntry {
|
|||||||
&mut self.delete_cursor
|
&mut self.delete_cursor
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the `SegmentEntry`.
|
|
||||||
///
|
|
||||||
/// The state describes whether the segment is available for
|
|
||||||
/// a merge or not.
|
|
||||||
pub fn state(&self) -> SegmentState {
|
|
||||||
self.state
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the segment id.
|
/// Returns the segment id.
|
||||||
pub fn segment_id(&self) -> SegmentId {
|
pub fn segment_id(&self) -> SegmentId {
|
||||||
self.meta.id()
|
self.meta.id()
|
||||||
@@ -89,33 +64,10 @@ impl SegmentEntry {
|
|||||||
pub fn meta(&self) -> &SegmentMeta {
|
pub fn meta(&self) -> &SegmentMeta {
|
||||||
&self.meta
|
&self.meta
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Mark the `SegmentEntry` as in merge.
|
|
||||||
///
|
|
||||||
/// Only segments that are not already
|
|
||||||
/// in a merge are elligible for future merge.
|
|
||||||
pub fn start_merge(&mut self) {
|
|
||||||
self.state = SegmentState::InMerge;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Cancel a merge
|
|
||||||
///
|
|
||||||
/// If a merge fails, it is important to switch
|
|
||||||
/// the segment back to a idle state, so that it
|
|
||||||
/// may be elligible for future merges.
|
|
||||||
pub fn cancel_merge(&mut self) {
|
|
||||||
self.state = SegmentState::Ready;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns true iff a segment should
|
|
||||||
/// be considered for a merge.
|
|
||||||
pub fn is_ready(&self) -> bool {
|
|
||||||
self.state == SegmentState::Ready
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for SegmentEntry {
|
impl fmt::Debug for SegmentEntry {
|
||||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||||
write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state)
|
write!(formatter, "SegmentEntry({:?})", self.meta)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ use Result as TantivyResult;
|
|||||||
struct SegmentRegisters {
|
struct SegmentRegisters {
|
||||||
uncommitted: SegmentRegister,
|
uncommitted: SegmentRegister,
|
||||||
committed: SegmentRegister,
|
committed: SegmentRegister,
|
||||||
writing: HashSet<SegmentId>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The segment manager stores the list of segments
|
/// The segment manager stores the list of segments
|
||||||
@@ -41,12 +40,17 @@ impl Debug for SegmentManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_mergeable_segments(
|
pub fn get_mergeable_segments(
|
||||||
|
in_merge_segment_ids: &HashSet<SegmentId>,
|
||||||
segment_manager: &SegmentManager,
|
segment_manager: &SegmentManager,
|
||||||
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||||
let registers_lock = segment_manager.read();
|
let registers_lock = segment_manager.read();
|
||||||
(
|
(
|
||||||
registers_lock.committed.get_mergeable_segments(),
|
registers_lock
|
||||||
registers_lock.uncommitted.get_mergeable_segments(),
|
.committed
|
||||||
|
.get_mergeable_segments(in_merge_segment_ids),
|
||||||
|
registers_lock
|
||||||
|
.uncommitted
|
||||||
|
.get_mergeable_segments(in_merge_segment_ids),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -59,7 +63,6 @@ impl SegmentManager {
|
|||||||
registers: RwLock::new(SegmentRegisters {
|
registers: RwLock::new(SegmentRegisters {
|
||||||
uncommitted: SegmentRegister::default(),
|
uncommitted: SegmentRegister::default(),
|
||||||
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
||||||
writing: HashSet::new(),
|
|
||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -72,12 +75,6 @@ impl SegmentManager {
|
|||||||
segment_entries
|
segment_entries
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the overall number of segments in the `SegmentManager`
|
|
||||||
pub fn num_segments(&self) -> usize {
|
|
||||||
let registers_lock = self.read();
|
|
||||||
registers_lock.committed.len() + registers_lock.uncommitted.len()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// List the files that are useful to the index.
|
/// List the files that are useful to the index.
|
||||||
///
|
///
|
||||||
/// This does not include lock files, or files that are obsolete
|
/// This does not include lock files, or files that are obsolete
|
||||||
@@ -106,6 +103,21 @@ impl SegmentManager {
|
|||||||
.expect("Failed to acquire write lock on SegmentManager.")
|
.expect("Failed to acquire write lock on SegmentManager.")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Deletes all empty segments
|
||||||
|
fn remove_empty_segments(&self) {
|
||||||
|
let mut registers_lock = self.write();
|
||||||
|
registers_lock
|
||||||
|
.committed
|
||||||
|
.segment_entries()
|
||||||
|
.iter()
|
||||||
|
.filter(|segment| segment.meta().num_docs() == 0)
|
||||||
|
.for_each(|segment| {
|
||||||
|
registers_lock
|
||||||
|
.committed
|
||||||
|
.remove_segment(&segment.segment_id())
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||||
let mut registers_lock = self.write();
|
let mut registers_lock = self.write();
|
||||||
registers_lock.committed.clear();
|
registers_lock.committed.clear();
|
||||||
@@ -121,25 +133,22 @@ impl SegmentManager {
|
|||||||
/// the `segment_ids` are not either all committed or all
|
/// the `segment_ids` are not either all committed or all
|
||||||
/// uncommitted.
|
/// uncommitted.
|
||||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
||||||
let mut registers_lock = self.write();
|
let registers_lock = self.read();
|
||||||
let mut segment_entries = vec![];
|
let mut segment_entries = vec![];
|
||||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||||
for segment_id in segment_ids {
|
for segment_id in segment_ids {
|
||||||
let segment_entry = registers_lock.uncommitted
|
let segment_entry = registers_lock.uncommitted
|
||||||
.start_merge(segment_id)
|
.get(segment_id)
|
||||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||||
segment_entries.push(segment_entry);
|
segment_entries.push(segment_entry);
|
||||||
}
|
}
|
||||||
} else if registers_lock.committed.contains_all(segment_ids) {
|
} else if registers_lock.committed.contains_all(segment_ids) {
|
||||||
for segment_id in segment_ids {
|
for segment_id in segment_ids {
|
||||||
let segment_entry = registers_lock.committed
|
let segment_entry = registers_lock.committed
|
||||||
.start_merge(segment_id)
|
.get(segment_id)
|
||||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||||
segment_entries.push(segment_entry);
|
segment_entries.push(segment_entry);
|
||||||
}
|
}
|
||||||
for segment_id in segment_ids {
|
|
||||||
registers_lock.committed.start_merge(segment_id);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
let error_msg = "Merge operation sent for segments that are not \
|
let error_msg = "Merge operation sent for segments that are not \
|
||||||
all uncommited or commited."
|
all uncommited or commited."
|
||||||
@@ -149,50 +158,8 @@ impl SegmentManager {
|
|||||||
Ok(segment_entries)
|
Ok(segment_entries)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn cancel_merge(
|
|
||||||
&self,
|
|
||||||
before_merge_segment_ids: &[SegmentId],
|
|
||||||
after_merge_segment_id: SegmentId,
|
|
||||||
) {
|
|
||||||
let mut registers_lock = self.write();
|
|
||||||
|
|
||||||
// we mark all segments are ready for merge.
|
|
||||||
{
|
|
||||||
let target_segment_register: &mut SegmentRegister;
|
|
||||||
target_segment_register = {
|
|
||||||
if registers_lock
|
|
||||||
.uncommitted
|
|
||||||
.contains_all(before_merge_segment_ids)
|
|
||||||
{
|
|
||||||
&mut registers_lock.uncommitted
|
|
||||||
} else if registers_lock
|
|
||||||
.committed
|
|
||||||
.contains_all(before_merge_segment_ids)
|
|
||||||
{
|
|
||||||
&mut registers_lock.committed
|
|
||||||
} else {
|
|
||||||
warn!("couldn't find segment in SegmentManager");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
for segment_id in before_merge_segment_ids {
|
|
||||||
target_segment_register.cancel_merge(segment_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ... and we make sure the target segment entry
|
|
||||||
// can be garbage collected.
|
|
||||||
registers_lock.writing.remove(&after_merge_segment_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn write_segment(&self, segment_id: SegmentId) {
|
|
||||||
let mut registers_lock = self.write();
|
|
||||||
registers_lock.writing.insert(segment_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_segment(&self, segment_entry: SegmentEntry) {
|
pub fn add_segment(&self, segment_entry: SegmentEntry) {
|
||||||
let mut registers_lock = self.write();
|
let mut registers_lock = self.write();
|
||||||
registers_lock.writing.remove(&segment_entry.segment_id());
|
|
||||||
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -202,10 +169,6 @@ impl SegmentManager {
|
|||||||
after_merge_segment_entry: SegmentEntry,
|
after_merge_segment_entry: SegmentEntry,
|
||||||
) {
|
) {
|
||||||
let mut registers_lock = self.write();
|
let mut registers_lock = self.write();
|
||||||
registers_lock
|
|
||||||
.writing
|
|
||||||
.remove(&after_merge_segment_entry.segment_id());
|
|
||||||
|
|
||||||
let target_register: &mut SegmentRegister = {
|
let target_register: &mut SegmentRegister = {
|
||||||
if registers_lock
|
if registers_lock
|
||||||
.uncommitted
|
.uncommitted
|
||||||
@@ -229,6 +192,7 @@ impl SegmentManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
|
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||||
|
self.remove_empty_segments();
|
||||||
let registers_lock = self.read();
|
let registers_lock = self.read();
|
||||||
registers_lock.committed.segment_metas()
|
registers_lock.committed.segment_metas()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use core::SegmentMeta;
|
|||||||
use indexer::delete_queue::DeleteCursor;
|
use indexer::delete_queue::DeleteCursor;
|
||||||
use indexer::segment_entry::SegmentEntry;
|
use indexer::segment_entry::SegmentEntry;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::collections::HashSet;
|
||||||
use std::fmt::{self, Debug, Formatter};
|
use std::fmt::{self, Debug, Formatter};
|
||||||
|
|
||||||
/// The segment register keeps track
|
/// The segment register keeps track
|
||||||
@@ -21,8 +22,8 @@ pub struct SegmentRegister {
|
|||||||
impl Debug for SegmentRegister {
|
impl Debug for SegmentRegister {
|
||||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||||
write!(f, "SegmentRegister(")?;
|
write!(f, "SegmentRegister(")?;
|
||||||
for (k, v) in &self.segment_states {
|
for k in self.segment_states.keys() {
|
||||||
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?;
|
write!(f, "{}, ", k.short_uuid_string())?;
|
||||||
}
|
}
|
||||||
write!(f, ")")?;
|
write!(f, ")")?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -34,14 +35,13 @@ impl SegmentRegister {
|
|||||||
self.segment_states.clear();
|
self.segment_states.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn get_mergeable_segments(
|
||||||
self.segment_states.len()
|
&self,
|
||||||
}
|
in_merge_segment_ids: &HashSet<SegmentId>,
|
||||||
|
) -> Vec<SegmentMeta> {
|
||||||
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
|
|
||||||
self.segment_states
|
self.segment_states
|
||||||
.values()
|
.values()
|
||||||
.filter(|segment_entry| segment_entry.is_ready())
|
.filter(|segment_entry| !in_merge_segment_ids.contains(&segment_entry.segment_id()))
|
||||||
.map(|segment_entry| segment_entry.meta().clone())
|
.map(|segment_entry| segment_entry.meta().clone())
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
@@ -60,7 +60,7 @@ impl SegmentRegister {
|
|||||||
segment_ids
|
segment_ids
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
|
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
|
||||||
segment_ids
|
segment_ids
|
||||||
.iter()
|
.iter()
|
||||||
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
||||||
@@ -75,20 +75,8 @@ impl SegmentRegister {
|
|||||||
self.segment_states.remove(segment_id);
|
self.segment_states.remove(segment_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
|
pub fn get(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||||
self.segment_states
|
self.segment_states.get(segment_id).cloned()
|
||||||
.get_mut(segment_id)
|
|
||||||
.expect("Received a merge notification for a segment that is not registered")
|
|
||||||
.cancel_merge();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
|
||||||
if let Some(segment_entry) = self.segment_states.get_mut(segment_id) {
|
|
||||||
segment_entry.start_merge();
|
|
||||||
Some(segment_entry.clone())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
|
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
|
||||||
@@ -100,11 +88,6 @@ impl SegmentRegister {
|
|||||||
}
|
}
|
||||||
SegmentRegister { segment_states }
|
SegmentRegister { segment_states }
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
|
||||||
self.segment_states.get(segment_id).cloned()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -113,7 +96,6 @@ mod tests {
|
|||||||
use core::SegmentId;
|
use core::SegmentId;
|
||||||
use core::SegmentMeta;
|
use core::SegmentMeta;
|
||||||
use indexer::delete_queue::*;
|
use indexer::delete_queue::*;
|
||||||
use indexer::SegmentState;
|
|
||||||
|
|
||||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||||
segment_register
|
segment_register
|
||||||
@@ -137,42 +119,12 @@ mod tests {
|
|||||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||||
segment_register.add_segment_entry(segment_entry);
|
segment_register.add_segment_entry(segment_entry);
|
||||||
}
|
}
|
||||||
assert_eq!(
|
|
||||||
segment_register
|
|
||||||
.segment_entry(&segment_id_a)
|
|
||||||
.unwrap()
|
|
||||||
.state(),
|
|
||||||
SegmentState::Ready
|
|
||||||
);
|
|
||||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||||
{
|
{
|
||||||
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
|
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
|
||||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||||
segment_register.add_segment_entry(segment_entry);
|
segment_register.add_segment_entry(segment_entry);
|
||||||
}
|
}
|
||||||
assert_eq!(
|
|
||||||
segment_register
|
|
||||||
.segment_entry(&segment_id_b)
|
|
||||||
.unwrap()
|
|
||||||
.state(),
|
|
||||||
SegmentState::Ready
|
|
||||||
);
|
|
||||||
segment_register.start_merge(&segment_id_a);
|
|
||||||
segment_register.start_merge(&segment_id_b);
|
|
||||||
assert_eq!(
|
|
||||||
segment_register
|
|
||||||
.segment_entry(&segment_id_a)
|
|
||||||
.unwrap()
|
|
||||||
.state(),
|
|
||||||
SegmentState::InMerge
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
segment_register
|
|
||||||
.segment_entry(&segment_id_b)
|
|
||||||
.unwrap()
|
|
||||||
.state(),
|
|
||||||
SegmentState::InMerge
|
|
||||||
);
|
|
||||||
segment_register.remove_segment(&segment_id_a);
|
segment_register.remove_segment(&segment_id_a);
|
||||||
segment_register.remove_segment(&segment_id_b);
|
segment_register.remove_segment(&segment_id_b);
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -16,9 +16,10 @@ use futures_cpupool::CpuFuture;
|
|||||||
use futures_cpupool::CpuPool;
|
use futures_cpupool::CpuPool;
|
||||||
use indexer::delete_queue::DeleteCursor;
|
use indexer::delete_queue::DeleteCursor;
|
||||||
use indexer::index_writer::advance_deletes;
|
use indexer::index_writer::advance_deletes;
|
||||||
|
use indexer::merge_operation::MergeOperationInventory;
|
||||||
use indexer::merger::IndexMerger;
|
use indexer::merger::IndexMerger;
|
||||||
use indexer::stamper::Stamper;
|
use indexer::stamper::Stamper;
|
||||||
use indexer::MergeCandidate;
|
use indexer::MergeOperation;
|
||||||
use indexer::SegmentEntry;
|
use indexer::SegmentEntry;
|
||||||
use indexer::SegmentSerializer;
|
use indexer::SegmentSerializer;
|
||||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||||
@@ -26,6 +27,7 @@ use schema::Schema;
|
|||||||
use serde_json;
|
use serde_json;
|
||||||
use std::borrow::BorrowMut;
|
use std::borrow::BorrowMut;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::collections::HashSet;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
@@ -45,8 +47,16 @@ use Result;
|
|||||||
/// and flushed.
|
/// and flushed.
|
||||||
///
|
///
|
||||||
/// This method is not part of tantivy's public API
|
/// This method is not part of tantivy's public API
|
||||||
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
|
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
|
||||||
save_metas(vec![], schema, opstamp, None, directory)
|
save_metas(
|
||||||
|
&IndexMeta {
|
||||||
|
segments: Vec::new(),
|
||||||
|
schema,
|
||||||
|
opstamp: 0u64,
|
||||||
|
payload: None,
|
||||||
|
},
|
||||||
|
directory,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Save the index meta file.
|
/// Save the index meta file.
|
||||||
@@ -58,20 +68,9 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
|
|||||||
/// and flushed.
|
/// and flushed.
|
||||||
///
|
///
|
||||||
/// This method is not part of tantivy's public API
|
/// This method is not part of tantivy's public API
|
||||||
pub fn save_metas(
|
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
|
||||||
segment_metas: Vec<SegmentMeta>,
|
let mut buffer = serde_json::to_vec_pretty(metas)?;
|
||||||
schema: Schema,
|
// Just adding a new line at the end of the buffer.
|
||||||
opstamp: u64,
|
|
||||||
payload: Option<String>,
|
|
||||||
directory: &mut Directory,
|
|
||||||
) -> Result<()> {
|
|
||||||
let metas = IndexMeta {
|
|
||||||
segments: segment_metas,
|
|
||||||
schema,
|
|
||||||
opstamp,
|
|
||||||
payload,
|
|
||||||
};
|
|
||||||
let mut buffer = serde_json::to_vec_pretty(&metas)?;
|
|
||||||
writeln!(&mut buffer)?;
|
writeln!(&mut buffer)?;
|
||||||
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
|
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
|
||||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||||
@@ -83,16 +82,21 @@ pub fn save_metas(
|
|||||||
//
|
//
|
||||||
// All this processing happens on a single thread
|
// All this processing happens on a single thread
|
||||||
// consuming a common queue.
|
// consuming a common queue.
|
||||||
|
//
|
||||||
|
// We voluntarily pass a merge_operation ref to guarantee that
|
||||||
|
// the merge_operation is alive during the process
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||||
|
|
||||||
fn perform_merge(
|
fn perform_merge(
|
||||||
|
merge_operation: &MergeOperation,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
mut segment_entries: Vec<SegmentEntry>,
|
mut segment_entries: Vec<SegmentEntry>,
|
||||||
mut merged_segment: Segment,
|
|
||||||
target_opstamp: u64,
|
|
||||||
) -> Result<SegmentEntry> {
|
) -> Result<SegmentEntry> {
|
||||||
|
let target_opstamp = merge_operation.target_opstamp();
|
||||||
|
|
||||||
// first we need to apply deletes to our segment.
|
// first we need to apply deletes to our segment.
|
||||||
|
let mut merged_segment = index.new_segment();
|
||||||
|
|
||||||
// TODO add logging
|
// TODO add logging
|
||||||
let schema = index.schema();
|
let schema = index.schema();
|
||||||
@@ -126,15 +130,23 @@ fn perform_merge(
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct InnerSegmentUpdater {
|
struct InnerSegmentUpdater {
|
||||||
|
// we keep a copy of the current active IndexMeta to
|
||||||
|
// avoid loading the file everytime we need it in the
|
||||||
|
// `SegmentUpdater`.
|
||||||
|
//
|
||||||
|
// This should be up to date as all update happen through
|
||||||
|
// the unique active `SegmentUpdater`.
|
||||||
|
active_metas: RwLock<Arc<IndexMeta>>,
|
||||||
pool: CpuPool,
|
pool: CpuPool,
|
||||||
index: Index,
|
index: Index,
|
||||||
segment_manager: SegmentManager,
|
segment_manager: SegmentManager,
|
||||||
merge_policy: RwLock<Box<MergePolicy>>,
|
merge_policy: RwLock<Arc<Box<MergePolicy>>>,
|
||||||
merging_thread_id: AtomicUsize,
|
merging_thread_id: AtomicUsize,
|
||||||
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
|
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
|
||||||
generation: AtomicUsize,
|
generation: AtomicUsize,
|
||||||
killed: AtomicBool,
|
killed: AtomicBool,
|
||||||
stamper: Stamper,
|
stamper: Stamper,
|
||||||
|
merge_operations: MergeOperationInventory,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentUpdater {
|
impl SegmentUpdater {
|
||||||
@@ -149,32 +161,29 @@ impl SegmentUpdater {
|
|||||||
.name_prefix("segment_updater")
|
.name_prefix("segment_updater")
|
||||||
.pool_size(1)
|
.pool_size(1)
|
||||||
.create();
|
.create();
|
||||||
|
let index_meta = index.load_metas()?;
|
||||||
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||||
|
active_metas: RwLock::new(Arc::new(index_meta)),
|
||||||
pool,
|
pool,
|
||||||
index,
|
index,
|
||||||
segment_manager,
|
segment_manager,
|
||||||
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
|
merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
|
||||||
merging_thread_id: AtomicUsize::default(),
|
merging_thread_id: AtomicUsize::default(),
|
||||||
merging_threads: RwLock::new(HashMap::new()),
|
merging_threads: RwLock::new(HashMap::new()),
|
||||||
generation: AtomicUsize::default(),
|
generation: AtomicUsize::default(),
|
||||||
killed: AtomicBool::new(false),
|
killed: AtomicBool::new(false),
|
||||||
stamper,
|
stamper,
|
||||||
|
merge_operations: Default::default(),
|
||||||
})))
|
})))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new_segment(&self) -> Segment {
|
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
|
||||||
let new_segment = self.0.index.new_segment();
|
self.0.merge_policy.read().unwrap().clone()
|
||||||
let segment_id = new_segment.id();
|
|
||||||
self.0.segment_manager.write_segment(segment_id);
|
|
||||||
new_segment
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
|
|
||||||
self.0.merge_policy.read().unwrap().box_clone()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
||||||
*self.0.merge_policy.write().unwrap() = merge_policy;
|
let arc_merge_policy = Arc::new(merge_policy);
|
||||||
|
*self.0.merge_policy.write().unwrap() = arc_merge_policy;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_merging_thread_id(&self) -> usize {
|
fn get_merging_thread_id(&self) -> usize {
|
||||||
@@ -244,14 +253,15 @@ impl SegmentUpdater {
|
|||||||
//
|
//
|
||||||
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
|
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
|
||||||
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
|
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
|
||||||
save_metas(
|
let index_meta = IndexMeta {
|
||||||
commited_segment_metas,
|
segments: commited_segment_metas,
|
||||||
index.schema(),
|
schema: index.schema(),
|
||||||
opstamp,
|
opstamp,
|
||||||
commit_message,
|
payload: commit_message,
|
||||||
directory.box_clone().borrow_mut(),
|
};
|
||||||
)
|
save_metas(&index_meta, directory.box_clone().borrow_mut())
|
||||||
.expect("Could not save metas.");
|
.expect("Could not save metas.");
|
||||||
|
self.store_meta(&index_meta);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -286,51 +296,62 @@ impl SegmentUpdater {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
||||||
//let future_merged_segment = */
|
let commit_opstamp = self.load_metas().opstamp;
|
||||||
let segment_ids_vec = segment_ids.to_vec();
|
let merge_operation = MergeOperation::new(
|
||||||
self.run_async(move |segment_updater| {
|
&self.0.merge_operations,
|
||||||
segment_updater.start_merge_impl(&segment_ids_vec[..])
|
commit_opstamp,
|
||||||
})
|
segment_ids.to_vec(),
|
||||||
.wait()?
|
);
|
||||||
|
self.run_async(move |segment_updater| segment_updater.start_merge_impl(merge_operation))
|
||||||
|
.wait()?
|
||||||
|
}
|
||||||
|
|
||||||
|
fn store_meta(&self, index_meta: &IndexMeta) {
|
||||||
|
*self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
|
||||||
|
}
|
||||||
|
fn load_metas(&self) -> Arc<IndexMeta> {
|
||||||
|
self.0.active_metas.read().unwrap().clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
// `segment_ids` is required to be non-empty.
|
// `segment_ids` is required to be non-empty.
|
||||||
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
|
fn start_merge_impl(&self, merge_operation: MergeOperation) -> Result<Receiver<SegmentMeta>> {
|
||||||
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
|
assert!(
|
||||||
|
!merge_operation.segment_ids().is_empty(),
|
||||||
|
"Segment_ids cannot be empty."
|
||||||
|
);
|
||||||
|
|
||||||
let segment_updater_clone = self.clone();
|
let segment_updater_clone = self.clone();
|
||||||
let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?;
|
let segment_entries: Vec<SegmentEntry> = self
|
||||||
|
.0
|
||||||
|
.segment_manager
|
||||||
|
.start_merge(merge_operation.segment_ids())?;
|
||||||
|
|
||||||
let segment_ids_vec = segment_ids.to_vec();
|
// let segment_ids_vec = merge_operation.segment_ids.to_vec();
|
||||||
|
|
||||||
let merging_thread_id = self.get_merging_thread_id();
|
let merging_thread_id = self.get_merging_thread_id();
|
||||||
info!(
|
info!(
|
||||||
"Starting merge thread #{} - {:?}",
|
"Starting merge thread #{} - {:?}",
|
||||||
merging_thread_id, segment_ids
|
merging_thread_id,
|
||||||
|
merge_operation.segment_ids()
|
||||||
);
|
);
|
||||||
let (merging_future_send, merging_future_recv) = oneshot();
|
let (merging_future_send, merging_future_recv) = oneshot();
|
||||||
|
|
||||||
let target_opstamp = self.0.stamper.stamp();
|
|
||||||
|
|
||||||
// first we need to apply deletes to our segment.
|
// first we need to apply deletes to our segment.
|
||||||
let merging_join_handle = thread::Builder::new()
|
let merging_join_handle = thread::Builder::new()
|
||||||
.name(format!("mergingthread-{}", merging_thread_id))
|
.name(format!("mergingthread-{}", merging_thread_id))
|
||||||
.spawn(move || {
|
.spawn(move || {
|
||||||
// first we need to apply deletes to our segment.
|
// first we need to apply deletes to our segment.
|
||||||
let merged_segment = segment_updater_clone.new_segment();
|
|
||||||
let merged_segment_id = merged_segment.id();
|
|
||||||
let merge_result = perform_merge(
|
let merge_result = perform_merge(
|
||||||
|
&merge_operation,
|
||||||
&segment_updater_clone.0.index,
|
&segment_updater_clone.0.index,
|
||||||
segment_entries,
|
segment_entries,
|
||||||
merged_segment,
|
|
||||||
target_opstamp,
|
|
||||||
);
|
);
|
||||||
|
|
||||||
match merge_result {
|
match merge_result {
|
||||||
Ok(after_merge_segment_entry) => {
|
Ok(after_merge_segment_entry) => {
|
||||||
let merged_segment_meta = after_merge_segment_entry.meta().clone();
|
let merged_segment_meta = after_merge_segment_entry.meta().clone();
|
||||||
segment_updater_clone
|
segment_updater_clone
|
||||||
.end_merge(segment_ids_vec, after_merge_segment_entry)
|
.end_merge(merge_operation, after_merge_segment_entry)
|
||||||
.expect("Segment updater thread is corrupted.");
|
.expect("Segment updater thread is corrupted.");
|
||||||
|
|
||||||
// the future may fail if the listener of the oneshot future
|
// the future may fail if the listener of the oneshot future
|
||||||
@@ -341,13 +362,18 @@ impl SegmentUpdater {
|
|||||||
let _merging_future_res = merging_future_send.send(merged_segment_meta);
|
let _merging_future_res = merging_future_send.send(merged_segment_meta);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
|
warn!(
|
||||||
|
"Merge of {:?} was cancelled: {:?}",
|
||||||
|
merge_operation.segment_ids(),
|
||||||
|
e
|
||||||
|
);
|
||||||
// ... cancel merge
|
// ... cancel merge
|
||||||
if cfg!(test) {
|
if cfg!(test) {
|
||||||
panic!("Merge failed.");
|
panic!("Merge failed.");
|
||||||
}
|
}
|
||||||
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
|
// As `merge_operation` will be dropped, the segment in merge state will
|
||||||
// merging_future_send will be dropped, sending an error to the future.
|
// be available for merge again.
|
||||||
|
// `merging_future_send` will be dropped, sending an error to the future.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
segment_updater_clone
|
segment_updater_clone
|
||||||
@@ -368,16 +394,34 @@ impl SegmentUpdater {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn consider_merge_options(&self) {
|
fn consider_merge_options(&self) {
|
||||||
|
let merge_segment_ids: HashSet<SegmentId> = self.0.merge_operations.segment_in_merge();
|
||||||
let (committed_segments, uncommitted_segments) =
|
let (committed_segments, uncommitted_segments) =
|
||||||
get_mergeable_segments(&self.0.segment_manager);
|
get_mergeable_segments(&merge_segment_ids, &self.0.segment_manager);
|
||||||
|
|
||||||
// Committed segments cannot be merged with uncommitted_segments.
|
// Committed segments cannot be merged with uncommitted_segments.
|
||||||
// We therefore consider merges using these two sets of segments independently.
|
// We therefore consider merges using these two sets of segments independently.
|
||||||
let merge_policy = self.get_merge_policy();
|
let merge_policy = self.get_merge_policy();
|
||||||
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
|
|
||||||
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
|
let current_opstamp = self.0.stamper.stamp();
|
||||||
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
|
let mut merge_candidates: Vec<MergeOperation> = merge_policy
|
||||||
for MergeCandidate(segment_metas) in merge_candidates {
|
.compute_merge_candidates(&uncommitted_segments)
|
||||||
match self.start_merge_impl(&segment_metas) {
|
.into_iter()
|
||||||
|
.map(|merge_candidate| {
|
||||||
|
MergeOperation::new(&self.0.merge_operations, current_opstamp, merge_candidate.0)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let commit_opstamp = self.load_metas().opstamp;
|
||||||
|
let committed_merge_candidates = merge_policy
|
||||||
|
.compute_merge_candidates(&committed_segments)
|
||||||
|
.into_iter()
|
||||||
|
.map(|merge_candidate| {
|
||||||
|
MergeOperation::new(&self.0.merge_operations, commit_opstamp, merge_candidate.0)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
merge_candidates.extend(committed_merge_candidates.into_iter());
|
||||||
|
for merge_operation in merge_candidates {
|
||||||
|
match self.start_merge_impl(merge_operation) {
|
||||||
Ok(merge_future) => {
|
Ok(merge_future) => {
|
||||||
if let Err(e) = merge_future.fuse().poll() {
|
if let Err(e) = merge_future.fuse().poll() {
|
||||||
error!("The merge task failed quickly after starting: {:?}", e);
|
error!("The merge task failed quickly after starting: {:?}", e);
|
||||||
@@ -393,31 +437,16 @@ impl SegmentUpdater {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cancel_merge(
|
|
||||||
&self,
|
|
||||||
before_merge_segment_ids: &[SegmentId],
|
|
||||||
after_merge_segment_entry: SegmentId,
|
|
||||||
) {
|
|
||||||
self.0
|
|
||||||
.segment_manager
|
|
||||||
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn end_merge(
|
fn end_merge(
|
||||||
&self,
|
&self,
|
||||||
before_merge_segment_ids: Vec<SegmentId>,
|
merge_operation: MergeOperation,
|
||||||
mut after_merge_segment_entry: SegmentEntry,
|
mut after_merge_segment_entry: SegmentEntry,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.run_async(move |segment_updater| {
|
self.run_async(move |segment_updater| {
|
||||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||||
if let Some(delete_operation) = delete_cursor.get() {
|
if let Some(delete_operation) = delete_cursor.get() {
|
||||||
let committed_opstamp = segment_updater
|
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||||
.0
|
|
||||||
.index
|
|
||||||
.load_metas()
|
|
||||||
.expect("Failed to read opstamp")
|
|
||||||
.opstamp;
|
|
||||||
if delete_operation.opstamp < committed_opstamp {
|
if delete_operation.opstamp < committed_opstamp {
|
||||||
let index = &segment_updater.0.index;
|
let index = &segment_updater.0.index;
|
||||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||||
@@ -426,16 +455,15 @@ impl SegmentUpdater {
|
|||||||
{
|
{
|
||||||
error!(
|
error!(
|
||||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||||
before_merge_segment_ids, e
|
merge_operation.segment_ids(),
|
||||||
|
e
|
||||||
);
|
);
|
||||||
// ... cancel merge
|
|
||||||
if cfg!(test) {
|
if cfg!(test) {
|
||||||
panic!("Merge failed.");
|
panic!("Merge failed.");
|
||||||
}
|
}
|
||||||
segment_updater.cancel_merge(
|
// ... cancel merge
|
||||||
&before_merge_segment_ids,
|
// `merge_operations` are tracked. As it is dropped, the
|
||||||
after_merge_segment_entry.segment_id(),
|
// the segment_ids will be available again for merge.
|
||||||
);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -443,11 +471,11 @@ impl SegmentUpdater {
|
|||||||
segment_updater
|
segment_updater
|
||||||
.0
|
.0
|
||||||
.segment_manager
|
.segment_manager
|
||||||
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
|
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||||
segment_updater.consider_merge_options();
|
segment_updater.consider_merge_options();
|
||||||
info!("save metas");
|
info!("save metas");
|
||||||
let previous_metas = segment_updater.0.index.load_metas().unwrap();
|
let previous_metas = segment_updater.load_metas();
|
||||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
|
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||||
segment_updater.garbage_collect_files_exec();
|
segment_updater.garbage_collect_files_exec();
|
||||||
})
|
})
|
||||||
.wait()
|
.wait()
|
||||||
@@ -469,32 +497,25 @@ impl SegmentUpdater {
|
|||||||
/// Obsolete files will eventually be cleaned up
|
/// Obsolete files will eventually be cleaned up
|
||||||
/// by the directory garbage collector.
|
/// by the directory garbage collector.
|
||||||
pub fn wait_merging_thread(&self) -> Result<()> {
|
pub fn wait_merging_thread(&self) -> Result<()> {
|
||||||
let mut num_segments: usize;
|
|
||||||
loop {
|
loop {
|
||||||
num_segments = self.0.segment_manager.num_segments();
|
let merging_threads: HashMap<usize, JoinHandle<Result<()>>> = {
|
||||||
|
|
||||||
let mut new_merging_threads = HashMap::new();
|
|
||||||
{
|
|
||||||
let mut merging_threads = self.0.merging_threads.write().unwrap();
|
let mut merging_threads = self.0.merging_threads.write().unwrap();
|
||||||
mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
|
mem::replace(merging_threads.deref_mut(), HashMap::new())
|
||||||
|
};
|
||||||
|
if merging_threads.is_empty() {
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
debug!("wait merging thread {}", new_merging_threads.len());
|
debug!("wait merging thread {}", merging_threads.len());
|
||||||
for (_, merging_thread_handle) in new_merging_threads {
|
for (_, merging_thread_handle) in merging_threads {
|
||||||
merging_thread_handle
|
merging_thread_handle
|
||||||
.join()
|
.join()
|
||||||
.map(|_| ())
|
.map(|_| ())
|
||||||
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
|
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
|
||||||
}
|
}
|
||||||
// Our merging thread may have queued their completed
|
// Our merging thread may have queued their completed merged segment.
|
||||||
|
// Let's wait for that too.
|
||||||
self.run_async(move |_| {}).wait()?;
|
self.run_async(move |_| {}).wait()?;
|
||||||
|
|
||||||
let new_num_segments = self.0.segment_manager.num_segments();
|
|
||||||
|
|
||||||
if new_num_segments >= num_segments {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -514,7 +535,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -558,4 +579,75 @@ mod tests {
|
|||||||
assert_eq!(index.searcher().segment_readers().len(), 1);
|
assert_eq!(index.searcher().segment_readers().len(), 1);
|
||||||
assert_eq!(index.searcher().num_docs(), 302);
|
assert_eq!(index.searcher().num_docs(), 302);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn delete_all_docs() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
|
// writing the segment
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
|
|
||||||
|
{
|
||||||
|
for _ in 0..100 {
|
||||||
|
index_writer.add_document(doc!(text_field=>"a"));
|
||||||
|
index_writer.add_document(doc!(text_field=>"b"));
|
||||||
|
}
|
||||||
|
assert!(index_writer.commit().is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
for _ in 0..100 {
|
||||||
|
index_writer.add_document(doc!(text_field=>"c"));
|
||||||
|
index_writer.add_document(doc!(text_field=>"d"));
|
||||||
|
}
|
||||||
|
assert!(index_writer.commit().is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
index_writer.add_document(doc!(text_field=>"e"));
|
||||||
|
index_writer.add_document(doc!(text_field=>"f"));
|
||||||
|
assert!(index_writer.commit().is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let seg_ids = index
|
||||||
|
.searchable_segment_ids()
|
||||||
|
.expect("Searchable segments failed.");
|
||||||
|
// docs exist, should have at least 1 segment
|
||||||
|
assert!(seg_ids.len() > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
|
||||||
|
for term_val in term_vals {
|
||||||
|
let term = Term::from_field_text(text_field, term_val);
|
||||||
|
index_writer.delete_term(term);
|
||||||
|
assert!(index_writer.commit().is_ok());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
index_writer
|
||||||
|
.wait_merging_threads()
|
||||||
|
.expect("waiting for merging threads");
|
||||||
|
}
|
||||||
|
|
||||||
|
index.load_searchers().unwrap();
|
||||||
|
assert_eq!(index.searcher().num_docs(), 0);
|
||||||
|
|
||||||
|
let seg_ids = index
|
||||||
|
.searchable_segment_ids()
|
||||||
|
.expect("Searchable segments failed.");
|
||||||
|
assert!(seg_ids.is_empty());
|
||||||
|
|
||||||
|
index.load_searchers().unwrap();
|
||||||
|
assert_eq!(index.searcher().num_docs(), 0);
|
||||||
|
// empty segments should be erased
|
||||||
|
assert!(index.searchable_segment_metas().unwrap().is_empty());
|
||||||
|
assert!(index.searcher().segment_readers().is_empty());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,50 +1,77 @@
|
|||||||
|
use std::ops::Range;
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
// AtomicU64 have not landed in stable.
|
// AtomicU64 have not landed in stable.
|
||||||
// For the moment let's just use AtomicUsize on
|
// For the moment let's just use AtomicUsize on
|
||||||
// x86/64 bit platform, and a mutex on other platform.
|
// x86/64 bit platform, and a mutex on other platform.
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
#[cfg(target = "x86_64")]
|
|
||||||
mod archicture_impl {
|
mod archicture_impl {
|
||||||
|
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
#[derive(Clone, Default)]
|
#[derive(Default)]
|
||||||
pub struct Stamper(Arc<AtomicU64>);
|
pub struct AtomicU64Ersatz(AtomicUsize);
|
||||||
|
|
||||||
impl Stamper {
|
impl AtomicU64Ersatz {
|
||||||
pub fn new(first_opstamp: u64) -> Stamper {
|
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn stamp(&self) -> u64 {
|
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
||||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
self.0.fetch_add(val as usize, order) as u64
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(target = "x86_64"))]
|
#[cfg(not(target_arch = "x86_64"))]
|
||||||
mod archicture_impl {
|
mod archicture_impl {
|
||||||
|
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::atomic::Ordering;
|
||||||
|
/// Under other architectures, we rely on a lock (`RwLock`).
|
||||||
|
use std::sync::RwLock;
|
||||||
|
|
||||||
#[derive(Clone, Default)]
|
#[derive(Default)]
|
||||||
pub struct Stamper(Arc<Mutex<u64>>);
|
pub struct AtomicU64Ersatz(RwLock<u64>);
|
||||||
|
|
||||||
impl Stamper {
|
impl AtomicU64Ersatz {
|
||||||
pub fn new(first_opstamp: u64) -> Stamper {
|
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||||
Stamper(Arc::new(Mutex::new(first_opstamp)))
|
AtomicU64Ersatz(RwLock::new(first_opstamp))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn stamp(&self) -> u64 {
|
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
|
||||||
let mut guard = self.0.lock().expect("Failed to lock the stamper");
|
let mut lock = self.0.write().unwrap();
|
||||||
let previous_val = *guard;
|
let previous_val = *lock;
|
||||||
*guard = previous_val + 1;
|
*lock = previous_val + incr;
|
||||||
previous_val
|
previous_val
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub use self::archicture_impl::Stamper;
|
use self::archicture_impl::AtomicU64Ersatz;
|
||||||
|
|
||||||
|
#[derive(Clone, Default)]
|
||||||
|
pub struct Stamper(Arc<AtomicU64Ersatz>);
|
||||||
|
|
||||||
|
impl Stamper {
|
||||||
|
pub fn new(first_opstamp: u64) -> Stamper {
|
||||||
|
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stamp(&self) -> u64 {
|
||||||
|
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Given a desired count `n`, `stamps` returns an iterator that
|
||||||
|
/// will supply `n` number of u64 stamps.
|
||||||
|
pub fn stamps(&self, n: u64) -> Range<u64> {
|
||||||
|
let start = self.0.fetch_add(n, Ordering::SeqCst);
|
||||||
|
Range {
|
||||||
|
start: start,
|
||||||
|
end: start + n,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
@@ -62,5 +89,7 @@ mod test {
|
|||||||
|
|
||||||
assert_eq!(stamper.stamp(), 10u64);
|
assert_eq!(stamper.stamp(), 10u64);
|
||||||
assert_eq!(stamper_clone.stamp(), 11u64);
|
assert_eq!(stamper_clone.stamp(), 11u64);
|
||||||
|
assert_eq!(stamper.stamps(3u64), (12..15));
|
||||||
|
assert_eq!(stamper.stamp(), 15u64);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
40
src/lib.rs
40
src/lib.rs
@@ -129,10 +129,7 @@ extern crate base64;
|
|||||||
extern crate bit_set;
|
extern crate bit_set;
|
||||||
extern crate bitpacking;
|
extern crate bitpacking;
|
||||||
extern crate byteorder;
|
extern crate byteorder;
|
||||||
extern crate scoped_pool;
|
|
||||||
|
|
||||||
extern crate combine;
|
extern crate combine;
|
||||||
|
|
||||||
extern crate crossbeam;
|
extern crate crossbeam;
|
||||||
extern crate fnv;
|
extern crate fnv;
|
||||||
extern crate fst;
|
extern crate fst;
|
||||||
@@ -146,6 +143,7 @@ extern crate num_cpus;
|
|||||||
extern crate owning_ref;
|
extern crate owning_ref;
|
||||||
extern crate regex;
|
extern crate regex;
|
||||||
extern crate rust_stemmers;
|
extern crate rust_stemmers;
|
||||||
|
extern crate scoped_pool;
|
||||||
extern crate serde;
|
extern crate serde;
|
||||||
extern crate stable_deref_trait;
|
extern crate stable_deref_trait;
|
||||||
extern crate tempdir;
|
extern crate tempdir;
|
||||||
@@ -170,7 +168,7 @@ extern crate maplit;
|
|||||||
extern crate test;
|
extern crate test;
|
||||||
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate downcast;
|
extern crate downcast_rs;
|
||||||
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate fail;
|
extern crate fail;
|
||||||
@@ -231,11 +229,7 @@ pub use common::{i64_to_u64, u64_to_i64};
|
|||||||
/// Expose the current version of tantivy, as well
|
/// Expose the current version of tantivy, as well
|
||||||
/// whether it was compiled with the simd compression.
|
/// whether it was compiled with the simd compression.
|
||||||
pub fn version() -> &'static str {
|
pub fn version() -> &'static str {
|
||||||
if cfg!(feature = "simdcompression") {
|
env!("CARGO_PKG_VERSION")
|
||||||
concat!(env!("CARGO_PKG_VERSION"), "-simd")
|
|
||||||
} else {
|
|
||||||
concat!(env!("CARGO_PKG_VERSION"), "-nosimd")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Defines tantivy's merging strategy
|
/// Defines tantivy's merging strategy
|
||||||
@@ -348,7 +342,7 @@ mod tests {
|
|||||||
let index = Index::create_from_tempdir(schema).unwrap();
|
let index = Index::create_from_tempdir(schema).unwrap();
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field=>"af b");
|
let doc = doc!(text_field=>"af b");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -370,7 +364,7 @@ mod tests {
|
|||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
index_writer.add_document(doc!(text_field=>"a b c"));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
@@ -412,7 +406,7 @@ mod tests {
|
|||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field=>"a b c");
|
let doc = doc!(text_field=>"a b c");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -440,7 +434,7 @@ mod tests {
|
|||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field=>"a b c");
|
let doc = doc!(text_field=>"a b c");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -487,7 +481,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
// 0
|
// 0
|
||||||
index_writer.add_document(doc!(text_field=>"a b"));
|
index_writer.add_document(doc!(text_field=>"a b"));
|
||||||
// 1
|
// 1
|
||||||
@@ -534,7 +528,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
// 0
|
// 0
|
||||||
index_writer.add_document(doc!(text_field=>"a b"));
|
index_writer.add_document(doc!(text_field=>"a b"));
|
||||||
// 1
|
// 1
|
||||||
@@ -571,7 +565,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a b"));
|
index_writer.add_document(doc!(text_field=>"a b"));
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||||
index_writer.rollback().unwrap();
|
index_writer.rollback().unwrap();
|
||||||
@@ -620,7 +614,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(field=>1u64));
|
index_writer.add_document(doc!(field=>1u64));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
@@ -643,7 +637,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let negative_val = -1i64;
|
let negative_val = -1i64;
|
||||||
index_writer.add_document(doc!(value_field => negative_val));
|
index_writer.add_document(doc!(value_field => negative_val));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
@@ -667,7 +661,7 @@ mod tests {
|
|||||||
let absent_field = schema_builder.add_text_field("text", TEXT);
|
let absent_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a"));
|
index_writer.add_document(doc!(text_field=>"a"));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
assert!(index.load_searchers().is_ok());
|
assert!(index.load_searchers().is_ok());
|
||||||
@@ -684,7 +678,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
||||||
|
|
||||||
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
|
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
|
||||||
let doc = doc!(text_field=>val);
|
let doc = doc!(text_field=>val);
|
||||||
@@ -720,7 +714,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field=>"af af af bc bc");
|
let doc = doc!(text_field=>"af af af bc bc");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -756,7 +750,7 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"af af af b"));
|
index_writer.add_document(doc!(text_field=>"af af af b"));
|
||||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
index_writer.add_document(doc!(text_field=>"a b c"));
|
||||||
index_writer.add_document(doc!(text_field=>"a b c d"));
|
index_writer.add_document(doc!(text_field=>"a b c d"));
|
||||||
@@ -809,7 +803,7 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field=>"af b");
|
let doc = doc!(text_field=>"af b");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
|
|||||||
@@ -34,10 +34,6 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
|||||||
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
|
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
|
||||||
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
|
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
|
||||||
|
|
||||||
lazy_static! {
|
|
||||||
static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests {
|
pub mod tests {
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,23 @@
|
|||||||
use super::BIT_PACKER;
|
/// Positions are stored as a long sequence of compressed blocks.
|
||||||
|
/// All terms are chained one after the other.
|
||||||
|
///
|
||||||
|
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
|
||||||
|
/// This means we need to skip to the `nth` positions efficiently.
|
||||||
|
///
|
||||||
|
/// This is done thanks to two levels of skipping that we refer to in the code
|
||||||
|
/// as `long_skip` and `short_skip`.
|
||||||
|
///
|
||||||
|
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
|
||||||
|
/// Skipping offsets are simply stored one after the other, each as an offset over 8 bytes.
|
||||||
|
///
|
||||||
|
/// We find the number of long skips, as `n / long_skip`.
|
||||||
|
///
|
||||||
|
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bits
|
||||||
|
/// (values can go from 0 bits to 32 bits) required to decompress every block.
|
||||||
|
///
|
||||||
|
/// A given block takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)` bytes,
|
||||||
|
/// so skipping a block without decompressing it is just a matter of advancing that many
|
||||||
|
/// bytes.
|
||||||
use bitpacking::{BitPacker, BitPacker4x};
|
use bitpacking::{BitPacker, BitPacker4x};
|
||||||
use common::{BinarySerializable, FixedSize};
|
use common::{BinarySerializable, FixedSize};
|
||||||
use directory::ReadOnlySource;
|
use directory::ReadOnlySource;
|
||||||
@@ -8,9 +27,65 @@ use positions::LONG_SKIP_INTERVAL;
|
|||||||
use positions::LONG_SKIP_IN_BLOCKS;
|
use positions::LONG_SKIP_IN_BLOCKS;
|
||||||
use postings::compression::compressed_block_size;
|
use postings::compression::compressed_block_size;
|
||||||
|
|
||||||
|
struct Positions {
|
||||||
|
bit_packer: BitPacker4x,
|
||||||
|
skip_source: ReadOnlySource,
|
||||||
|
position_source: ReadOnlySource,
|
||||||
|
long_skip_source: ReadOnlySource,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Positions {
|
||||||
|
pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
|
||||||
|
let skip_len = skip_source.len();
|
||||||
|
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
|
||||||
|
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
|
||||||
|
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
|
||||||
|
let (skip_source, long_skip_source) = body.split(body_split);
|
||||||
|
Positions {
|
||||||
|
bit_packer: BitPacker4x::new(),
|
||||||
|
skip_source,
|
||||||
|
long_skip_source,
|
||||||
|
position_source,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the offset of the block associated to the given `long_skip_id`.
|
||||||
|
///
|
||||||
|
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
|
||||||
|
fn long_skip(&self, long_skip_id: usize) -> u64 {
|
||||||
|
if long_skip_id == 0 {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
let long_skip_slice = self.long_skip_source.as_slice();
|
||||||
|
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
|
||||||
|
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reader(&self, offset: u64) -> PositionReader {
|
||||||
|
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
||||||
|
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
|
||||||
|
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
|
||||||
|
let mut position_read = OwnedRead::new(self.position_source.clone());
|
||||||
|
position_read.advance(offset_num_bytes as usize);
|
||||||
|
let mut skip_read = OwnedRead::new(self.skip_source.clone());
|
||||||
|
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
||||||
|
let mut position_reader = PositionReader {
|
||||||
|
bit_packer: self.bit_packer,
|
||||||
|
skip_read,
|
||||||
|
position_read,
|
||||||
|
inner_offset: 0,
|
||||||
|
buffer: Box::new([0u32; 128]),
|
||||||
|
ahead: None,
|
||||||
|
};
|
||||||
|
position_reader.skip(small_skip);
|
||||||
|
position_reader
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct PositionReader {
|
pub struct PositionReader {
|
||||||
skip_read: OwnedRead,
|
skip_read: OwnedRead,
|
||||||
position_read: OwnedRead,
|
position_read: OwnedRead,
|
||||||
|
bit_packer: BitPacker4x,
|
||||||
inner_offset: usize,
|
inner_offset: usize,
|
||||||
buffer: Box<[u32; 128]>,
|
buffer: Box<[u32; 128]>,
|
||||||
ahead: Option<usize>, // if None, no block is loaded.
|
ahead: Option<usize>, // if None, no block is loaded.
|
||||||
@@ -27,6 +102,7 @@ pub struct PositionReader {
|
|||||||
// If the requested number of els ends exactly at a given block, the next
|
// If the requested number of els ends exactly at a given block, the next
|
||||||
// block is not decompressed.
|
// block is not decompressed.
|
||||||
fn read_impl(
|
fn read_impl(
|
||||||
|
bit_packer: BitPacker4x,
|
||||||
mut position: &[u8],
|
mut position: &[u8],
|
||||||
buffer: &mut [u32; 128],
|
buffer: &mut [u32; 128],
|
||||||
mut inner_offset: usize,
|
mut inner_offset: usize,
|
||||||
@@ -37,21 +113,23 @@ fn read_impl(
|
|||||||
let mut output_len = output.len();
|
let mut output_len = output.len();
|
||||||
let mut ahead = 0;
|
let mut ahead = 0;
|
||||||
loop {
|
loop {
|
||||||
let available_len = 128 - inner_offset;
|
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
|
||||||
|
// We have enough elements in the current block.
|
||||||
|
// Let's copy the requested elements in the output buffer,
|
||||||
|
// and return.
|
||||||
if output_len <= available_len {
|
if output_len <= available_len {
|
||||||
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
|
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
|
||||||
return ahead;
|
return ahead;
|
||||||
} else {
|
|
||||||
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
|
||||||
output_len -= available_len;
|
|
||||||
output_start += available_len;
|
|
||||||
inner_offset = 0;
|
|
||||||
let num_bits = num_bits[ahead];
|
|
||||||
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
|
|
||||||
let block_len = compressed_block_size(num_bits);
|
|
||||||
position = &position[block_len..];
|
|
||||||
ahead += 1;
|
|
||||||
}
|
}
|
||||||
|
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
||||||
|
output_len -= available_len;
|
||||||
|
output_start += available_len;
|
||||||
|
inner_offset = 0;
|
||||||
|
let num_bits = num_bits[ahead];
|
||||||
|
bit_packer.decompress(position, &mut buffer[..], num_bits);
|
||||||
|
let block_len = compressed_block_size(num_bits);
|
||||||
|
position = &position[block_len..];
|
||||||
|
ahead += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -61,35 +139,7 @@ impl PositionReader {
|
|||||||
skip_source: ReadOnlySource,
|
skip_source: ReadOnlySource,
|
||||||
offset: u64,
|
offset: u64,
|
||||||
) -> PositionReader {
|
) -> PositionReader {
|
||||||
let skip_len = skip_source.len();
|
Positions::new(position_source, skip_source).reader(offset)
|
||||||
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
|
|
||||||
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
|
|
||||||
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
|
|
||||||
let (skip_body, long_skips) = body.split(body_split);
|
|
||||||
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
|
||||||
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
|
|
||||||
let offset_num_bytes: u64 = {
|
|
||||||
if long_skip_id > 0 {
|
|
||||||
let mut long_skip_blocks: &[u8] =
|
|
||||||
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
|
|
||||||
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let mut position_read = OwnedRead::new(position_source);
|
|
||||||
position_read.advance(offset_num_bytes as usize);
|
|
||||||
let mut skip_read = OwnedRead::new(skip_body);
|
|
||||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
|
||||||
let mut position_reader = PositionReader {
|
|
||||||
skip_read,
|
|
||||||
position_read,
|
|
||||||
inner_offset: 0,
|
|
||||||
buffer: Box::new([0u32; 128]),
|
|
||||||
ahead: None,
|
|
||||||
};
|
|
||||||
position_reader.skip(small_skip);
|
|
||||||
position_reader
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fills a buffer with the next `output.len()` integers.
|
/// Fills a buffer with the next `output.len()` integers.
|
||||||
@@ -101,10 +151,12 @@ impl PositionReader {
|
|||||||
if self.ahead != Some(0) {
|
if self.ahead != Some(0) {
|
||||||
// the block currently available is not the block
|
// the block currently available is not the block
|
||||||
// for the current position
|
// for the current position
|
||||||
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits);
|
self.bit_packer.decompress(position_data, self.buffer.as_mut(), num_bits);
|
||||||
|
self.ahead = Some(0);
|
||||||
}
|
}
|
||||||
let block_len = compressed_block_size(num_bits);
|
let block_len = compressed_block_size(num_bits);
|
||||||
self.ahead = Some(read_impl(
|
self.ahead = Some(read_impl(
|
||||||
|
self.bit_packer,
|
||||||
&position_data[block_len..],
|
&position_data[block_len..],
|
||||||
self.buffer.as_mut(),
|
self.buffer.as_mut(),
|
||||||
self.inner_offset,
|
self.inner_offset,
|
||||||
@@ -133,14 +185,13 @@ impl PositionReader {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
|
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
|
||||||
.iter()
|
.iter()
|
||||||
.cloned()
|
.map(|num_bits| *num_bits as usize)
|
||||||
.map(|num_bit| num_bit as usize)
|
|
||||||
.sum::<usize>()
|
.sum::<usize>()
|
||||||
* (COMPRESSION_BLOCK_SIZE / 8);
|
* COMPRESSION_BLOCK_SIZE;
|
||||||
|
let skip_len_in_bytes = skip_len_in_bits / 8;
|
||||||
self.skip_read.advance(num_blocks_to_advance);
|
self.skip_read.advance(num_blocks_to_advance);
|
||||||
self.position_read.advance(skip_len);
|
self.position_read.advance(skip_len_in_bytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,29 +1,30 @@
|
|||||||
use super::BIT_PACKER;
|
|
||||||
use bitpacking::BitPacker;
|
use bitpacking::BitPacker;
|
||||||
use common::BinarySerializable;
|
use common::BinarySerializable;
|
||||||
|
use common::CountingWriter;
|
||||||
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||||
use std::io;
|
use std::io::{self, Write};
|
||||||
|
use bitpacking::BitPacker4x;
|
||||||
|
|
||||||
pub struct PositionSerializer<W: io::Write> {
|
pub struct PositionSerializer<W: io::Write> {
|
||||||
write_stream: W,
|
bit_packer: BitPacker4x,
|
||||||
|
write_stream: CountingWriter<W>,
|
||||||
write_skiplist: W,
|
write_skiplist: W,
|
||||||
block: Vec<u32>,
|
block: Vec<u32>,
|
||||||
buffer: Vec<u8>,
|
buffer: Vec<u8>,
|
||||||
num_ints: u64,
|
num_ints: u64,
|
||||||
long_skips: Vec<u64>,
|
long_skips: Vec<u64>,
|
||||||
cumulated_num_bits: u64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: io::Write> PositionSerializer<W> {
|
impl<W: io::Write> PositionSerializer<W> {
|
||||||
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
|
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
|
||||||
PositionSerializer {
|
PositionSerializer {
|
||||||
write_stream,
|
bit_packer: BitPacker4x::new(),
|
||||||
|
write_stream: CountingWriter::wrap(write_stream),
|
||||||
write_skiplist,
|
write_skiplist,
|
||||||
block: Vec::with_capacity(128),
|
block: Vec::with_capacity(128),
|
||||||
buffer: vec![0u8; 128 * 4],
|
buffer: vec![0u8; 128 * 4],
|
||||||
num_ints: 0u64,
|
num_ints: 0u64,
|
||||||
long_skips: Vec::new(),
|
long_skips: Vec::new(),
|
||||||
cumulated_num_bits: 0u64,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,14 +51,13 @@ impl<W: io::Write> PositionSerializer<W> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn flush_block(&mut self) -> io::Result<()> {
|
fn flush_block(&mut self) -> io::Result<()> {
|
||||||
let num_bits = BIT_PACKER.num_bits(&self.block[..]);
|
let num_bits = self.bit_packer.num_bits(&self.block[..]);
|
||||||
self.cumulated_num_bits += u64::from(num_bits);
|
|
||||||
self.write_skiplist.write_all(&[num_bits])?;
|
self.write_skiplist.write_all(&[num_bits])?;
|
||||||
let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
|
let written_len = self.bit_packer.compress(&self.block[..], &mut self.buffer, num_bits);
|
||||||
self.write_stream.write_all(&self.buffer[..written_len])?;
|
self.write_stream.write_all(&self.buffer[..written_len])?;
|
||||||
self.block.clear();
|
self.block.clear();
|
||||||
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
|
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
|
||||||
self.long_skips.push(self.cumulated_num_bits);
|
self.long_skips.push(self.write_stream.written_bytes());
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -280,7 +280,7 @@ pub mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
doc.add_text(text_field, "g b b d c g c");
|
doc.add_text(text_field, "g b b d c g c");
|
||||||
@@ -322,7 +322,7 @@ pub mod tests {
|
|||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
for i in 0..num_docs {
|
for i in 0..num_docs {
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
doc.add_u64(value_field, 2);
|
doc.add_u64(value_field, 2);
|
||||||
@@ -399,7 +399,7 @@ pub mod tests {
|
|||||||
|
|
||||||
// delete some of the documents
|
// delete some of the documents
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.delete_term(term_0);
|
index_writer.delete_term(term_0);
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
@@ -449,7 +449,7 @@ pub mod tests {
|
|||||||
|
|
||||||
// delete everything else
|
// delete everything else
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.delete_term(term_1);
|
index_writer.delete_term(term_1);
|
||||||
|
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
@@ -457,25 +457,14 @@ pub mod tests {
|
|||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
|
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
|
||||||
|
|
||||||
// finally, check that it's empty
|
// finally, check that it's empty
|
||||||
{
|
{
|
||||||
let mut segment_postings = segment_reader
|
let searchable_segment_ids = index
|
||||||
.inverted_index(term_2.field())
|
.searchable_segment_ids()
|
||||||
.read_postings(&term_2, IndexRecordOption::Basic)
|
.expect("could not get index segment ids");
|
||||||
.unwrap();
|
assert!(searchable_segment_ids.is_empty());
|
||||||
|
assert_eq!(searcher.num_docs(), 0);
|
||||||
assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
|
|
||||||
assert_eq!(segment_postings.doc(), 0);
|
|
||||||
assert!(segment_reader.is_deleted(0));
|
|
||||||
|
|
||||||
let mut segment_postings = segment_reader
|
|
||||||
.inverted_index(term_2.field())
|
|
||||||
.read_postings(&term_2, IndexRecordOption::Basic)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -506,7 +495,7 @@ pub mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let posting_list_size = 1_000_000;
|
let posting_list_size = 1_000_000;
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
for _ in 0..posting_list_size {
|
for _ in 0..posting_list_size {
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
if rng.gen_bool(1f64 / 15f64) {
|
if rng.gen_bool(1f64 / 15f64) {
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||||
|
|
||||||
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
use postings::recorder::{
|
||||||
|
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
|
||||||
|
};
|
||||||
use postings::UnorderedTermId;
|
use postings::UnorderedTermId;
|
||||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||||
use schema::IndexRecordOption;
|
use schema::IndexRecordOption;
|
||||||
@@ -213,7 +215,7 @@ pub trait PostingsWriter {
|
|||||||
|
|
||||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||||
/// dispatch to the recorder information.
|
/// dispatch to the recorder information.
|
||||||
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||||
total_num_tokens: u64,
|
total_num_tokens: u64,
|
||||||
_recorder_type: PhantomData<Rec>,
|
_recorder_type: PhantomData<Rec>,
|
||||||
}
|
}
|
||||||
@@ -245,8 +247,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
|||||||
debug_assert!(term.as_slice().len() >= 4);
|
debug_assert!(term.as_slice().len() >= 4);
|
||||||
self.total_num_tokens += 1;
|
self.total_num_tokens += 1;
|
||||||
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
|
||||||
if opt_recorder.is_some() {
|
if let Some(mut recorder) = opt_recorder {
|
||||||
let mut recorder = opt_recorder.unwrap();
|
|
||||||
let current_doc = recorder.current_doc();
|
let current_doc = recorder.current_doc();
|
||||||
if current_doc != doc {
|
if current_doc != doc {
|
||||||
recorder.close_doc(heap);
|
recorder.close_doc(heap);
|
||||||
@@ -255,7 +256,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
|||||||
recorder.record_position(position, heap);
|
recorder.record_position(position, heap);
|
||||||
recorder
|
recorder
|
||||||
} else {
|
} else {
|
||||||
let mut recorder = Rec::new(heap);
|
let mut recorder = Rec::new();
|
||||||
recorder.new_doc(doc, heap);
|
recorder.new_doc(doc, heap);
|
||||||
recorder.record_position(position, heap);
|
recorder.record_position(position, heap);
|
||||||
recorder
|
recorder
|
||||||
@@ -270,10 +271,11 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
|||||||
termdict_heap: &MemoryArena,
|
termdict_heap: &MemoryArena,
|
||||||
heap: &MemoryArena,
|
heap: &MemoryArena,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
|
let mut buffer_lender = BufferLender::default();
|
||||||
for &(term_bytes, addr, _) in term_addrs {
|
for &(term_bytes, addr, _) in term_addrs {
|
||||||
let recorder: Rec = unsafe { termdict_heap.read(addr) };
|
let recorder: Rec = termdict_heap.read(addr);
|
||||||
serializer.new_term(&term_bytes[4..])?;
|
serializer.new_term(&term_bytes[4..])?;
|
||||||
recorder.serialize(serializer, heap)?;
|
recorder.serialize(&mut buffer_lender, serializer, heap)?;
|
||||||
serializer.close_term()?;
|
serializer.close_term()?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -1,10 +1,51 @@
|
|||||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||||
|
use common::{read_u32_vint, write_u32_vint};
|
||||||
use postings::FieldSerializer;
|
use postings::FieldSerializer;
|
||||||
use std::{self, io};
|
use std::io;
|
||||||
use DocId;
|
use DocId;
|
||||||
|
|
||||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||||
const POSITION_END: u32 = std::u32::MAX;
|
const POSITION_END: u32 = 0;
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
pub(crate) struct BufferLender {
|
||||||
|
buffer_u8: Vec<u8>,
|
||||||
|
buffer_u32: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BufferLender {
|
||||||
|
pub fn lend_u8(&mut self) -> &mut Vec<u8> {
|
||||||
|
self.buffer_u8.clear();
|
||||||
|
&mut self.buffer_u8
|
||||||
|
}
|
||||||
|
pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
|
||||||
|
self.buffer_u8.clear();
|
||||||
|
self.buffer_u32.clear();
|
||||||
|
(&mut self.buffer_u8, &mut self.buffer_u32)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct VInt32Reader<'a> {
|
||||||
|
data: &'a [u8],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> VInt32Reader<'a> {
|
||||||
|
fn new(data: &'a [u8]) -> VInt32Reader<'a> {
|
||||||
|
VInt32Reader { data }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for VInt32Reader<'a> {
|
||||||
|
type Item = u32;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<u32> {
|
||||||
|
if self.data.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(read_u32_vint(&mut self.data))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Recorder is in charge of recording relevant information about
|
/// Recorder is in charge of recording relevant information about
|
||||||
/// the presence of a term in a document.
|
/// the presence of a term in a document.
|
||||||
@@ -15,9 +56,9 @@ const POSITION_END: u32 = std::u32::MAX;
|
|||||||
/// * the document id
|
/// * the document id
|
||||||
/// * the term frequency
|
/// * the term frequency
|
||||||
/// * the term positions
|
/// * the term positions
|
||||||
pub trait Recorder: Copy {
|
pub(crate) trait Recorder: Copy + 'static {
|
||||||
///
|
///
|
||||||
fn new(heap: &mut MemoryArena) -> Self;
|
fn new() -> Self;
|
||||||
/// Returns the current document
|
/// Returns the current document
|
||||||
fn current_doc(&self) -> u32;
|
fn current_doc(&self) -> u32;
|
||||||
/// Starts recording information about a new document
|
/// Starts recording information about a new document
|
||||||
@@ -29,7 +70,12 @@ pub trait Recorder: Copy {
|
|||||||
/// Close the document. It will help record the term frequency.
|
/// Close the document. It will help record the term frequency.
|
||||||
fn close_doc(&mut self, heap: &mut MemoryArena);
|
fn close_doc(&mut self, heap: &mut MemoryArena);
|
||||||
/// Pushes the postings information to the serializer.
|
/// Pushes the postings information to the serializer.
|
||||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
|
fn serialize(
|
||||||
|
&self,
|
||||||
|
buffer_lender: &mut BufferLender,
|
||||||
|
serializer: &mut FieldSerializer,
|
||||||
|
heap: &MemoryArena,
|
||||||
|
) -> io::Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Only records the doc ids
|
/// Only records the doc ids
|
||||||
@@ -40,9 +86,9 @@ pub struct NothingRecorder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Recorder for NothingRecorder {
|
impl Recorder for NothingRecorder {
|
||||||
fn new(heap: &mut MemoryArena) -> Self {
|
fn new() -> Self {
|
||||||
NothingRecorder {
|
NothingRecorder {
|
||||||
stack: ExpUnrolledLinkedList::new(heap),
|
stack: ExpUnrolledLinkedList::new(),
|
||||||
current_doc: u32::max_value(),
|
current_doc: u32::max_value(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -53,16 +99,23 @@ impl Recorder for NothingRecorder {
|
|||||||
|
|
||||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||||
self.current_doc = doc;
|
self.current_doc = doc;
|
||||||
self.stack.push(doc, heap);
|
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
|
||||||
|
|
||||||
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
|
||||||
|
|
||||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
fn serialize(
|
||||||
for doc in self.stack.iter(heap) {
|
&self,
|
||||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
buffer_lender: &mut BufferLender,
|
||||||
|
serializer: &mut FieldSerializer,
|
||||||
|
heap: &MemoryArena,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
let buffer = buffer_lender.lend_u8();
|
||||||
|
self.stack.read_to_end(heap, buffer);
|
||||||
|
for doc in VInt32Reader::new(&buffer[..]) {
|
||||||
|
serializer.write_doc(doc as u32, 0u32, &EMPTY_ARRAY)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -77,9 +130,9 @@ pub struct TermFrequencyRecorder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Recorder for TermFrequencyRecorder {
|
impl Recorder for TermFrequencyRecorder {
|
||||||
fn new(heap: &mut MemoryArena) -> Self {
|
fn new() -> Self {
|
||||||
TermFrequencyRecorder {
|
TermFrequencyRecorder {
|
||||||
stack: ExpUnrolledLinkedList::new(heap),
|
stack: ExpUnrolledLinkedList::new(),
|
||||||
current_doc: u32::max_value(),
|
current_doc: u32::max_value(),
|
||||||
current_tf: 0u32,
|
current_tf: 0u32,
|
||||||
}
|
}
|
||||||
@@ -91,7 +144,7 @@ impl Recorder for TermFrequencyRecorder {
|
|||||||
|
|
||||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||||
self.current_doc = doc;
|
self.current_doc = doc;
|
||||||
self.stack.push(doc, heap);
|
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
|
||||||
@@ -100,24 +153,24 @@ impl Recorder for TermFrequencyRecorder {
|
|||||||
|
|
||||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||||
debug_assert!(self.current_tf > 0);
|
debug_assert!(self.current_tf > 0);
|
||||||
self.stack.push(self.current_tf, heap);
|
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
|
||||||
self.current_tf = 0;
|
self.current_tf = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
fn serialize(
|
||||||
// the last document has not been closed...
|
&self,
|
||||||
// its term freq is self.current_tf.
|
buffer_lender: &mut BufferLender,
|
||||||
let mut doc_iter = self
|
serializer: &mut FieldSerializer,
|
||||||
.stack
|
heap: &MemoryArena,
|
||||||
.iter(heap)
|
) -> io::Result<()> {
|
||||||
.chain(Some(self.current_tf).into_iter());
|
let buffer = buffer_lender.lend_u8();
|
||||||
|
self.stack.read_to_end(heap, buffer);
|
||||||
while let Some(doc) = doc_iter.next() {
|
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||||
let term_freq = doc_iter
|
while let Some(doc) = u32_it.next() {
|
||||||
.next()
|
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||||
.expect("The IndexWriter recorded a doc without a term freq.");
|
serializer.write_doc(doc as u32, term_freq, &EMPTY_ARRAY)?;
|
||||||
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -128,11 +181,10 @@ pub struct TFAndPositionRecorder {
|
|||||||
stack: ExpUnrolledLinkedList,
|
stack: ExpUnrolledLinkedList,
|
||||||
current_doc: DocId,
|
current_doc: DocId,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Recorder for TFAndPositionRecorder {
|
impl Recorder for TFAndPositionRecorder {
|
||||||
fn new(heap: &mut MemoryArena) -> Self {
|
fn new() -> Self {
|
||||||
TFAndPositionRecorder {
|
TFAndPositionRecorder {
|
||||||
stack: ExpUnrolledLinkedList::new(heap),
|
stack: ExpUnrolledLinkedList::new(),
|
||||||
current_doc: u32::max_value(),
|
current_doc: u32::max_value(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -143,33 +195,88 @@ impl Recorder for TFAndPositionRecorder {
|
|||||||
|
|
||||||
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
|
||||||
self.current_doc = doc;
|
self.current_doc = doc;
|
||||||
self.stack.push(doc, heap);
|
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
|
||||||
self.stack.push(position, heap);
|
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
fn close_doc(&mut self, heap: &mut MemoryArena) {
|
||||||
self.stack.push(POSITION_END, heap);
|
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
|
fn serialize(
|
||||||
let mut doc_positions = Vec::with_capacity(100);
|
&self,
|
||||||
let mut positions_iter = self.stack.iter(heap);
|
buffer_lender: &mut BufferLender,
|
||||||
while let Some(doc) = positions_iter.next() {
|
serializer: &mut FieldSerializer,
|
||||||
let mut prev_position = 0;
|
heap: &MemoryArena,
|
||||||
doc_positions.clear();
|
) -> io::Result<()> {
|
||||||
for position in &mut positions_iter {
|
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
|
||||||
if position == POSITION_END {
|
self.stack.read_to_end(heap, buffer_u8);
|
||||||
break;
|
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
|
||||||
} else {
|
while let Some(doc) = u32_it.next() {
|
||||||
doc_positions.push(position - prev_position);
|
let mut prev_position_plus_one = 1u32;
|
||||||
prev_position = position;
|
buffer_positions.clear();
|
||||||
|
loop {
|
||||||
|
match u32_it.next() {
|
||||||
|
Some(POSITION_END) | None => {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Some(position_plus_one) => {
|
||||||
|
let delta_position = position_plus_one - prev_position_plus_one;
|
||||||
|
buffer_positions.push(delta_position);
|
||||||
|
prev_position_plus_one = position_plus_one;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
|
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
|
||||||
|
use super::write_u32_vint;
|
||||||
|
use super::BufferLender;
|
||||||
|
use super::VInt32Reader;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_buffer_lender() {
|
||||||
|
let mut buffer_lender = BufferLender::default();
|
||||||
|
{
|
||||||
|
let buf = buffer_lender.lend_u8();
|
||||||
|
assert!(buf.is_empty());
|
||||||
|
buf.push(1u8);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let buf = buffer_lender.lend_u8();
|
||||||
|
assert!(buf.is_empty());
|
||||||
|
buf.push(1u8);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let (_, buf) = buffer_lender.lend_all();
|
||||||
|
assert!(buf.is_empty());
|
||||||
|
buf.push(1u32);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let (_, buf) = buffer_lender.lend_all();
|
||||||
|
assert!(buf.is_empty());
|
||||||
|
buf.push(1u32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_vint_u32() {
|
||||||
|
let mut buffer = vec![];
|
||||||
|
let vals = [0, 1, 324_234_234, u32::max_value()];
|
||||||
|
for &i in &vals {
|
||||||
|
assert!(write_u32_vint(i, &mut buffer).is_ok());
|
||||||
|
}
|
||||||
|
assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
|
||||||
|
let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
|
||||||
|
assert_eq!(&res[..], &vals[..]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -123,22 +123,23 @@ impl SegmentPostings {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
fn linear_search(arr: &[u32], target: u32) -> usize {
|
||||||
let mut start = 0;
|
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||||
let end = arr.len();
|
let end = arr.len();
|
||||||
debug_assert!(target <= arr[end - 1]);
|
let mut begin = 0;
|
||||||
let mut jump = 1;
|
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
||||||
loop {
|
if pivot >= end {
|
||||||
let new = start + jump;
|
break;
|
||||||
if new >= end {
|
|
||||||
return (start, end);
|
|
||||||
}
|
}
|
||||||
if arr[new] > target {
|
if arr[pivot] > target {
|
||||||
return (start, new);
|
return (begin, pivot);
|
||||||
}
|
}
|
||||||
start = new;
|
begin = pivot;
|
||||||
jump *= 2;
|
|
||||||
}
|
}
|
||||||
|
(begin, end)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Search the first index containing an element greater or equal to the target.
|
/// Search the first index containing an element greater or equal to the target.
|
||||||
@@ -149,12 +150,8 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
|||||||
/// The target is assumed greater or equal to the first element.
|
/// The target is assumed greater or equal to the first element.
|
||||||
/// The target is assumed smaller or equal to the last element.
|
/// The target is assumed smaller or equal to the last element.
|
||||||
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
||||||
let (start, end) = exponential_search(target, block_docs);
|
let (start, end) = exponential_search(block_docs, target);
|
||||||
start.wrapping_add(
|
start + linear_search(&block_docs[start..end], target)
|
||||||
block_docs[start..end]
|
|
||||||
.binary_search(&target)
|
|
||||||
.unwrap_or_else(|e| e),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocSet for SegmentPostings {
|
impl DocSet for SegmentPostings {
|
||||||
@@ -621,6 +618,7 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
|||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::exponential_search;
|
use super::exponential_search;
|
||||||
|
use super::linear_search;
|
||||||
use super::search_within_block;
|
use super::search_within_block;
|
||||||
use super::BlockSegmentPostings;
|
use super::BlockSegmentPostings;
|
||||||
use super::BlockSegmentPostingsSkipResult;
|
use super::BlockSegmentPostingsSkipResult;
|
||||||
@@ -636,6 +634,21 @@ mod tests {
|
|||||||
use DocId;
|
use DocId;
|
||||||
use SkipResult;
|
use SkipResult;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_linear_search() {
|
||||||
|
let len: usize = 50;
|
||||||
|
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
|
||||||
|
for target in 1..*arr.last().unwrap() {
|
||||||
|
let res = linear_search(&arr[..], target);
|
||||||
|
if res > 0 {
|
||||||
|
assert!(arr[res - 1] < target);
|
||||||
|
}
|
||||||
|
if res < len {
|
||||||
|
assert!(arr[res] >= target);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_empty_segment_postings() {
|
fn test_empty_segment_postings() {
|
||||||
let mut postings = SegmentPostings::empty();
|
let mut postings = SegmentPostings::empty();
|
||||||
@@ -664,10 +677,10 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_exponentiel_search() {
|
fn test_exponentiel_search() {
|
||||||
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
|
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
|
||||||
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
|
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
|
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
|
||||||
(3, 7)
|
(3, 7)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -752,7 +765,7 @@ mod tests {
|
|||||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let mut last_doc = 0u32;
|
let mut last_doc = 0u32;
|
||||||
for &doc in docs {
|
for &doc in docs {
|
||||||
for _ in last_doc..doc {
|
for _ in last_doc..doc {
|
||||||
@@ -823,7 +836,7 @@ mod tests {
|
|||||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
// create two postings list, one containg even number,
|
// create two postings list, one containg even number,
|
||||||
// the other containing odd numbers.
|
// the other containing odd numbers.
|
||||||
for i in 0..6 {
|
for i in 0..6 {
|
||||||
|
|||||||
@@ -1,28 +1,37 @@
|
|||||||
use super::{Addr, MemoryArena};
|
use super::{Addr, MemoryArena};
|
||||||
|
|
||||||
use common::is_power_of_2;
|
use postings::stacker::memory_arena::load;
|
||||||
|
use postings::stacker::memory_arena::store;
|
||||||
|
use std::io;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
|
|
||||||
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
|
||||||
|
const FIRST_BLOCK: usize = 16;
|
||||||
|
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
|
||||||
|
|
||||||
const FIRST_BLOCK: u32 = 4u32;
|
enum CapacityResult {
|
||||||
|
Available(u32),
|
||||||
|
NeedAlloc(u32),
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
fn len_to_capacity(len: u32) -> CapacityResult {
|
||||||
pub fn jump_needed(len: u32) -> Option<usize> {
|
|
||||||
match len {
|
match len {
|
||||||
0...3 => None,
|
0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
|
||||||
4...MAX_BLOCK_LEN => {
|
16...MAX_BLOCK_LEN => {
|
||||||
if is_power_of_2(len as usize) {
|
let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
|
||||||
Some(len as usize)
|
let available = cap - len;
|
||||||
|
if available == 0 {
|
||||||
|
CapacityResult::NeedAlloc(len)
|
||||||
} else {
|
} else {
|
||||||
None
|
CapacityResult::Available(available)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n => {
|
n => {
|
||||||
if n % MAX_BLOCK_LEN == 0 {
|
let available = n % MAX_BLOCK_LEN;
|
||||||
Some(MAX_BLOCK_LEN as usize)
|
if available == 0 {
|
||||||
|
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
|
||||||
} else {
|
} else {
|
||||||
None
|
CapacityResult::Available(MAX_BLOCK_LEN - available)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -52,82 +61,119 @@ pub fn jump_needed(len: u32) -> Option<usize> {
|
|||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct ExpUnrolledLinkedList {
|
pub struct ExpUnrolledLinkedList {
|
||||||
len: u32,
|
len: u32,
|
||||||
head: Addr,
|
|
||||||
tail: Addr,
|
tail: Addr,
|
||||||
|
inlined_data: [u8; INLINED_BLOCK_LEN as usize],
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct ExpUnrolledLinkedListWriter<'a> {
|
||||||
|
eull: &'a mut ExpUnrolledLinkedList,
|
||||||
|
heap: &'a mut MemoryArena,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ensure_capacity<'a>(
|
||||||
|
eull: &'a mut ExpUnrolledLinkedList,
|
||||||
|
heap: &'a mut MemoryArena,
|
||||||
|
) -> &'a mut [u8] {
|
||||||
|
if eull.len <= FIRST_BLOCK as u32 {
|
||||||
|
// We are still hitting the inline block.
|
||||||
|
if eull.len < FIRST_BLOCK as u32 {
|
||||||
|
return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
|
||||||
|
}
|
||||||
|
// We need to allocate a new block!
|
||||||
|
let new_block_addr: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
|
||||||
|
store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
|
||||||
|
eull.tail = new_block_addr;
|
||||||
|
return heap.slice_mut(eull.tail, FIRST_BLOCK);
|
||||||
|
}
|
||||||
|
let len = match len_to_capacity(eull.len) {
|
||||||
|
CapacityResult::NeedAlloc(new_block_len) => {
|
||||||
|
let new_block_addr: Addr =
|
||||||
|
heap.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
|
||||||
|
heap.write_at(eull.tail, new_block_addr);
|
||||||
|
eull.tail = new_block_addr;
|
||||||
|
new_block_len
|
||||||
|
}
|
||||||
|
CapacityResult::Available(available) => available,
|
||||||
|
};
|
||||||
|
heap.slice_mut(eull.tail, len as usize)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> ExpUnrolledLinkedListWriter<'a> {
|
||||||
|
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
|
||||||
|
if buf.is_empty() {
|
||||||
|
// we need to cut early, because `ensure_capacity`
|
||||||
|
// allocates if there is no capacity at all right now.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
while !buf.is_empty() {
|
||||||
|
let add_len: usize;
|
||||||
|
{
|
||||||
|
let output_buf = ensure_capacity(self.eull, self.heap);
|
||||||
|
add_len = buf.len().min(output_buf.len());
|
||||||
|
output_buf[..add_len].copy_from_slice(&buf[..add_len]);
|
||||||
|
}
|
||||||
|
self.eull.len += add_len as u32;
|
||||||
|
self.eull.tail = self.eull.tail.offset(add_len as u32);
|
||||||
|
buf = &buf[add_len..];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
|
||||||
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||||
|
// There is no use case to only write the capacity.
|
||||||
|
// This is not IO after all, so we write the whole
|
||||||
|
// buffer even if the contract of `.write` is looser.
|
||||||
|
self.extend_from_slice(buf);
|
||||||
|
Ok(buf.len())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||||
|
self.extend_from_slice(buf);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flush(&mut self) -> io::Result<()> {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExpUnrolledLinkedList {
|
impl ExpUnrolledLinkedList {
|
||||||
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
|
pub fn new() -> ExpUnrolledLinkedList {
|
||||||
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
|
|
||||||
ExpUnrolledLinkedList {
|
ExpUnrolledLinkedList {
|
||||||
len: 0u32,
|
len: 0u32,
|
||||||
head: addr,
|
tail: Addr::null_pointer(),
|
||||||
tail: addr,
|
inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
|
#[inline(always)]
|
||||||
ExpUnrolledLinkedListIterator {
|
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
|
||||||
heap,
|
ExpUnrolledLinkedListWriter { eull: self, heap }
|
||||||
addr: self.head,
|
|
||||||
len: self.len,
|
|
||||||
consumed: 0,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Appends a new element to the current stack.
|
pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
|
||||||
///
|
let len = self.len as usize;
|
||||||
/// If the current block end is reached, a new block is allocated.
|
if len <= FIRST_BLOCK {
|
||||||
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
|
output.extend_from_slice(&self.inlined_data[..len]);
|
||||||
self.len += 1;
|
return;
|
||||||
if let Some(new_block_len) = jump_needed(self.len) {
|
|
||||||
// We need to allocate another block.
|
|
||||||
// We also allocate an extra `u32` to store the pointer
|
|
||||||
// to the future next block.
|
|
||||||
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
|
|
||||||
let new_block_addr: Addr = heap.allocate_space(new_block_size);
|
|
||||||
unsafe {
|
|
||||||
// logic
|
|
||||||
heap.write(self.tail, new_block_addr)
|
|
||||||
};
|
|
||||||
self.tail = new_block_addr;
|
|
||||||
}
|
}
|
||||||
unsafe {
|
output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
|
||||||
// logic
|
let mut cur = FIRST_BLOCK;
|
||||||
heap.write(self.tail, val);
|
let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
|
||||||
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
|
loop {
|
||||||
}
|
let cap = match len_to_capacity(cur as u32) {
|
||||||
}
|
CapacityResult::Available(capacity) => capacity,
|
||||||
}
|
CapacityResult::NeedAlloc(capacity) => capacity,
|
||||||
|
} as usize;
|
||||||
pub struct ExpUnrolledLinkedListIterator<'a> {
|
let data = heap.slice(addr, cap);
|
||||||
heap: &'a MemoryArena,
|
if cur + cap >= len {
|
||||||
addr: Addr,
|
output.extend_from_slice(&data[..(len - cur)]);
|
||||||
len: u32,
|
return;
|
||||||
consumed: u32,
|
}
|
||||||
}
|
output.extend_from_slice(data);
|
||||||
|
cur += cap;
|
||||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
addr = heap.read(addr.offset(cap as u32));
|
||||||
type Item = u32;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<u32> {
|
|
||||||
if self.consumed == self.len {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
self.consumed += 1;
|
|
||||||
let addr: Addr = if jump_needed(self.consumed).is_some() {
|
|
||||||
unsafe {
|
|
||||||
// logic
|
|
||||||
self.heap.read(self.addr)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
self.addr
|
|
||||||
};
|
|
||||||
self.addr = addr.offset(mem::size_of::<u32>() as u32);
|
|
||||||
Some(unsafe {
|
|
||||||
// logic
|
|
||||||
self.heap.read(addr)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -136,39 +182,126 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::super::MemoryArena;
|
use super::super::MemoryArena;
|
||||||
use super::jump_needed;
|
use super::len_to_capacity;
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||||
|
|
||||||
|
#[test]
|
||||||
#[test]
|
#[test]
|
||||||
fn test_stack() {
|
fn test_stack() {
|
||||||
let mut heap = MemoryArena::new();
|
let mut heap = MemoryArena::new();
|
||||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
let mut stack = ExpUnrolledLinkedList::new();
|
||||||
stack.push(1u32, &mut heap);
|
stack.writer(&mut heap).extend_from_slice(&[1u8]);
|
||||||
stack.push(2u32, &mut heap);
|
stack.writer(&mut heap).extend_from_slice(&[2u8]);
|
||||||
stack.push(4u32, &mut heap);
|
stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
|
||||||
stack.push(8u32, &mut heap);
|
stack.writer(&mut heap).extend_from_slice(&[5u8]);
|
||||||
{
|
{
|
||||||
let mut it = stack.iter(&heap);
|
let mut buffer = Vec::new();
|
||||||
assert_eq!(it.next().unwrap(), 1u32);
|
stack.read_to_end(&heap, &mut buffer);
|
||||||
assert_eq!(it.next().unwrap(), 2u32);
|
assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
|
||||||
assert_eq!(it.next().unwrap(), 4u32);
|
|
||||||
assert_eq!(it.next().unwrap(), 8u32);
|
|
||||||
assert!(it.next().is_none());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_jump_if_needed() {
|
fn test_stack_long() {
|
||||||
let mut block_len = 4u32;
|
let mut heap = MemoryArena::new();
|
||||||
let mut i = 0;
|
let mut stack = ExpUnrolledLinkedList::new();
|
||||||
while i < 10_000_000 {
|
let source: Vec<u32> = (0..100).collect();
|
||||||
assert!(jump_needed(i + block_len - 1).is_none());
|
for &el in &source {
|
||||||
assert!(jump_needed(i + block_len + 1).is_none());
|
assert!(stack
|
||||||
assert!(jump_needed(i + block_len).is_some());
|
.writer(&mut heap)
|
||||||
let new_block_len = jump_needed(i + block_len).unwrap();
|
.write_u32::<LittleEndian>(el)
|
||||||
i += block_len;
|
.is_ok());
|
||||||
block_len = new_block_len as u32;
|
|
||||||
}
|
}
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
stack.read_to_end(&heap, &mut buffer);
|
||||||
|
let mut result = vec![];
|
||||||
|
let mut remaining = &buffer[..];
|
||||||
|
while !remaining.is_empty() {
|
||||||
|
result.push(LittleEndian::read_u32(&remaining[..4]));
|
||||||
|
remaining = &remaining[4..];
|
||||||
|
}
|
||||||
|
assert_eq!(&result[..], &source[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_stack_interlaced() {
|
||||||
|
let mut heap = MemoryArena::new();
|
||||||
|
let mut stack = ExpUnrolledLinkedList::new();
|
||||||
|
let mut stack2 = ExpUnrolledLinkedList::new();
|
||||||
|
|
||||||
|
let mut vec1: Vec<u8> = vec![];
|
||||||
|
let mut vec2: Vec<u8> = vec![];
|
||||||
|
|
||||||
|
for i in 0..9 {
|
||||||
|
assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
|
||||||
|
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
|
||||||
|
if i % 2 == 0 {
|
||||||
|
assert!(stack2
|
||||||
|
.writer(&mut heap)
|
||||||
|
.write_u32::<LittleEndian>(i)
|
||||||
|
.is_ok());
|
||||||
|
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut res1 = vec![];
|
||||||
|
let mut res2 = vec![];
|
||||||
|
stack.read_to_end(&heap, &mut res1);
|
||||||
|
stack2.read_to_end(&heap, &mut res2);
|
||||||
|
assert_eq!(&vec1[..], &res1[..]);
|
||||||
|
assert_eq!(&vec2[..], &res2[..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_jump_if_needed() {
|
||||||
|
let mut available = 16u32;
|
||||||
|
for i in 0..10_000_000 {
|
||||||
|
match len_to_capacity(i) {
|
||||||
|
CapacityResult::NeedAlloc(cap) => {
|
||||||
|
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
|
||||||
|
available = cap;
|
||||||
|
}
|
||||||
|
CapacityResult::Available(cap) => {
|
||||||
|
assert_eq!(
|
||||||
|
available, cap,
|
||||||
|
"Failed len={}: Expected {} Got {}",
|
||||||
|
i, available, cap
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
available -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_jump_if_needed_progression() {
|
||||||
|
let mut v = vec![];
|
||||||
|
for i in 0.. {
|
||||||
|
if v.len() >= 10 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
match len_to_capacity(i) {
|
||||||
|
CapacityResult::NeedAlloc(cap) => {
|
||||||
|
v.push((i, cap));
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert_eq!(
|
||||||
|
&v[..],
|
||||||
|
&[
|
||||||
|
(16, 16),
|
||||||
|
(32, 32),
|
||||||
|
(64, 64),
|
||||||
|
(128, 128),
|
||||||
|
(256, 256),
|
||||||
|
(512, 512),
|
||||||
|
(1024, 1024),
|
||||||
|
(2048, 2048),
|
||||||
|
(4096, 4096),
|
||||||
|
(8192, 8192)
|
||||||
|
]
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -176,6 +309,7 @@ mod tests {
|
|||||||
mod bench {
|
mod bench {
|
||||||
use super::super::MemoryArena;
|
use super::super::MemoryArena;
|
||||||
use super::ExpUnrolledLinkedList;
|
use super::ExpUnrolledLinkedList;
|
||||||
|
use byteorder::{NativeEndian, WriteBytesExt};
|
||||||
use test::Bencher;
|
use test::Bencher;
|
||||||
|
|
||||||
const NUM_STACK: usize = 10_000;
|
const NUM_STACK: usize = 10_000;
|
||||||
@@ -203,13 +337,13 @@ mod bench {
|
|||||||
let mut heap = MemoryArena::new();
|
let mut heap = MemoryArena::new();
|
||||||
let mut stacks = Vec::with_capacity(100);
|
let mut stacks = Vec::with_capacity(100);
|
||||||
for _ in 0..NUM_STACK {
|
for _ in 0..NUM_STACK {
|
||||||
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
|
let mut stack = ExpUnrolledLinkedList::new();
|
||||||
stacks.push(stack);
|
stacks.push(stack);
|
||||||
}
|
}
|
||||||
for s in 0..NUM_STACK {
|
for s in 0..NUM_STACK {
|
||||||
for i in 0u32..STACK_SIZE {
|
for i in 0u32..STACK_SIZE {
|
||||||
let t = s * 392017 % NUM_STACK;
|
let t = s * 392017 % NUM_STACK;
|
||||||
stacks[t].push(i, &mut heap);
|
let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
|
|||||||
/// page of memory.
|
/// page of memory.
|
||||||
///
|
///
|
||||||
/// The last 20 bits are an address within this page of memory.
|
/// The last 20 bits are an address within this page of memory.
|
||||||
#[derive(Clone, Copy, Debug)]
|
#[derive(Copy, Clone, Debug)]
|
||||||
pub struct Addr(u32);
|
pub struct Addr(u32);
|
||||||
|
|
||||||
impl Addr {
|
impl Addr {
|
||||||
@@ -69,32 +69,16 @@ impl Addr {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Trait required for an object to be `storable`.
|
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
|
||||||
///
|
assert_eq!(dest.len(), std::mem::size_of::<Item>());
|
||||||
/// # Warning
|
unsafe {
|
||||||
///
|
ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
|
||||||
/// Most of the time you should not implement this trait,
|
}
|
||||||
/// and only use the `MemoryArena` with object implementing `Copy`.
|
|
||||||
///
|
|
||||||
/// `ArenaStorable` is used in `tantivy` to force
|
|
||||||
/// a `Copy` object and a `slice` of data to be stored contiguously.
|
|
||||||
pub trait ArenaStorable {
|
|
||||||
fn num_bytes(&self) -> usize;
|
|
||||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<V> ArenaStorable for V
|
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
|
||||||
where
|
assert_eq!(data.len(), std::mem::size_of::<Item>());
|
||||||
V: Copy,
|
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
|
||||||
{
|
|
||||||
fn num_bytes(&self) -> usize {
|
|
||||||
mem::size_of::<V>()
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
|
||||||
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
|
|
||||||
ptr::write_unaligned(dst_ptr, self);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The `MemoryArena`
|
/// The `MemoryArena`
|
||||||
@@ -126,47 +110,9 @@ impl MemoryArena {
|
|||||||
self.pages.len() * PAGE_SIZE
|
self.pages.len() * PAGE_SIZE
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Writes a slice at the given address, assuming the
|
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
|
||||||
/// memory was allocated beforehands.
|
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
|
||||||
///
|
store(dest, val);
|
||||||
/// # Panics
|
|
||||||
///
|
|
||||||
/// May panic or corrupt the heap if he space was not
|
|
||||||
/// properly allocated beforehands.
|
|
||||||
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
|
|
||||||
let bytes = data.as_ref();
|
|
||||||
self.pages[addr.page_id()]
|
|
||||||
.get_mut_slice(addr.page_local_addr(), bytes.len())
|
|
||||||
.copy_from_slice(bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the `len` bytes starting at `addr`
|
|
||||||
///
|
|
||||||
/// # Panics
|
|
||||||
///
|
|
||||||
/// Panics if the memory has not been allocated beforehands.
|
|
||||||
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
|
|
||||||
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
|
|
||||||
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Stores an item's data in the heap
|
|
||||||
///
|
|
||||||
/// It allocates the `Item` beforehands.
|
|
||||||
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
|
|
||||||
let num_bytes = val.num_bytes();
|
|
||||||
let addr = self.allocate_space(num_bytes);
|
|
||||||
unsafe {
|
|
||||||
self.write(addr, val);
|
|
||||||
};
|
|
||||||
addr
|
|
||||||
}
|
|
||||||
|
|
||||||
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
|
|
||||||
val.write_into(self, addr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read an item in the heap at the given `address`.
|
/// Read an item in the heap at the given `address`.
|
||||||
@@ -174,9 +120,21 @@ impl MemoryArena {
|
|||||||
/// # Panics
|
/// # Panics
|
||||||
///
|
///
|
||||||
/// If the address is erroneous
|
/// If the address is erroneous
|
||||||
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
|
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
|
||||||
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
|
load(self.slice(addr, mem::size_of::<Item>()))
|
||||||
ptr::read_unaligned(ptr as *const Item)
|
}
|
||||||
|
|
||||||
|
pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
|
||||||
|
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn slice_from(&self, addr: Addr) -> &[u8] {
|
||||||
|
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
|
||||||
|
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Allocates `len` bytes and returns the allocated address.
|
/// Allocates `len` bytes and returns the allocated address.
|
||||||
@@ -197,14 +155,10 @@ struct Page {
|
|||||||
|
|
||||||
impl Page {
|
impl Page {
|
||||||
fn new(page_id: usize) -> Page {
|
fn new(page_id: usize) -> Page {
|
||||||
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
|
|
||||||
unsafe {
|
|
||||||
data.set_len(PAGE_SIZE);
|
|
||||||
} // avoid initializing page
|
|
||||||
Page {
|
Page {
|
||||||
page_id,
|
page_id,
|
||||||
len: 0,
|
len: 0,
|
||||||
data: data.into_boxed_slice(),
|
data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -213,12 +167,16 @@ impl Page {
|
|||||||
len + self.len <= PAGE_SIZE
|
len + self.len <= PAGE_SIZE
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
||||||
&mut self.data[local_addr..][..len]
|
&self.slice_from(local_addr)[..len]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
|
fn slice_from(&self, local_addr: usize) -> &[u8] {
|
||||||
&self.data[local_addr..][..len]
|
&self.data[local_addr..]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
|
||||||
|
&mut self.data[local_addr..][..len]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
|
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
|
||||||
@@ -230,16 +188,6 @@ impl Page {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
|
|
||||||
self.data.as_ptr().add(addr)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
|
|
||||||
self.data.as_mut_ptr().add(addr)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -254,13 +202,13 @@ mod tests {
|
|||||||
let b = b"happy tax payer";
|
let b = b"happy tax payer";
|
||||||
|
|
||||||
let addr_a = arena.allocate_space(a.len());
|
let addr_a = arena.allocate_space(a.len());
|
||||||
arena.write_bytes(addr_a, a);
|
arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
|
||||||
|
|
||||||
let addr_b = arena.allocate_space(b.len());
|
let addr_b = arena.allocate_space(b.len());
|
||||||
arena.write_bytes(addr_b, b);
|
arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
|
||||||
|
|
||||||
assert_eq!(arena.read_slice(addr_a, a.len()), a);
|
assert_eq!(arena.slice(addr_a, a.len()), a);
|
||||||
assert_eq!(arena.read_slice(addr_b, b.len()), b);
|
assert_eq!(arena.slice(addr_b, b.len()), b);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||||
@@ -283,9 +231,15 @@ mod tests {
|
|||||||
b: 221,
|
b: 221,
|
||||||
c: 12,
|
c: 12,
|
||||||
};
|
};
|
||||||
let addr_a = arena.store(a);
|
|
||||||
let addr_b = arena.store(b);
|
let num_bytes = std::mem::size_of::<MyTest>();
|
||||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
|
let addr_a = arena.allocate_space(num_bytes);
|
||||||
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
|
arena.write_at(addr_a, a);
|
||||||
|
|
||||||
|
let addr_b = arena.allocate_space(num_bytes);
|
||||||
|
arena.write_at(addr_b, b);
|
||||||
|
|
||||||
|
assert_eq!(arena.read::<MyTest>(addr_a), a);
|
||||||
|
assert_eq!(arena.read::<MyTest>(addr_b), b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,5 +3,5 @@ mod memory_arena;
|
|||||||
mod term_hashmap;
|
mod term_hashmap;
|
||||||
|
|
||||||
pub use self::expull::ExpUnrolledLinkedList;
|
pub use self::expull::ExpUnrolledLinkedList;
|
||||||
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
|
pub use self::memory_arena::{Addr, MemoryArena};
|
||||||
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
pub use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||||
|
|||||||
@@ -2,39 +2,15 @@ extern crate murmurhash32;
|
|||||||
|
|
||||||
use self::murmurhash32::murmurhash2;
|
use self::murmurhash32::murmurhash2;
|
||||||
|
|
||||||
use super::{Addr, ArenaStorable, MemoryArena};
|
use super::{Addr, MemoryArena};
|
||||||
|
use byteorder::{ByteOrder, NativeEndian};
|
||||||
|
use postings::stacker::memory_arena::store;
|
||||||
use std::iter;
|
use std::iter;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::slice;
|
use std::slice;
|
||||||
|
|
||||||
pub type BucketId = usize;
|
pub type BucketId = usize;
|
||||||
|
|
||||||
struct KeyBytesValue<'a, V> {
|
|
||||||
key: &'a [u8],
|
|
||||||
value: V,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, V> KeyBytesValue<'a, V> {
|
|
||||||
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
|
|
||||||
KeyBytesValue { key, value }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
|
|
||||||
where
|
|
||||||
V: ArenaStorable,
|
|
||||||
{
|
|
||||||
fn num_bytes(&self) -> usize {
|
|
||||||
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
|
|
||||||
arena.write(addr, self.key.len() as u16);
|
|
||||||
arena.write_bytes(addr.offset(2), self.key);
|
|
||||||
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the actual memory size in bytes
|
/// Returns the actual memory size in bytes
|
||||||
/// required to create a table of size $2^num_bits$.
|
/// required to create a table of size $2^num_bits$.
|
||||||
pub fn compute_table_size(num_bits: usize) -> usize {
|
pub fn compute_table_size(num_bits: usize) -> usize {
|
||||||
@@ -114,8 +90,7 @@ impl<'a> Iterator for Iter<'a> {
|
|||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
self.inner.next().cloned().map(move |bucket: usize| {
|
self.inner.next().cloned().map(move |bucket: usize| {
|
||||||
let kv = self.hashmap.table[bucket];
|
let kv = self.hashmap.table[bucket];
|
||||||
let (key, offset): (&'a [u8], Addr) =
|
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||||
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
|
|
||||||
(key, offset, bucket as BucketId)
|
(key, offset, bucket as BucketId)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -146,12 +121,22 @@ impl TermHashMap {
|
|||||||
self.table.len() < self.occupied.len() * 3
|
self.table.len() < self.occupied.len() * 3
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
#[inline(always)]
|
||||||
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
|
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||||
let key_addr = addr.offset(2u32);
|
let data = self.heap.slice_from(addr);
|
||||||
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
|
let key_bytes_len = NativeEndian::read_u16(data) as usize;
|
||||||
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
|
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
|
||||||
(key_bytes, val_addr)
|
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
|
||||||
|
let (stored_key, value_addr) = self.get_key_value(addr);
|
||||||
|
if stored_key == target_key {
|
||||||
|
Some(value_addr)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||||
@@ -202,7 +187,7 @@ impl TermHashMap {
|
|||||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||||
where
|
where
|
||||||
S: AsRef<[u8]>,
|
S: AsRef<[u8]>,
|
||||||
V: Copy,
|
V: Copy + 'static,
|
||||||
TMutator: FnMut(Option<V>) -> V,
|
TMutator: FnMut(Option<V>) -> V,
|
||||||
{
|
{
|
||||||
if self.is_saturated() {
|
if self.is_saturated() {
|
||||||
@@ -216,22 +201,25 @@ impl TermHashMap {
|
|||||||
let kv: KeyValue = self.table[bucket];
|
let kv: KeyValue = self.table[bucket];
|
||||||
if kv.is_empty() {
|
if kv.is_empty() {
|
||||||
let val = updater(None);
|
let val = updater(None);
|
||||||
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
|
let num_bytes =
|
||||||
|
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
|
||||||
|
let key_addr = self.heap.allocate_space(num_bytes);
|
||||||
|
{
|
||||||
|
let data = self.heap.slice_mut(key_addr, num_bytes);
|
||||||
|
NativeEndian::write_u16(data, key_bytes.len() as u16);
|
||||||
|
let stop = 2 + key_bytes.len();
|
||||||
|
data[2..stop].copy_from_slice(key_bytes);
|
||||||
|
store(&mut data[stop..], val);
|
||||||
|
}
|
||||||
self.set_bucket(hash, key_addr, bucket);
|
self.set_bucket(hash, key_addr, bucket);
|
||||||
return bucket as BucketId;
|
return bucket as BucketId;
|
||||||
} else if kv.hash == hash {
|
} else if kv.hash == hash {
|
||||||
let (key_matches, val_addr) = {
|
if let Some(val_addr) =
|
||||||
let (stored_key, val_addr): (&[u8], Addr) =
|
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
|
||||||
unsafe { self.get_key_value(kv.key_value_addr) };
|
{
|
||||||
(stored_key == key_bytes, val_addr)
|
let v = self.heap.read(val_addr);
|
||||||
};
|
let new_v = updater(Some(v));
|
||||||
if key_matches {
|
self.heap.write_at(val_addr, new_v);
|
||||||
unsafe {
|
|
||||||
// logic
|
|
||||||
let v = self.heap.read(val_addr);
|
|
||||||
let new_v = updater(Some(v));
|
|
||||||
self.heap.write(val_addr, new_v);
|
|
||||||
};
|
|
||||||
return bucket as BucketId;
|
return bucket as BucketId;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -239,24 +227,6 @@ impl TermHashMap {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
mod bench {
|
|
||||||
use super::murmurhash2::murmurhash2;
|
|
||||||
use test::Bencher;
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_murmurhash2(b: &mut Bencher) {
|
|
||||||
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
|
||||||
b.iter(|| {
|
|
||||||
let mut s = 0;
|
|
||||||
for &key in &keys {
|
|
||||||
s ^= murmurhash2(key.as_bytes());
|
|
||||||
}
|
|
||||||
s
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
@@ -288,10 +258,7 @@ mod tests {
|
|||||||
let mut vanilla_hash_map = HashMap::new();
|
let mut vanilla_hash_map = HashMap::new();
|
||||||
let mut iter_values = hash_map.iter();
|
let mut iter_values = hash_map.iter();
|
||||||
while let Some((key, addr, _)) = iter_values.next() {
|
while let Some((key, addr, _)) = iter_values.next() {
|
||||||
let val: u32 = unsafe {
|
let val: u32 = hash_map.heap.read(addr);
|
||||||
// test
|
|
||||||
hash_map.heap.read(addr)
|
|
||||||
};
|
|
||||||
vanilla_hash_map.insert(key.to_owned(), val);
|
vanilla_hash_map.insert(key.to_owned(), val);
|
||||||
}
|
}
|
||||||
assert_eq!(vanilla_hash_map.len(), 2);
|
assert_eq!(vanilla_hash_map.len(), 2);
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use core::SegmentReader;
|
use core::SegmentReader;
|
||||||
use downcast::Downcast;
|
|
||||||
use query::intersect_scorers;
|
use query::intersect_scorers;
|
||||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||||
use query::term_query::TermScorer;
|
use query::term_query::TermScorer;
|
||||||
@@ -10,7 +9,6 @@ use query::RequiredOptionalScorer;
|
|||||||
use query::Scorer;
|
use query::Scorer;
|
||||||
use query::Union;
|
use query::Union;
|
||||||
use query::Weight;
|
use query::Weight;
|
||||||
use std::borrow::Borrow;
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
@@ -24,14 +22,11 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
|
||||||
let scorer_ref: &Scorer = scorer.borrow();
|
|
||||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
|
||||||
});
|
|
||||||
if is_all_term_queries {
|
if is_all_term_queries {
|
||||||
let scorers: Vec<TermScorer> = scorers
|
let scorers: Vec<TermScorer> = scorers
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
|
||||||
.collect();
|
.collect();
|
||||||
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
|
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
|
||||||
return scorer;
|
return scorer;
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ mod tests {
|
|||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use collector::tests::TestCollector;
|
use collector::tests::TestCollector;
|
||||||
use downcast::Downcast;
|
|
||||||
use query::score_combiner::SumWithCoordsCombiner;
|
use query::score_combiner::SumWithCoordsCombiner;
|
||||||
use query::term_query::TermScorer;
|
use query::term_query::TermScorer;
|
||||||
use query::Intersection;
|
use query::Intersection;
|
||||||
@@ -29,7 +28,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field => "a b c");
|
let doc = doc!(text_field => "a b c");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -72,7 +71,7 @@ mod tests {
|
|||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let weight = query.weight(&searcher, true).unwrap();
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
assert!(scorer.is::<TermScorer>());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -84,13 +83,13 @@ mod tests {
|
|||||||
let query = query_parser.parse_query("+a +b +c").unwrap();
|
let query = query_parser.parse_query("+a +b +c").unwrap();
|
||||||
let weight = query.weight(&searcher, true).unwrap();
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
|
assert!(scorer.is::<Intersection<TermScorer>>());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("+a +(b c)").unwrap();
|
let query = query_parser.parse_query("+a +(b c)").unwrap();
|
||||||
let weight = query.weight(&searcher, true).unwrap();
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
|
assert!(scorer.is::<Intersection<Box<Scorer>>>());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,16 +102,14 @@ mod tests {
|
|||||||
let query = query_parser.parse_query("+a b").unwrap();
|
let query = query_parser.parse_query("+a b").unwrap();
|
||||||
let weight = query.weight(&searcher, true).unwrap();
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
assert!(Downcast::<
|
assert!(scorer
|
||||||
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
|
.is::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>());
|
||||||
>::is_type(&*scorer));
|
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("+a b").unwrap();
|
let query = query_parser.parse_query("+a b").unwrap();
|
||||||
let weight = query.weight(&searcher, false).unwrap();
|
let weight = query.weight(&searcher, false).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
println!("{:?}", scorer.type_name());
|
assert!(scorer.is::<TermScorer>());
|
||||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
use docset::{DocSet, SkipResult};
|
use docset::{DocSet, SkipResult};
|
||||||
use downcast::Downcast;
|
|
||||||
use query::term_query::TermScorer;
|
use query::term_query::TermScorer;
|
||||||
use query::EmptyScorer;
|
use query::EmptyScorer;
|
||||||
use query::Scorer;
|
use query::Scorer;
|
||||||
use std::borrow::Borrow;
|
|
||||||
use DocId;
|
use DocId;
|
||||||
use Score;
|
use Score;
|
||||||
|
|
||||||
@@ -26,13 +24,12 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
|
|||||||
(Some(single_docset), None) => single_docset,
|
(Some(single_docset), None) => single_docset,
|
||||||
(Some(left), Some(right)) => {
|
(Some(left), Some(right)) => {
|
||||||
{
|
{
|
||||||
let all_term_scorers = [&left, &right].into_iter().all(|scorer| {
|
let all_term_scorers = [&left, &right]
|
||||||
let scorer_ref: &Scorer = (*scorer).borrow();
|
.iter()
|
||||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
.all(|&scorer| scorer.is::<TermScorer>());
|
||||||
});
|
|
||||||
if all_term_scorers {
|
if all_term_scorers {
|
||||||
let left = *Downcast::<TermScorer>::downcast(left).unwrap();
|
let left = *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap());
|
||||||
let right = *Downcast::<TermScorer>::downcast(right).unwrap();
|
let right = *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap());
|
||||||
return Box::new(Intersection {
|
return Box::new(Intersection {
|
||||||
left,
|
left,
|
||||||
right,
|
right,
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
for &text in texts {
|
for &text in texts {
|
||||||
let doc = doc!(text_field=>text);
|
let doc = doc!(text_field=>text);
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -86,7 +86,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
index_writer.add_document(doc!(text_field=>"a b c"));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
@@ -141,7 +141,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"b"));
|
index_writer.add_document(doc!(text_field=>"b"));
|
||||||
index_writer.add_document(doc!(text_field=>"a b"));
|
index_writer.add_document(doc!(text_field=>"a b"));
|
||||||
index_writer.add_document(doc!(text_field=>"b a"));
|
index_writer.add_document(doc!(text_field=>"b a"));
|
||||||
@@ -173,7 +173,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
|
index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
|
|||||||
|
|
||||||
pub struct PhraseScorer<TPostings: Postings> {
|
pub struct PhraseScorer<TPostings: Postings> {
|
||||||
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
|
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
|
||||||
num_docsets: usize,
|
num_terms: usize,
|
||||||
left: Vec<u32>,
|
left: Vec<u32>,
|
||||||
right: Vec<u32>,
|
right: Vec<u32>,
|
||||||
phrase_count: u32,
|
phrase_count: u32,
|
||||||
@@ -138,7 +138,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
PhraseScorer {
|
PhraseScorer {
|
||||||
intersection_docset: Intersection::new(postings_with_offsets),
|
intersection_docset: Intersection::new(postings_with_offsets),
|
||||||
num_docsets,
|
num_terms: num_docsets,
|
||||||
left: Vec::with_capacity(100),
|
left: Vec::with_capacity(100),
|
||||||
right: Vec::with_capacity(100),
|
right: Vec::with_capacity(100),
|
||||||
phrase_count: 0u32,
|
phrase_count: 0u32,
|
||||||
@@ -165,7 +165,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
|||||||
.positions(&mut self.left);
|
.positions(&mut self.left);
|
||||||
}
|
}
|
||||||
let mut intersection_len = self.left.len();
|
let mut intersection_len = self.left.len();
|
||||||
for i in 1..self.num_docsets - 1 {
|
for i in 1..self.num_terms - 1 {
|
||||||
{
|
{
|
||||||
self.intersection_docset
|
self.intersection_docset
|
||||||
.docset_mut_specialized(i)
|
.docset_mut_specialized(i)
|
||||||
@@ -178,7 +178,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
self.intersection_docset
|
self.intersection_docset
|
||||||
.docset_mut_specialized(self.num_docsets - 1)
|
.docset_mut_specialized(self.num_terms - 1)
|
||||||
.positions(&mut self.right);
|
.positions(&mut self.right);
|
||||||
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
||||||
}
|
}
|
||||||
@@ -190,7 +190,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
|||||||
.positions(&mut self.left);
|
.positions(&mut self.left);
|
||||||
}
|
}
|
||||||
let mut intersection_len = self.left.len();
|
let mut intersection_len = self.left.len();
|
||||||
for i in 1..self.num_docsets - 1 {
|
for i in 1..self.num_terms - 1 {
|
||||||
{
|
{
|
||||||
self.intersection_docset
|
self.intersection_docset
|
||||||
.docset_mut_specialized(i)
|
.docset_mut_specialized(i)
|
||||||
@@ -203,7 +203,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
self.intersection_docset
|
self.intersection_docset
|
||||||
.docset_mut_specialized(self.num_docsets - 1)
|
.docset_mut_specialized(self.num_terms - 1)
|
||||||
.positions(&mut self.right);
|
.positions(&mut self.right);
|
||||||
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use super::Weight;
|
use super::Weight;
|
||||||
use core::searcher::Searcher;
|
use core::searcher::Searcher;
|
||||||
use downcast;
|
use downcast_rs;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use Result;
|
use Result;
|
||||||
@@ -39,7 +39,7 @@ use Term;
|
|||||||
///
|
///
|
||||||
/// When implementing a new type of `Query`, it is normal to implement a
|
/// When implementing a new type of `Query`, it is normal to implement a
|
||||||
/// dedicated `Query`, `Weight` and `Scorer`.
|
/// dedicated `Query`, `Weight` and `Scorer`.
|
||||||
pub trait Query: QueryClone + downcast::Any + fmt::Debug {
|
pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
|
||||||
/// Create the weight associated to a query.
|
/// Create the weight associated to a query.
|
||||||
///
|
///
|
||||||
/// If scoring is not required, setting `scoring_enabled` to `false`
|
/// If scoring is not required, setting `scoring_enabled` to `false`
|
||||||
@@ -96,7 +96,4 @@ impl QueryClone for Box<Query> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(missing_docs)]
|
impl_downcast!(Query);
|
||||||
mod downcast_impl {
|
|
||||||
downcast!(super::Query);
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use common::BitSet;
|
use common::BitSet;
|
||||||
use docset::{DocSet, SkipResult};
|
use docset::{DocSet, SkipResult};
|
||||||
use downcast;
|
use downcast_rs;
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Score;
|
use Score;
|
||||||
@@ -8,7 +8,7 @@ use Score;
|
|||||||
/// Scored set of documents matching a query within a specific segment.
|
/// Scored set of documents matching a query within a specific segment.
|
||||||
///
|
///
|
||||||
/// See [`Query`](./trait.Query.html).
|
/// See [`Query`](./trait.Query.html).
|
||||||
pub trait Scorer: downcast::Any + DocSet + 'static {
|
pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
|
||||||
/// Returns the score.
|
/// Returns the score.
|
||||||
///
|
///
|
||||||
/// This method will perform a bit of computation and is not cached.
|
/// This method will perform a bit of computation and is not cached.
|
||||||
@@ -23,10 +23,7 @@ pub trait Scorer: downcast::Any + DocSet + 'static {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(missing_docs)]
|
impl_downcast!(Scorer);
|
||||||
mod downcast_impl {
|
|
||||||
downcast!(super::Scorer);
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Scorer for Box<Scorer> {
|
impl Scorer for Box<Scorer> {
|
||||||
fn score(&mut self) -> Score {
|
fn score(&mut self) -> Score {
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field => "a");
|
let doc = doc!(text_field => "a");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
|||||||
}
|
}
|
||||||
let path: &str = path_asref.as_ref();
|
let path: &str = path_asref.as_ref();
|
||||||
assert!(!path.is_empty());
|
assert!(!path.is_empty());
|
||||||
assert!(path.starts_with("/"));
|
assert!(path.starts_with('/'));
|
||||||
let mut facet_encoded = String::new();
|
let mut facet_encoded = String::new();
|
||||||
let mut state = State::Idle;
|
let mut state = State::Idle;
|
||||||
let path_bytes = path.as_bytes();
|
let path_bytes = path.as_bytes();
|
||||||
|
|||||||
@@ -523,7 +523,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field => "a"));
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
index_writer.add_document(doc!(text_field => "a"));
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
index_writer.add_document(doc!(text_field => "a b"));
|
index_writer.add_document(doc!(text_field => "a b"));
|
||||||
@@ -580,7 +580,7 @@ Survey in 2016, 2017, and 2018."#;
|
|||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc ! (text_field => TEST_TEXT);
|
let doc = doc ! (text_field => TEST_TEXT);
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ mod tests {
|
|||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use byteorder::ByteOrder;
|
use byteorder::{ByteOrder, LittleEndian};
|
||||||
use common::bitpacker::BitPacker;
|
use common::bitpacker::BitPacker;
|
||||||
use common::compute_num_bits;
|
use common::compute_num_bits;
|
||||||
use common::Endianness;
|
use common::Endianness;
|
||||||
@@ -7,7 +7,6 @@ use directory::ReadOnlySource;
|
|||||||
use postings::TermInfo;
|
use postings::TermInfo;
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
use std::ptr;
|
|
||||||
use termdict::TermOrdinal;
|
use termdict::TermOrdinal;
|
||||||
|
|
||||||
const BLOCK_LEN: usize = 256;
|
const BLOCK_LEN: usize = 256;
|
||||||
@@ -88,13 +87,17 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
|
|||||||
assert!(num_bits <= 56);
|
assert!(num_bits <= 56);
|
||||||
let addr_byte = addr_bits / 8;
|
let addr_byte = addr_bits / 8;
|
||||||
let bit_shift = (addr_bits % 8) as u64;
|
let bit_shift = (addr_bits % 8) as u64;
|
||||||
assert!(data.len() >= addr_byte + 7);
|
let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 {
|
||||||
let val_unshifted_unmasked: u64 = unsafe {
|
LittleEndian::read_u64(&data[addr_byte..][..8])
|
||||||
// ok because the pointer is only accessed using `ptr::read_unaligned`
|
} else {
|
||||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
|
// the buffer is not large enough.
|
||||||
let addr = data.as_ptr().add(addr_byte) as *const u64;
|
// Let's copy the few remaining bytes to a 8 byte buffer
|
||||||
// ok thanks to the 7 byte padding
|
// padded with 0s.
|
||||||
ptr::read_unaligned(addr)
|
let mut buf = [0u8; 8];
|
||||||
|
let data_to_copy = &data[addr_byte..];
|
||||||
|
let nbytes = data_to_copy.len();
|
||||||
|
buf[..nbytes].copy_from_slice(data_to_copy);
|
||||||
|
LittleEndian::read_u64(&buf)
|
||||||
};
|
};
|
||||||
let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift;
|
let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift;
|
||||||
let mask = (1u64 << u64::from(num_bits)) - 1;
|
let mask = (1u64 << u64::from(num_bits)) - 1;
|
||||||
@@ -246,7 +249,6 @@ impl TermInfoStoreWriter {
|
|||||||
self.num_terms.serialize(write)?;
|
self.num_terms.serialize(write)?;
|
||||||
write.write_all(&self.buffer_block_metas)?;
|
write.write_all(&self.buffer_block_metas)?;
|
||||||
write.write_all(&self.buffer_term_infos)?;
|
write.write_all(&self.buffer_term_infos)?;
|
||||||
write.write_all(&[0u8; 7])?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,7 +50,6 @@ where
|
|||||||
self.token_mut().text.make_ascii_lowercase();
|
self.token_mut().text.make_ascii_lowercase();
|
||||||
} else {
|
} else {
|
||||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||||
|
|
||||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||||
}
|
}
|
||||||
true
|
true
|
||||||
|
|||||||
@@ -73,7 +73,7 @@
|
|||||||
//! let en_stem = SimpleTokenizer
|
//! let en_stem = SimpleTokenizer
|
||||||
//! .filter(RemoveLongFilter::limit(40))
|
//! .filter(RemoveLongFilter::limit(40))
|
||||||
//! .filter(LowerCaser)
|
//! .filter(LowerCaser)
|
||||||
//! .filter(Stemmer::new());
|
//! .filter(Stemmer::new(Language::English));
|
||||||
//! # }
|
//! # }
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
@@ -148,7 +148,7 @@ pub use self::ngram_tokenizer::NgramTokenizer;
|
|||||||
pub use self::raw_tokenizer::RawTokenizer;
|
pub use self::raw_tokenizer::RawTokenizer;
|
||||||
pub use self::remove_long::RemoveLongFilter;
|
pub use self::remove_long::RemoveLongFilter;
|
||||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||||
pub use self::stemmer::Stemmer;
|
pub use self::stemmer::{Language, Stemmer};
|
||||||
pub use self::stop_word_filter::StopWordFilter;
|
pub use self::stop_word_filter::StopWordFilter;
|
||||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||||
pub(crate) use self::tokenizer::box_tokenizer;
|
pub(crate) use self::tokenizer::box_tokenizer;
|
||||||
@@ -159,8 +159,10 @@ pub use self::tokenizer_manager::TokenizerManager;
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests {
|
pub mod tests {
|
||||||
use super::Token;
|
use super::{
|
||||||
use super::TokenizerManager;
|
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, Tokenizer,
|
||||||
|
TokenizerManager,
|
||||||
|
};
|
||||||
|
|
||||||
/// This is a function that can be used in tests and doc tests
|
/// This is a function that can be used in tests and doc tests
|
||||||
/// to assert a token's correctness.
|
/// to assert a token's correctness.
|
||||||
@@ -214,6 +216,7 @@ pub mod tests {
|
|||||||
.token_stream("Hello, happy tax payer!")
|
.token_stream("Hello, happy tax payer!")
|
||||||
.process(&mut add_token);
|
.process(&mut add_token);
|
||||||
}
|
}
|
||||||
|
|
||||||
assert_eq!(tokens.len(), 4);
|
assert_eq!(tokens.len(), 4);
|
||||||
assert_token(&tokens[0], 0, "hello", 0, 5);
|
assert_token(&tokens[0], 0, "hello", 0, 5);
|
||||||
assert_token(&tokens[1], 1, "happi", 7, 12);
|
assert_token(&tokens[1], 1, "happi", 7, 12);
|
||||||
@@ -221,6 +224,33 @@ pub mod tests {
|
|||||||
assert_token(&tokens[3], 3, "payer", 17, 22);
|
assert_token(&tokens[3], 3, "payer", 17, 22);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_non_en_tokenizer() {
|
||||||
|
let tokenizer_manager = TokenizerManager::default();
|
||||||
|
tokenizer_manager.register(
|
||||||
|
"es_stem",
|
||||||
|
SimpleTokenizer
|
||||||
|
.filter(RemoveLongFilter::limit(40))
|
||||||
|
.filter(LowerCaser)
|
||||||
|
.filter(Stemmer::new(Language::Spanish)),
|
||||||
|
);
|
||||||
|
let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
|
||||||
|
let mut tokens: Vec<Token> = vec![];
|
||||||
|
{
|
||||||
|
let mut add_token = |token: &Token| {
|
||||||
|
tokens.push(token.clone());
|
||||||
|
};
|
||||||
|
en_tokenizer
|
||||||
|
.token_stream("Hola, feliz contribuyente!")
|
||||||
|
.process(&mut add_token);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_eq!(tokens.len(), 3);
|
||||||
|
assert_token(&tokens[0], 0, "hola", 0, 4);
|
||||||
|
assert_token(&tokens[1], 1, "feliz", 6, 11);
|
||||||
|
assert_token(&tokens[2], 2, "contribuyent", 12, 25);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_tokenizer_empty() {
|
fn test_tokenizer_empty() {
|
||||||
let tokenizer_manager = TokenizerManager::default();
|
let tokenizer_manager = TokenizerManager::default();
|
||||||
|
|||||||
@@ -4,22 +4,78 @@ use super::{Token, TokenFilter, TokenStream};
|
|||||||
use rust_stemmers::{self, Algorithm};
|
use rust_stemmers::{self, Algorithm};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
/// `Stemmer` token filter. Currently only English is supported.
|
/// Available stemmer languages.
|
||||||
/// Tokens are expected to be lowercased beforehands.
|
#[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
|
||||||
|
#[allow(missing_docs)]
|
||||||
|
pub enum Language {
|
||||||
|
Arabic,
|
||||||
|
Danish,
|
||||||
|
Dutch,
|
||||||
|
English,
|
||||||
|
Finnish,
|
||||||
|
French,
|
||||||
|
German,
|
||||||
|
Greek,
|
||||||
|
Hungarian,
|
||||||
|
Italian,
|
||||||
|
Portuguese,
|
||||||
|
Romanian,
|
||||||
|
Russian,
|
||||||
|
Spanish,
|
||||||
|
Swedish,
|
||||||
|
Tamil,
|
||||||
|
Turkish,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Language {
|
||||||
|
fn algorithm(self) -> Algorithm {
|
||||||
|
use self::Language::*;
|
||||||
|
match self {
|
||||||
|
Arabic => Algorithm::Arabic,
|
||||||
|
Danish => Algorithm::Danish,
|
||||||
|
Dutch => Algorithm::Dutch,
|
||||||
|
English => Algorithm::English,
|
||||||
|
Finnish => Algorithm::Finnish,
|
||||||
|
French => Algorithm::French,
|
||||||
|
German => Algorithm::German,
|
||||||
|
Greek => Algorithm::Greek,
|
||||||
|
Hungarian => Algorithm::Hungarian,
|
||||||
|
Italian => Algorithm::Italian,
|
||||||
|
Portuguese => Algorithm::Portuguese,
|
||||||
|
Romanian => Algorithm::Romanian,
|
||||||
|
Russian => Algorithm::Russian,
|
||||||
|
Spanish => Algorithm::Spanish,
|
||||||
|
Swedish => Algorithm::Swedish,
|
||||||
|
Tamil => Algorithm::Tamil,
|
||||||
|
Turkish => Algorithm::Turkish,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `Stemmer` token filter. Several languages are supported, see `Language` for the available
|
||||||
|
/// languages.
|
||||||
|
/// Tokens are expected to be lowercased beforehand.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Stemmer {
|
pub struct Stemmer {
|
||||||
stemmer_algorithm: Arc<Algorithm>,
|
stemmer_algorithm: Arc<Algorithm>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Stemmer {
|
impl Stemmer {
|
||||||
/// Creates a new Stemmer `TokenFilter`.
|
/// Creates a new Stemmer `TokenFilter` for a given language algorithm.
|
||||||
pub fn new() -> Stemmer {
|
pub fn new(language: Language) -> Stemmer {
|
||||||
Stemmer {
|
Stemmer {
|
||||||
stemmer_algorithm: Arc::new(Algorithm::English),
|
stemmer_algorithm: Arc::new(language.algorithm()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Default for Stemmer {
|
||||||
|
/// Creates a new Stemmer `TokenFilter` for English.
|
||||||
|
fn default() -> Self {
|
||||||
|
Stemmer::new(Language::English)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
||||||
where
|
where
|
||||||
TailTokenStream: TokenStream,
|
TailTokenStream: TokenStream,
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
|
|||||||
/// let en_stem = SimpleTokenizer
|
/// let en_stem = SimpleTokenizer
|
||||||
/// .filter(RemoveLongFilter::limit(40))
|
/// .filter(RemoveLongFilter::limit(40))
|
||||||
/// .filter(LowerCaser)
|
/// .filter(LowerCaser)
|
||||||
/// .filter(Stemmer::new());
|
/// .filter(Stemmer::default());
|
||||||
/// # }
|
/// # }
|
||||||
/// ```
|
/// ```
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use tokenizer::box_tokenizer;
|
use tokenizer::box_tokenizer;
|
||||||
|
use tokenizer::stemmer::Language;
|
||||||
use tokenizer::BoxedTokenizer;
|
use tokenizer::BoxedTokenizer;
|
||||||
use tokenizer::LowerCaser;
|
use tokenizer::LowerCaser;
|
||||||
use tokenizer::RawTokenizer;
|
use tokenizer::RawTokenizer;
|
||||||
@@ -71,7 +72,7 @@ impl Default for TokenizerManager {
|
|||||||
SimpleTokenizer
|
SimpleTokenizer
|
||||||
.filter(RemoveLongFilter::limit(40))
|
.filter(RemoveLongFilter::limit(40))
|
||||||
.filter(LowerCaser)
|
.filter(LowerCaser)
|
||||||
.filter(Stemmer::new()),
|
.filter(Stemmer::new(Language::English)),
|
||||||
);
|
);
|
||||||
manager
|
manager
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user