Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-06 17:22:54 +00:00

Compare commits: softcommit...perf/exper (22 commits)
| Author | SHA1 | Date |
|---|---|---|
| | a7a98b11d7 | |
| | a18932165f | |
| | 8f82d0b773 | |
| | 7102b363f5 | |
| | 66b4615e4e | |
| | 3df037961f | |
| | dac50c6aeb | |
| | 31b22c5acc | |
| | 96a4f503ec | |
| | 9df288b0c9 | |
| | b7c2d0de97 | |
| | 62445e0ec8 | |
| | a228825462 | |
| | d3eabd14bc | |
| | c967031d21 | |
| | d823163d52 | |
| | c4f59f202d | |
| | acd29b535d | |
| | 2cd31bcda2 | |
| | 99870de55c | |
| | cad2d91845 | |
| | 79f3cd6cf4 | |
.travis.yml

```diff
@@ -29,7 +29,7 @@ addons:
 matrix:
   include:
     # Android
-    - env: TARGET=aarch64-linux-android DISABLE_TESTS
+    - env: TARGET=aarch64-linux-android DISABLE_TESTS=1
     #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
     #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
     #- env: TARGET=i686-linux-android DISABLE_TESTS=1
@@ -68,6 +68,11 @@ cache: cargo
 before_cache:
   # Travis can't cache files that are not readable by "others"
   - chmod -R a+r $HOME/.cargo
+  - find ./target/debug -type f -maxdepth 1 -delete
+  - rm -f ./target/.rustc_info.json
+  - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
+  - rm -r target/debug/examples/
+  - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete

 #branches:
 #  only:
```
CHANGELOG.md (61 lines changed)

````diff
@@ -1,3 +1,35 @@
+Tantivy 0.10.0
+=====================
+
+*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
+
+- Added an ASCII folding filter (@drusellers)
+- Bugfix in `query.count` in presence of deletes (@pmasurel)
+
+Minor
+---------
+- Small simplification of the code.
+Calling .freq() or .doc() when .advance() has never been called
+on segment postings should panic from now on.
+- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
+- Fast fields are now preloaded when the `SegmentReader` is created.
+
+## How to update?
+
+Your existing indexes are usable as is. You may or may not need some
+trivial updates.
+
+### Fast fields
+
+Fast fields used to be accessed directly from the `SegmentReader`.
+The API changed; you are now required to acquire your fast field reader via
+`segment_reader.fast_fields()`, and use one of the typed methods:
+- `.u64()`, `.i64()` if your field is single-valued;
+- `.u64s()`, `.i64s()` if your field is multi-valued;
+- `.bytes()` if your field is a bytes fast field.
+
+
 Tantivy 0.9.0
 =====================
 *0.9.0 index format is not compatible with the
@@ -17,6 +49,35 @@ previous index format.*
 - Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
 - SIMD linear search within blocks (@fulmicoton)
+
+## How to update ?
+
+tantivy 0.9 brought some API breaking changes.
+To update from tantivy 0.8, you will need to go through the following steps.
+
+- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::STORED`.
+- The index now does not hold the pool of searchers anymore. You are required to create an intermediary object called
+`IndexReader` for this.
+
+```rust
+// Create the reader. You typically need to create one reader for the
+// entire lifetime of your program.
+let reader = index.reader()?;
+
+// Acquiring a searcher (previously `index.searcher()`) is now written:
+let searcher = reader.searcher();
+
+// With the default settings of the reader, you are not required to
+// call `index.load_searchers()` anymore.
+//
+// The IndexReader will pick up index changes automatically, regardless
+// of whether the update was done in a different process or not.
+// If this behavior is not wanted, you can create your reader with
+// `ReloadPolicy::Manual`, and manually decide when to reload the index
+// by calling `reader.reload()?`.
+```
+
 Tantivy 0.8.2
 =====================
 Fixing build for x86_64 platforms. (#496)
````
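
To make the fast-field migration described in the changelog concrete, here is a minimal, self-contained sketch of the new accessor chain. It assumes the tantivy 0.10-era APIs shown in this diff; the schema and the `popularity` field are hypothetical.

```rust
#[macro_use]
extern crate tantivy;

use tantivy::schema::{Schema, FAST};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Hypothetical schema with one single-valued u64 fast field.
    let mut schema_builder = Schema::builder();
    let popularity = schema_builder.add_u64_field("popularity", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
    index_writer.add_document(doc!(popularity => 10u64));
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);

    // 0.9: segment_reader.fast_field_reader::<u64>(popularity)?
    // 0.10: go through the preloaded `FastFieldReaders`, pick a typed accessor.
    let popularity_reader = segment_reader
        .fast_fields()
        .u64(popularity)
        .expect("declared as a u64 fast field above");
    assert_eq!(popularity_reader.get(0), 10u64);
    Ok(())
}
```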

Cargo.toml

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.9.0"
+version = "0.10.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -23,7 +23,7 @@ snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
 tempfile = "3.0"
 log = "0.4"
-combine = "3"
+combine = ">=3.6.0,<4.0.0"
 tempdir = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
```

```diff
@@ -18,8 +18,8 @@ use tantivy::fastfield::FastFieldReader;
 use tantivy::query::QueryParser;
 use tantivy::schema::Field;
 use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
-use tantivy::Index;
 use tantivy::SegmentReader;
+use tantivy::{Index, TantivyError};

 #[derive(Default)]
 struct Stats {
@@ -75,9 +75,18 @@ impl Collector for StatsCollector {
     fn for_segment(
         &self,
         _segment_local_id: u32,
-        segment: &SegmentReader,
+        segment_reader: &SegmentReader,
     ) -> tantivy::Result<StatsSegmentCollector> {
-        let fast_field_reader = segment.fast_field_reader(self.field)?;
+        let fast_field_reader = segment_reader
+            .fast_fields()
+            .u64(self.field)
+            .ok_or_else(|| {
+                let field_name = segment_reader.schema().get_field_name(self.field);
+                TantivyError::SchemaError(format!(
+                    "Field {:?} is not a u64 fast field.",
+                    field_name
+                ))
+            })?;
         Ok(StatsSegmentCollector {
             fast_field_reader,
             stats: Stats::default(),
```
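
The lookup-plus-`ok_or_else` pattern above is worth isolating, since several collectors in this changeset repeat it. A sketch of the same idea as a standalone helper (the function name is mine, not part of the diff):

```rust
use tantivy::fastfield::FastFieldReader;
use tantivy::schema::Field;
use tantivy::{SegmentReader, TantivyError};

/// Resolve a u64 fast field up front, turning a missing or mistyped
/// field into a `SchemaError` that names the field, instead of a
/// panic later at collection time.
fn u64_fast_field(
    segment_reader: &SegmentReader,
    field: Field,
) -> tantivy::Result<FastFieldReader<u64>> {
    segment_reader.fast_fields().u64(field).ok_or_else(|| {
        let field_name = segment_reader.schema().get_field_name(field);
        TantivyError::SchemaError(format!("Field {:?} is not a u64 fast field.", field_name))
    })
}
```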

```diff
@@ -17,6 +17,7 @@ use Result;
 use Score;
 use SegmentLocalId;
 use SegmentReader;
+use TantivyError;

 struct Hit<'a> {
     count: u64,
@@ -264,7 +265,10 @@ impl Collector for FacetCollector {
         _: SegmentLocalId,
         reader: &SegmentReader,
     ) -> Result<FacetSegmentCollector> {
-        let facet_reader = reader.facet_reader(self.field)?;
+        let field_name = reader.schema().get_field_name(self.field);
+        let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
+            TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
+        })?;

         let mut collapse_mapping = Vec::new();
         let mut counts = Vec::new();
```

```diff
@@ -2,6 +2,7 @@ use super::Collector;
 use super::SegmentCollector;
 use collector::Fruit;
 use std::marker::PhantomData;
+use std::ops::Deref;
 use DocId;
 use Result;
 use Score;
@@ -199,7 +200,10 @@ impl<'a> Collector for MultiCollector<'a> {
     }

     fn requires_scoring(&self) -> bool {
-        self.collector_wrappers.iter().any(|c| c.requires_scoring())
+        self.collector_wrappers
+            .iter()
+            .map(Deref::deref)
+            .any(Collector::requires_scoring)
     }

     fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
```

```diff
@@ -114,11 +114,15 @@ impl Collector for FastFieldTestCollector {
     fn for_segment(
         &self,
         _: SegmentLocalId,
-        reader: &SegmentReader,
+        segment_reader: &SegmentReader,
     ) -> Result<FastFieldSegmentCollector> {
+        let reader = segment_reader
+            .fast_fields()
+            .u64(self.field)
+            .expect("Requested field is not a fast field.");
         Ok(FastFieldSegmentCollector {
             vals: Vec::new(),
-            reader: reader.fast_field_reader(self.field)?,
+            reader,
         })
     }

@@ -170,11 +174,14 @@ impl Collector for BytesFastFieldTestCollector {
     fn for_segment(
         &self,
         _segment_local_id: u32,
-        segment: &SegmentReader,
+        segment_reader: &SegmentReader,
     ) -> Result<BytesFastFieldSegmentCollector> {
         Ok(BytesFastFieldSegmentCollector {
             vals: Vec::new(),
-            reader: segment.bytes_fast_field_reader(self.field)?,
+            reader: segment_reader
+                .fast_fields()
+                .bytes(self.field)
+                .expect("Field is not a bytes fast field."),
         })
     }

@@ -191,7 +198,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
     type Fruit = Vec<u8>;

     fn collect(&mut self, doc: u32, _score: f32) {
-        let data = self.reader.get_val(doc);
+        let data = self.reader.get_bytes(doc);
         self.vals.extend(data);
     }

```

```diff
@@ -98,11 +98,11 @@ where
             .collect())
     }

-    pub(crate) fn for_segment(
+    pub(crate) fn for_segment<F: PartialOrd>(
         &self,
         segment_id: SegmentLocalId,
         _: &SegmentReader,
-    ) -> Result<TopSegmentCollector<T>> {
+    ) -> Result<TopSegmentCollector<F>> {
         Ok(TopSegmentCollector::new(segment_id, self.limit))
     }
 }
```

```diff
@@ -5,10 +5,12 @@ use collector::SegmentCollector;
 use fastfield::FastFieldReader;
 use fastfield::FastValue;
 use schema::Field;
+use std::marker::PhantomData;
 use DocAddress;
 use Result;
 use SegmentLocalId;
 use SegmentReader;
+use TantivyError;

 /// The Top Field Collector keeps track of the K documents
 /// sorted by a fast field in the index
@@ -106,8 +108,15 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
         reader: &SegmentReader,
     ) -> Result<TopFieldSegmentCollector<T>> {
         let collector = self.collector.for_segment(segment_local_id, reader)?;
-        let reader = reader.fast_field_reader(self.field)?;
-        Ok(TopFieldSegmentCollector { collector, reader })
+        let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
+            let field_name = reader.schema().get_field_name(self.field);
+            TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
+        })?;
+        Ok(TopFieldSegmentCollector {
+            collector,
+            reader,
+            _type: PhantomData,
+        })
     }

     fn requires_scoring(&self) -> bool {
@@ -122,9 +131,10 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
     }
 }

-pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
-    collector: TopSegmentCollector<T>,
-    reader: FastFieldReader<T>,
+pub struct TopFieldSegmentCollector<T> {
+    collector: TopSegmentCollector<u64>,
+    reader: FastFieldReader<u64>,
+    _type: PhantomData<T>,
 }

 impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
@@ -138,7 +148,11 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
     }

     fn harvest(self) -> Vec<(T, DocAddress)> {
-        self.collector.harvest()
+        self.collector
+            .harvest()
+            .into_iter()
+            .map(|(val, doc_address)| (T::from_u64(val), doc_address))
+            .collect()
     }
 }

@@ -235,7 +249,7 @@ mod tests {
         .for_segment(0, segment)
         .map(|_| ())
         .unwrap_err(),
-        TantivyError::FastFieldError(_)
+        TantivyError::SchemaError(_)
     );
 }
```
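
The struct change above works because every fast value is now stored internally as a `u64` and only mapped back to `T` with `T::from_u64` at harvest time; ordering is preserved because the i64-to-u64 conversion is monotonic. A standalone sketch of the standard sign-bit-flip mapping this relies on (the helper names here are illustrative; tantivy exposes the real conversions through `FastValue::as_u64`/`from_u64`):

```rust
// Flipping the sign bit maps i64 onto u64 while preserving order:
// i64::MIN -> 0, -1 -> 0x7FFF..., 0 -> 0x8000..., i64::MAX -> u64::MAX.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    let mut signed = vec![i64::min_value(), -5, 0, 3, i64::max_value()];
    signed.sort();
    let unsigned: Vec<u64> = signed.iter().map(|&v| i64_to_u64(v)).collect();
    // Sorting by the u64 image ranks documents exactly like sorting by the
    // original i64 values, so a single u64 collector can serve both types.
    assert!(unsigned.windows(2).all(|w| w[0] <= w[1]));
    assert!(signed.iter().cloned().eq(unsigned.into_iter().map(u64_to_i64)));
}
```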

```diff
@@ -13,7 +13,6 @@ pub use self::serialize::{BinarySerializable, FixedSize};
 pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
 pub use byteorder::LittleEndian as Endianness;
-

 /// Segment's max doc must be `< MAX_DOC_LIMIT`.
 ///
 /// We do not allow segments with more than
```

```diff
@@ -340,7 +340,7 @@ impl Index {
         Ok(self
             .searchable_segment_metas()?
             .iter()
-            .map(|segment_meta| segment_meta.id())
+            .map(SegmentMeta::id)
             .collect())
     }
 }
```

```diff
@@ -59,7 +59,7 @@ impl Searcher {
     ) -> Searcher {
         let store_readers = segment_readers
             .iter()
-            .map(|segment_reader| segment_reader.get_store_reader())
+            .map(SegmentReader::get_store_reader)
             .collect();
         Searcher {
             schema,
@@ -218,7 +218,7 @@ impl fmt::Debug for Searcher {
         let segment_ids = self
             .segment_readers
             .iter()
-            .map(|segment_reader| segment_reader.segment_id())
+            .map(SegmentReader::segment_id)
             .collect::<Vec<_>>();
         write!(f, "Searcher({:?})", segment_ids)
     }
```

```diff
@@ -5,14 +5,10 @@ use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
 use directory::ReadOnlySource;
-use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;
-use fastfield::FastFieldReader;
-use fastfield::{self, FastFieldNotAvailableError};
-use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
+use fastfield::FastFieldReaders;
 use fieldnorm::FieldNormReader;
-use schema::Cardinality;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
@@ -51,7 +47,7 @@ pub struct SegmentReader {
     postings_composite: CompositeFile,
     positions_composite: CompositeFile,
     positions_idx_composite: CompositeFile,
-    fast_fields_composite: CompositeFile,
+    fast_fields_readers: Arc<FastFieldReaders>,
     fieldnorms_composite: CompositeFile,

     store_source: ReadOnlySource,
@@ -105,93 +101,21 @@ impl SegmentReader {
     ///
     /// # Panics
     /// May panic if the index is corrupted.
-    pub fn fast_field_reader<Item: FastValue>(
-        &self,
-        field: Field,
-    ) -> fastfield::Result<FastFieldReader<Item>> {
-        let field_entry = self.schema.get_field_entry(field);
-        if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
-        {
-            self.fast_fields_composite
-                .open_read(field)
-                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                .map(FastFieldReader::open)
-        } else {
-            Err(FastFieldNotAvailableError::new(field_entry))
-        }
-    }
-
-    pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
-        &self,
-        field: Field,
-        idx: usize,
-    ) -> fastfield::Result<FastFieldReader<Item>> {
-        if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
-            Ok(FastFieldReader::open(ff_source))
-        } else {
-            let field_entry = self.schema.get_field_entry(field);
-            Err(FastFieldNotAvailableError::new(field_entry))
-        }
-    }
-
-    /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
-    /// May panick if the field is not a multivalued fastfield of the type `Item`.
-    pub fn multi_fast_field_reader<Item: FastValue>(
-        &self,
-        field: Field,
-    ) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
-        let field_entry = self.schema.get_field_entry(field);
-        if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
-        {
-            let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
-            let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
-            Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
-        } else {
-            Err(FastFieldNotAvailableError::new(field_entry))
-        }
-    }
-
-    /// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
-    pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
-        let field_entry = self.schema.get_field_entry(field);
-        match *field_entry.field_type() {
-            FieldType::Bytes => {}
-            _ => return Err(FastFieldNotAvailableError::new(field_entry)),
-        }
-        let idx_reader = self
-            .fast_fields_composite
-            .open_read_with_idx(field, 0)
-            .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-            .map(FastFieldReader::open)?;
-        let values = self
-            .fast_fields_composite
-            .open_read_with_idx(field, 1)
-            .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
-        Ok(BytesFastFieldReader::open(idx_reader, values))
+    pub fn fast_fields(&self) -> &FastFieldReaders {
+        &self.fast_fields_readers
     }

     /// Accessor to the `FacetReader` associated to a given `Field`.
-    pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
+    pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
         let field_entry = self.schema.get_field_entry(field);
         if field_entry.field_type() != &FieldType::HierarchicalFacet {
-            return Err(TantivyError::InvalidArgument(format!(
-                "The field {:?} is not a \
-                 hierarchical facet.",
-                field_entry
-            )));
+            return None;
         }
-        let term_ords_reader = self.multi_fast_field_reader(field)?;
-        let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
-            TantivyError::InvalidArgument(format!(
-                "The field \"{}\" is a hierarchical \
-                 but this segment does not seem to have the field term \
-                 dictionary.",
-                field_entry.name()
-            ))
-        })?;
+        let term_ords_reader = self.fast_fields().u64s(field)?;
+        let termdict_source = self.termdict_composite.open_read(field)?;
         let termdict = TermDictionary::from_source(&termdict_source);
         let facet_reader = FacetReader::new(term_ords_reader, termdict);
-        Ok(facet_reader)
+        Some(facet_reader)
     }

     /// Accessor to the segment's `Field norms`'s reader.
@@ -247,8 +171,12 @@ impl SegmentReader {
         };

+        let schema = segment.schema();
+
         let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
         let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
+        let fast_field_readers =
+            Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);

         let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
         let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
@@ -260,14 +188,13 @@ impl SegmentReader {
             None
         };

-        let schema = segment.schema();
         Ok(SegmentReader {
             inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
             max_doc: segment.meta().max_doc(),
             num_docs: segment.meta().num_docs(),
             termdict_composite,
             postings_composite,
-            fast_fields_composite,
+            fast_fields_readers: fast_field_readers,
             fieldnorms_composite,
             segment_id: segment.id(),
             store_source,
@@ -381,12 +308,12 @@ impl SegmentReader {
             self.postings_composite.space_usage(),
             self.positions_composite.space_usage(),
             self.positions_idx_composite.space_usage(),
-            self.fast_fields_composite.space_usage(),
+            self.fast_fields_readers.space_usage(),
             self.fieldnorms_composite.space_usage(),
             self.get_store_reader().space_usage(),
             self.delete_bitset_opt
                 .as_ref()
-                .map(|x| x.space_usage())
+                .map(DeleteBitSet::space_usage)
                 .unwrap_or(0),
         )
     }
```

```diff
@@ -205,16 +205,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
     /// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
     /// `OnCommit` `ReloadPolicy` to work properly.
     fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
-
-    /// Ensure that all volatile files reach are persisted (in directory where that makes sense.)
-    ///
-    /// In order to make Near Real Time efficient, tantivy introduced the notion of soft_commit vs
-    /// commit. Commit will call `.flush()`, while softcommit won't.
-    ///
-    /// `meta.json` should be the last file to be flushed.
-    fn flush(&self) -> io::Result<()> {
-        Ok(())
-    }
 }

 /// DirectoryClone
```

```diff
@@ -368,7 +368,7 @@ impl Drop for ReleaseLockFile {

 /// This Write wraps a File, but has the specificity of
 /// call `sync_all` on flush.
-pub struct SafeFileWriter(File);
+struct SafeFileWriter(File);

 impl SafeFileWriter {
     fn new(file: File) -> SafeFileWriter {
```

```diff
@@ -13,7 +13,6 @@ mod managed_directory;
 mod ram_directory;
 mod read_only_source;
 mod watch_event_router;
-mod nrt_directory;

 /// Errors specific to the directory module.
 pub mod error;
```

src/directory/nrt_directory.rs (deleted)

```diff
@@ -1,195 +0,0 @@
-use directory::Directory;
-use std::path::{PathBuf, Path};
-use directory::ReadOnlySource;
-use directory::error::OpenReadError;
-use directory::error::DeleteError;
-use std::io::{BufWriter, Cursor};
-use directory::SeekableWrite;
-use directory::error::OpenWriteError;
-use directory::WatchHandle;
-use directory::ram_directory::InnerRamDirectory;
-use std::sync::RwLock;
-use std::sync::Arc;
-use directory::WatchCallback;
-use std::fmt;
-use std::io;
-use std::io::{Seek, Write};
-use directory::DirectoryClone;
-
-const BUFFER_LEN: usize = 1_000_000;
-
-pub enum NRTWriter {
-    InRam {
-        buffer: Cursor<Vec<u8>>,
-        path: PathBuf,
-        nrt_directory: NRTDirectory
-    },
-    UnderlyingFile(BufWriter<Box<SeekableWrite>>)
-}
-
-impl NRTWriter {
-    pub fn new(path: PathBuf, nrt_directory: NRTDirectory) -> NRTWriter {
-        NRTWriter::InRam {
-            buffer: Cursor::new(Vec::with_capacity(BUFFER_LEN)),
-            path,
-            nrt_directory,
-        }
-    }
-}
-
-impl io::Seek for NRTWriter {
-    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
-        match self {
-            NRTWriter::InRam { buffer, path, nrt_directory } => {
-                buffer.seek(pos)
-            }
-            NRTWriter::UnderlyingFile(file) => {
-                file.seek(pos)
-            }
-        }
-    }
-}
-
-impl io::Write for NRTWriter {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        self.write_all(buf)?;
-        Ok(buf.len())
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            NRTWriter::InRam { buffer, path, nrt_directory } => {
-                let mut cache_wlock = nrt_directory.cache.write().unwrap();
-                cache_wlock.write(path.clone(), buffer.get_ref());
-                Ok(())
-            }
-            NRTWriter::UnderlyingFile(file) => {
-                file.flush()
-            }
-        }
-    }
-
-    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
-        // Working around the borrow checker.
-        let mut underlying_write_opt: Option<BufWriter<Box<SeekableWrite>>> = None;
-        if let NRTWriter::InRam { buffer, path, nrt_directory } = self {
-            if buffer.get_ref().len() + buf.len() > BUFFER_LEN {
-                // We can't keep this in RAM. Let's move it to the underlying directory.
-                underlying_write_opt = Some(nrt_directory.open_write(path)
-                    .map_err(|open_err| {
-                        io::Error::new(io::ErrorKind::Other, open_err)
-                    })?);
-            }
-        }
-        if let Some(underlying_write) = underlying_write_opt {
-            *self = NRTWriter::UnderlyingFile(underlying_write);
-        }
-        match self {
-            NRTWriter::InRam { buffer, path, nrt_directory } => {
-                assert!(buffer.get_ref().len() + buf.len() <= BUFFER_LEN);
-                buffer.write_all(buf)
-            }
-            NRTWriter::UnderlyingFile(file) => {
-                file.write_all(buf)
-            }
-        }
-    }
-}
-
-pub struct NRTDirectory {
-    underlying: Box<Directory>,
-    cache: Arc<RwLock<InnerRamDirectory>>,
-}
-
-impl Clone for NRTDirectory {
-    fn clone(&self) -> Self {
-        NRTDirectory {
-            underlying: self.underlying.box_clone(),
-            cache: self.cache.clone()
-        }
-    }
-}
-
-impl NRTDirectory {
-    fn wrap(underlying: Box<Directory>) -> NRTDirectory {
-        NRTDirectory {
-            underlying,
-            cache: Default::default()
-        }
-    }
-}
-
-impl fmt::Debug for NRTDirectory {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "NRTDirectory({:?})", self.underlying)
-    }
-}
-
-impl Directory for NRTDirectory {
-    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
-        unimplemented!()
-    }
-
-    fn delete(&self, path: &Path) -> Result<(), DeleteError> {
-        // We explicitly release the lock, to prevent a panic on the underlying directory
-        // to poison the lock.
-        //
-        // File can only go from cache to underlying so the result does not lead to
-        // any inconsistency.
-        {
-            let mut cache_wlock = self.cache.write().unwrap();
-            if cache_wlock.exists(path) {
-                return cache_wlock.delete(path);
-            }
-        }
-        self.underlying.delete(path)
-    }
-
-    fn exists(&self, path: &Path) -> bool {
-        // We explicitly release the lock, to prevent a panic on the underlying directory
-        // to poison the lock.
-        //
-        // File can only go from cache to underlying so the result does not lead to
-        // any inconsistency.
-        {
-            let rlock_cache = self.cache.read().unwrap();
-            if rlock_cache.exists(path) {
-                return true;
-            }
-        }
-        self.underlying.exists(path)
-    }
-
-    fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
-        let mut cache_wlock = self.cache.write().unwrap();
-        // TODO might poison our lock. I don't know have a sound solution yet.
-        let path_buf = path.to_owned();
-        if self.underlying.exists(path) {
-            return Err(OpenWriteError::FileAlreadyExists(path_buf));
-        }
-        let exists = cache_wlock.write(path_buf.clone(), &[]);
-        // force the creation of the file to mimic the MMap directory.
-        if exists {
-            Err(OpenWriteError::FileAlreadyExists(path_buf))
-        } else {
-            let vec_writer = NRTWriter::new(path_buf.clone(), self.clone());
-            Ok(BufWriter::new(Box::new(vec_writer)))
-        }
-    }
-
-    fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
-        self.underlying.atomic_read(path)
-    }
-
-    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
-        self.underlying.atomic_write(path, data)
-    }
-
-    fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
-        self.underlying.watch(watch_callback)
-    }
-}
```

```diff
@@ -71,36 +71,36 @@ impl Write for VecWriter {
 }

 #[derive(Default)]
-pub(crate) struct InnerRamDirectory {
+struct InnerDirectory {
     fs: HashMap<PathBuf, ReadOnlySource>,
     watch_router: WatchCallbackList,
 }

-impl InnerRamDirectory {
-    pub fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
+impl InnerDirectory {
+    fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
         let data = ReadOnlySource::new(Vec::from(data));
         self.fs.insert(path, data).is_some()
     }

-    pub fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
+    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
         self.fs
             .get(path)
             .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
-            .map(|el| el.clone())
+            .map(Clone::clone)
     }

-    pub fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
+    fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
         match self.fs.remove(path) {
             Some(_) => Ok(()),
             None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
         }
     }

-    pub fn exists(&self, path: &Path) -> bool {
+    fn exists(&self, path: &Path) -> bool {
         self.fs.contains_key(path)
     }

-    pub fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
+    fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
         self.watch_router.subscribe(watch_handle)
     }
 }
@@ -118,7 +118,7 @@ impl fmt::Debug for RAMDirectory {
 ///
 #[derive(Clone, Default)]
 pub struct RAMDirectory {
-    fs: Arc<RwLock<InnerRamDirectory>>,
+    fs: Arc<RwLock<InnerDirectory>>,
 }

 impl RAMDirectory {
```

```diff
@@ -1,4 +1,5 @@
 use common::BitSet;
+use fastfield::DeleteBitSet;
 use std::borrow::Borrow;
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;
@@ -95,9 +96,23 @@ pub trait DocSet {
     }

     /// Returns the number documents matching.
-    ///
     /// Calling this method consumes the `DocSet`.
-    fn count(&mut self) -> u32 {
+    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
+        let mut count = 0u32;
+        while self.advance() {
+            if !delete_bitset.is_deleted(self.doc()) {
+                count += 1u32;
+            }
+        }
+        count
+    }
+
+    /// Returns the count of documents, deleted or not.
+    /// Calling this method consumes the `DocSet`.
+    ///
+    /// Of course, the result is an upper bound of the result
+    /// given by `count()`.
+    fn count_including_deleted(&mut self) -> u32 {
         let mut count = 0u32;
         while self.advance() {
             count += 1u32;
@@ -127,13 +142,18 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
         unboxed.size_hint()
     }

-    fn count(&mut self) -> u32 {
-        let unboxed: &mut TDocSet = self.borrow_mut();
-        unboxed.count()
-    }
-
     fn append_to_bitset(&mut self, bitset: &mut BitSet) {
         let unboxed: &mut TDocSet = self.borrow_mut();
         unboxed.append_to_bitset(bitset);
     }
+
+    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
+        let unboxed: &mut TDocSet = self.borrow_mut();
+        unboxed.count(delete_bitset)
+    }
+
+    fn count_including_deleted(&mut self) -> u32 {
+        let unboxed: &mut TDocSet = self.borrow_mut();
+        unboxed.count_including_deleted()
+    }
 }
```
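
From the caller's side, the new contract splits one method into two. A small sketch under the signatures above, assuming the usual `tantivy::DocSet` and `tantivy::fastfield::DeleteBitSet` re-exports (`DocSet` cursors are single-pass, so each count needs its own scorer):

```rust
use tantivy::fastfield::DeleteBitSet;
use tantivy::DocSet;

/// Count live and total matches. `count` skips documents flagged in the
/// delete bitset; `count_including_deleted` is the cheap upper bound.
fn live_and_total<D: DocSet>(mut fresh_a: D, mut fresh_b: D, deletes: &DeleteBitSet) -> (u32, u32) {
    (fresh_a.count(deletes), fresh_b.count_including_deleted())
}
```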

```diff
@@ -23,14 +23,14 @@ mod tests {
         index_writer.add_document(doc!(field=>vec![0u8; 1000]));
         assert!(index_writer.commit().is_ok());
         let searcher = index.reader().unwrap().searcher();
-        let reader = searcher.segment_reader(0);
-        let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
+        let segment_reader = searcher.segment_reader(0);
+        let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();

-        assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
-        assert!(bytes_reader.get_val(1).is_empty());
-        assert_eq!(bytes_reader.get_val(2), &[255u8]);
-        assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
+        assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
+        assert!(bytes_reader.get_bytes(1).is_empty());
+        assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
+        assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
         let long = vec![0u8; 1000];
-        assert_eq!(bytes_reader.get_val(4), long.as_slice());
+        assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
     }
 }
```

```diff
@@ -14,6 +14,7 @@ use DocId;
 ///
 /// Reading the value for a document is done by reading the start index for it,
 /// and the start index for the next document, and keeping the bytes in between.
+#[derive(Clone)]
 pub struct BytesFastFieldReader {
     idx_reader: FastFieldReader<u64>,
     values: OwningRef<ReadOnlySource, [u8]>,
@@ -28,10 +29,20 @@ impl BytesFastFieldReader {
         BytesFastFieldReader { idx_reader, values }
     }

-    /// Returns the bytes associated to the given `doc`
-    pub fn get_val(&self, doc: DocId) -> &[u8] {
+    fn range(&self, doc: DocId) -> (usize, usize) {
         let start = self.idx_reader.get(doc) as usize;
         let stop = self.idx_reader.get(doc + 1) as usize;
+        (start, stop)
+    }
+
+    /// Returns the bytes associated to the given `doc`
+    pub fn get_bytes(&self, doc: DocId) -> &[u8] {
+        let (start, stop) = self.range(doc);
         &self.values[start..stop]
     }
+
+    /// Returns the overall number of bytes in this bytes fast field.
+    pub fn total_num_bytes(&self) -> usize {
+        self.values.len()
+    }
 }
```
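
A short usage sketch for the renamed accessor and the new length helper, assuming the reader comes from `segment_reader.fast_fields().bytes(field)` and the usual re-export under `tantivy::fastfield` (the helper function is mine):

```rust
use tantivy::fastfield::BytesFastFieldReader;

/// Mean payload length per document. `total_num_bytes` covers the whole
/// values file, so no per-document scan is needed.
fn mean_bytes_per_doc(bytes_reader: &BytesFastFieldReader, max_doc: u32) -> f64 {
    if max_doc == 0 {
        return 0.0;
    }
    bytes_reader.total_num_bytes() as f64 / f64::from(max_doc)
}
```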

```diff
@@ -53,16 +53,18 @@ impl DeleteBitSet {
         }
     }

-    /// Returns whether the document has been marked as deleted.
+    /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
+    pub fn is_alive(&self, doc: DocId) -> bool {
+        !self.is_deleted(doc)
+    }
+
+    /// Returns true iff the document has been marked as deleted.
+    #[inline(always)]
     pub fn is_deleted(&self, doc: DocId) -> bool {
-        if self.len == 0 {
-            false
-        } else {
-            let byte_offset = doc / 8u32;
-            let b: u8 = (*self.data)[byte_offset as usize];
-            let shift = (doc & 7u32) as u8;
-            b & (1u8 << shift) != 0
-        }
+        let byte_offset = doc / 8u32;
+        let b: u8 = (*self.data)[byte_offset as usize];
+        let shift = (doc & 7u32) as u8;
+        b & (1u8 << shift) != 0
     }

     /// Summarize total space usage of this bitset.
```
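
To make the bit addressing concrete: document 11 lands in byte `11 / 8 = 1`, bit `11 & 7 = 3`. The `len == 0` shortcut can go because `SegmentReader` keeps the bitset as an `Option` (`delete_bitset_opt` above) and only constructs one when delete data exists. A self-contained model of the same arithmetic over a plain byte slice (not tantivy's type, which reads its bytes from a `ReadOnlySource`):

```rust
/// Same addressing as DeleteBitSet::is_deleted: one bit per DocId,
/// least-significant bit first within each byte.
fn is_deleted(data: &[u8], doc: u32) -> bool {
    let byte_offset = (doc / 8) as usize;
    let shift = (doc & 7) as u8;
    data[byte_offset] & (1u8 << shift) != 0
}

fn main() {
    // Doc 11 deleted: byte 1 carries bit 3 (0b0000_1000).
    let data = [0u8, 0b0000_1000];
    assert!(is_deleted(&data, 11));
    assert!(!is_deleted(&data, 10));
    // is_alive is just the negation.
    assert!(!is_deleted(&data, 0));
}
```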

```diff
@@ -30,6 +30,7 @@ pub use self::error::{FastFieldNotAvailableError, Result};
 pub use self::facet_reader::FacetReader;
 pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
 pub use self::reader::FastFieldReader;
+pub use self::readers::FastFieldReaders;
 pub use self::serializer::FastFieldSerializer;
 pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
 use common;
@@ -43,6 +44,7 @@ mod error;
 mod facet_reader;
 mod multivalued;
 mod reader;
+mod readers;
 mod serializer;
 mod writer;

@@ -78,10 +80,6 @@ impl FastValue for u64 {
         *self
     }

-    fn as_u64(&self) -> u64 {
-        *self
-    }
-
     fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
         match *field_type {
             FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
@@ -89,6 +87,10 @@ impl FastValue for u64 {
             _ => None,
         }
     }
+
+    fn as_u64(&self) -> u64 {
+        *self
+    }
 }

 impl FastValue for i64 {
```

```diff
@@ -37,9 +37,7 @@ mod tests {
         let searcher = index.reader().unwrap().searcher();
         let segment_reader = searcher.segment_reader(0);
         let mut vals = Vec::new();
-        let multi_value_reader = segment_reader
-            .multi_fast_field_reader::<u64>(field)
-            .unwrap();
+        let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
         {
             multi_value_reader.get_vals(2, &mut vals);
             assert_eq!(&vals, &[4u64]);
@@ -198,9 +196,9 @@ mod tests {
         assert!(index_writer.commit().is_ok());

         let searcher = index.reader().unwrap().searcher();
-        let reader = searcher.segment_reader(0);
+        let segment_reader = searcher.segment_reader(0);
         let mut vals = Vec::new();
-        let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
+        let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
         {
             multi_value_reader.get_vals(2, &mut vals);
             assert_eq!(&vals, &[-4i64]);
```

```diff
@@ -26,6 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
         }
     }

+    pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
+        MultiValueIntFastFieldReader {
+            idx_reader: self.idx_reader,
+            vals_reader: self.vals_reader.into_u64_reader(),
+        }
+    }
+
     /// Returns `(start, stop)`, such that the values associated
     /// to the given document are `start..stop`.
     fn range(&self, doc: DocId) -> (u64, u64) {
@@ -41,13 +48,24 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
         vals.resize(len, Item::default());
         self.vals_reader.get_range_u64(start, &mut vals[..]);
     }
+
+    /// Returns the number of values associated with the document `DocId`.
+    pub fn num_vals(&self, doc: DocId) -> usize {
+        let (start, stop) = self.range(doc);
+        (stop - start) as usize
+    }
+
+    /// Returns the overall number of values in this field.
+    pub fn total_num_vals(&self) -> u64 {
+        self.idx_reader.max_value()
+    }
 }

 #[cfg(test)]
 mod tests {

     use core::Index;
-    use schema::{Document, Facet, Schema};
+    use schema::{Facet, Schema};

     #[test]
     fn test_multifastfield_reader() {
```
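
A sketch of how the two new accessors relate to `get_vals` (the helper is hypothetical; the reader would come from `fast_fields().u64s(field)`):

```rust
use tantivy::fastfield::MultiValueIntFastFieldReader;

/// Fetch one document's values and cross-check the new `num_vals`
/// accessor, which derives the length from the index reader alone,
/// without touching the values file.
fn vals_of_doc(reader: &MultiValueIntFastFieldReader<u64>, doc: u32) -> Vec<u64> {
    let mut vals = Vec::new();
    reader.get_vals(doc, &mut vals);
    debug_assert_eq!(vals.len(), reader.num_vals(doc));
    vals
}
```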

```diff
@@ -58,22 +76,12 @@ mod tests {
         let mut index_writer = index
             .writer_with_num_threads(1, 30_000_000)
             .expect("Failed to create index writer.");
-        {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, "/category/cat2");
-            doc.add_facet(facet_field, "/category/cat1");
-            index_writer.add_document(doc);
-        }
-        {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, "/category/cat2");
-            index_writer.add_document(doc);
-        }
-        {
-            let mut doc = Document::new();
-            doc.add_facet(facet_field, "/category/cat3");
-            index_writer.add_document(doc);
-        }
+        index_writer.add_document(doc!(
+            facet_field => Facet::from("/category/cat2"),
+            facet_field => Facet::from("/category/cat1"),
+        ));
+        index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
+        index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
         index_writer.commit().expect("Commit failed");
         let searcher = index.reader().unwrap().searcher();
         let segment_reader = searcher.segment_reader(0);
```

```diff
@@ -50,6 +50,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
         }
     }

+    pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
+        FastFieldReader {
+            bit_unpacker: self.bit_unpacker,
+            min_value_u64: self.min_value_u64,
+            max_value_u64: self.max_value_u64,
+            _phantom: PhantomData,
+        }
+    }
+
     /// Return the value associated to the given document.
     ///
     /// This accessor should return as fast as possible.
```
191
src/fastfield/readers.rs
Normal file
191
src/fastfield/readers.rs
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
use common::CompositeFile;
|
||||||
|
use fastfield::BytesFastFieldReader;
|
||||||
|
use fastfield::MultiValueIntFastFieldReader;
|
||||||
|
use fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||||
|
use schema::{Cardinality, Field, FieldType, Schema};
|
||||||
|
use space_usage::PerFieldSpaceUsage;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use Result;
|
||||||
|
|
||||||
|
/// Provides access to all of the FastFieldReader.
|
||||||
|
///
|
||||||
|
/// Internally, `FastFieldReaders` have preloaded fast field readers,
|
||||||
|
/// and just wraps several `HashMap`.
|
||||||
|
pub struct FastFieldReaders {
|
||||||
|
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||||
|
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||||
|
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||||
|
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||||
|
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||||
|
fast_fields_composite: CompositeFile,
|
||||||
|
}
|
||||||
|
|
||||||
|
enum FastType {
|
||||||
|
I64,
|
||||||
|
U64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||||
|
match field_type {
|
||||||
|
FieldType::U64(options) => options
|
||||||
|
.get_fastfield_cardinality()
|
||||||
|
.map(|cardinality| (FastType::U64, cardinality)),
|
||||||
|
        FieldType::I64(options) => options
            .get_fastfield_cardinality()
            .map(|cardinality| (FastType::I64, cardinality)),
        FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
        _ => None,
    }
}

impl FastFieldReaders {
    pub(crate) fn load_all(
        schema: &Schema,
        fast_fields_composite: &CompositeFile,
    ) -> Result<FastFieldReaders> {
        let mut fast_field_readers = FastFieldReaders {
            fast_field_i64: Default::default(),
            fast_field_u64: Default::default(),
            fast_field_i64s: Default::default(),
            fast_field_u64s: Default::default(),
            fast_bytes: Default::default(),
            fast_fields_composite: fast_fields_composite.clone(),
        };
        for (field_id, field_entry) in schema.fields().iter().enumerate() {
            let field = Field(field_id as u32);
            let field_type = field_entry.field_type();
            if field_type == &FieldType::Bytes {
                let idx_reader = fast_fields_composite
                    .open_read_with_idx(field, 0)
                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
                    .map(FastFieldReader::open)?;
                let data = fast_fields_composite
                    .open_read_with_idx(field, 1)
                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
                fast_field_readers
                    .fast_bytes
                    .insert(field, BytesFastFieldReader::open(idx_reader, data));
            } else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
                match cardinality {
                    Cardinality::SingleValue => {
                        if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
                            match fast_type {
                                FastType::U64 => {
                                    let fast_field_reader = FastFieldReader::open(fast_field_data);
                                    fast_field_readers
                                        .fast_field_u64
                                        .insert(field, fast_field_reader);
                                }
                                FastType::I64 => {
                                    fast_field_readers.fast_field_i64.insert(
                                        field,
                                        FastFieldReader::open(fast_field_data.clone()),
                                    );
                                }
                            }
                        } else {
                            return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
                        }
                    }
                    Cardinality::MultiValues => {
                        let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
                        let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
                        if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
                            let idx_reader = FastFieldReader::open(fast_field_idx);
                            match fast_type {
                                FastType::I64 => {
                                    let vals_reader = FastFieldReader::open(fast_field_data);
                                    let multivalued_int_fast_field =
                                        MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
                                    fast_field_readers
                                        .fast_field_i64s
                                        .insert(field, multivalued_int_fast_field);
                                }
                                FastType::U64 => {
                                    let vals_reader = FastFieldReader::open(fast_field_data);
                                    let multivalued_int_fast_field =
                                        MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
                                    fast_field_readers
                                        .fast_field_u64s
                                        .insert(field, multivalued_int_fast_field);
                                }
                            }
                        } else {
                            return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
                        }
                    }
                }
            }
        }
        Ok(fast_field_readers)
    }

    pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
        self.fast_fields_composite.space_usage()
    }

    /// Returns the `u64` fast field reader associated to `field`.
    ///
    /// If `field` is not a u64 fast field, this method returns `None`.
    pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
        self.fast_field_u64.get(&field).cloned()
    }

    /// If the field is a u64 fast field, returns the associated reader.
    /// If the field is an i64 fast field, returns the associated u64 reader. Values are
    /// mapped from i64 to u64 using a monotonic mapping (which is unique).
    ///
    /// This method is useful when merging segment readers.
    pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
        if let Some(u64_ff_reader) = self.u64(field) {
            return Some(u64_ff_reader);
        }
        if let Some(i64_ff_reader) = self.i64(field) {
            return Some(i64_ff_reader.into_u64_reader());
        }
        None
    }

    /// Returns the `i64` fast field reader associated to `field`.
    ///
    /// If `field` is not an i64 fast field, this method returns `None`.
    pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
        self.fast_field_i64.get(&field).cloned()
    }

    /// Returns a `u64s` multi-valued fast field reader associated to `field`.
    ///
    /// If `field` is not a u64 multi-valued fast field, this method returns `None`.
    pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
        self.fast_field_u64s.get(&field).cloned()
    }

    /// If the field is a multi-valued u64 fast field, returns the associated reader.
    /// If the field is a multi-valued i64 fast field, returns the associated u64s reader.
    /// Values are mapped from i64 to u64 using a monotonic mapping (which is unique).
    ///
    /// This method is useful when merging segment readers.
    pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
        if let Some(u64s_ff_reader) = self.u64s(field) {
            return Some(u64s_ff_reader);
        }
        if let Some(i64s_ff_reader) = self.i64s(field) {
            return Some(i64s_ff_reader.into_u64s_reader());
        }
        None
    }

    /// Returns an `i64s` multi-valued fast field reader associated to `field`.
    ///
    /// If `field` is not an i64 multi-valued fast field, this method returns `None`.
    pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
        self.fast_field_i64s.get(&field).cloned()
    }

    /// Returns the `bytes` fast field reader associated to `field`.
    ///
    /// If `field` is not a bytes fast field, returns `None`.
    pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
        self.fast_bytes.get(&field).cloned()
    }
}
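For illustration, not part of the diff: a minimal sketch of the typed accessors above, assuming a schema with a single-valued u64 fast field named `score`; the schema setup (`add_u64_field`, the `FAST` option) is an assumption based on the 0.10-era API, everything else mirrors calls visible in this diff.

```rust
use tantivy::schema::{Schema, FAST};
use tantivy::{Document, Index};

fn main() -> tantivy::Result<()> {
    // Assumed schema setup: one u64 fast field.
    let mut schema_builder = Schema::builder();
    let score = schema_builder.add_u64_field("score", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(3_000_000)?;
    let mut document = Document::default();
    document.add_u64(score, 4000u64);
    index_writer.add_document(document);
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);
    // The typed accessor returns `None` if `score` were not a u64 fast field.
    let score_reader = segment_reader.fast_fields().u64(score).unwrap();
    assert_eq!(score_reader.get(0), 4000u64);
    Ok(())
}
```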
@@ -179,11 +179,6 @@ pub struct DeleteCursor {
 }
 
 impl DeleteCursor {
-
-    pub fn empty() -> DeleteCursor {
-        DeleteQueue::new().cursor()
-    }
-
     /// Skips operations and positions the cursor so that
     /// - either all of the delete operations currently in the
     /// queue are consumed and the next get will return None.
@@ -259,7 +259,7 @@ pub fn advance_deletes(
             write_delete_bitset(&delete_bitset, &mut delete_file)?;
         }
     }
-    segment_entry.set_meta(target_opstamp, segment.meta().clone());
+    segment_entry.set_meta(segment.meta().clone());
     Ok(())
 }
 
@@ -326,12 +326,7 @@ fn index_documents(
         // to even open the segment.
         None
    };
-    let segment_entry = SegmentEntry::new(
-        segment_meta,
-        delete_cursor,
-        delete_bitset_opt,
-        last_docstamp,
-    );
+    let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
    Ok(segment_updater.add_segment(generation, segment_entry))
 }
 
@@ -366,9 +361,9 @@ impl IndexWriter {
     }
 
     #[doc(hidden)]
-    pub fn add_segment(&mut self, segment_meta: SegmentMeta, opstamp: u64) {
+    pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
         let delete_cursor = self.delete_queue.cursor();
-        let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None, opstamp);
+        let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
         self.segment_updater
             .add_segment(self.generation, segment_entry);
     }
@@ -532,7 +527,7 @@ impl IndexWriter {
         //
         // This will reach an end as the only document_sender
         // was dropped with the index_writer.
-        for _ in document_receiver.iter() {}
+        for _ in document_receiver.clone() {}
 
         Ok(())
     }
@@ -559,16 +554,6 @@ impl IndexWriter {
     /// using this API.
     /// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
     pub fn prepare_commit(&mut self) -> Result<PreparedCommit> {
-        info!("Preparing commit");
-        self.prepare_commit_internal(false)
-    }
-
-    pub fn prepare_commit_soft(&mut self) -> Result<PreparedCommit> {
-        info!("Preparing soft commit");
-        self.prepare_commit_internal(true)
-    }
-
-    pub(crate) fn prepare_commit_internal(&mut self, soft: bool) -> Result<PreparedCommit> {
         // Here, because we join all of the worker threads,
         // all of the segment updates for this commit have been
         // sent.
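For context, a hedged sketch of the two-phase commit flow this API exposes: `prepare_commit()` joins the indexing workers and hands back a `PreparedCommit`, whose `commit()` returns the commit opstamp. The `set_payload` call is taken from the doc reference above; its exact signature is an assumption.

```rust
use tantivy::{IndexWriter, Result};

// A minimal sketch, not part of the diff: commit with an opaque payload.
fn checkpoint(index_writer: &mut IndexWriter) -> Result<u64> {
    let mut prepared = index_writer.prepare_commit()?; // joins all indexing workers
    prepared.set_payload("checkpoint");                // optional payload (assumed signature)
    prepared.commit()                                  // returns the commit opstamp
}
```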
@@ -591,13 +576,13 @@ impl IndexWriter {
             let indexing_worker_result = worker_handle
                 .join()
                 .map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
-            // add a new worker for the next generation, whether the worker failed or not.
-            self.add_indexing_worker()?;
             indexing_worker_result?;
+            // add a new worker for the next generation.
+            self.add_indexing_worker()?;
         }
 
         let commit_opstamp = self.stamper.stamp();
-        let prepared_commit = PreparedCommit::new(self, commit_opstamp, soft);
+        let prepared_commit = PreparedCommit::new(self, commit_opstamp);
         info!("Prepared commit {}", commit_opstamp);
         Ok(prepared_commit)
     }
@@ -620,11 +605,6 @@ impl IndexWriter {
         self.prepare_commit()?.commit()
     }
 
-
-    pub fn soft_commit(&mut self) -> Result<u64> {
-        self.prepare_commit_soft()?.commit()
-    }
-
     pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
         &self.segment_updater
     }
@@ -752,7 +732,6 @@ mod tests {
     use Index;
     use ReloadPolicy;
     use Term;
-    use IndexReader;
 
     #[test]
     fn test_operations_group() {
@@ -886,13 +865,6 @@ mod tests {
         let _index_writer_two = index.writer(3_000_000).unwrap();
     }
 
-    fn num_docs_containing_text(reader: &IndexReader, term: &str) -> u64 {
-        let searcher = reader.searcher();
-        let text_field = reader.schema().get_field("text").unwrap();
-        let term = Term::from_field_text(text_field, term);
-        searcher.doc_freq(&term)
-    }
-
     #[test]
     fn test_commit_and_rollback() {
         let mut schema_builder = schema::Schema::builder();
@@ -909,12 +881,9 @@ mod tests {
             searcher.doc_freq(&term)
         };
 
-        let mut index_writer = index.writer(3_000_000).unwrap();
-
-        assert_eq!(index_writer.commit_opstamp(), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0);
         {
             // writing the segment
+            let mut index_writer = index.writer(3_000_000).unwrap();
             index_writer.add_document(doc!(text_field=>"a"));
             index_writer.rollback().unwrap();
             assert_eq!(index_writer.commit_opstamp(), 0u64);
@@ -933,35 +902,6 @@ mod tests {
         reader.searcher();
     }
 
-    #[test]
-    fn test_softcommit_and_rollback() {
-        let mut schema_builder = schema::Schema::builder();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader().unwrap();
-        // writing the segment
-        let mut index_writer = index.writer(3_000_000).unwrap();
-        index_writer.add_document(doc!(text_field=>"a"));
-        index_writer.rollback().unwrap();
-
-        assert_eq!(index_writer.commit_opstamp(), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
-        {
-            index_writer.add_document(doc!(text_field=>"b"));
-            index_writer.add_document(doc!(text_field=>"c"));
-        }
-        assert!(index_writer.soft_commit().is_ok());
-        reader.reload().unwrap(); // we need to load soft committed stuff.
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "b"), 1u64);
-        assert_eq!(num_docs_containing_text(&reader, "c"), 1u64);
-        index_writer.rollback().unwrap();
-        reader.reload().unwrap();
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "b"), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "c"), 0u64);
-    }
-
     #[test]
     fn test_with_merges() {
         let mut schema_builder = schema::Schema::builder();
@@ -995,7 +935,7 @@ mod tests {
 
             reader.reload().unwrap();
 
-            assert_eq!(num_docs_containing_text(&reader, "a"), 200);
+            assert_eq!(num_docs_containing("a"), 200);
             assert!(index.searchable_segments().unwrap().len() < 8);
         }
     }
@@ -1038,7 +978,7 @@ mod tests {
         let mut schema_builder = schema::Schema::builder();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader();
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
 
         let mut size_sorted_tuples = segments
             .iter()
-            .map(|x| x.num_docs())
+            .map(SegmentMeta::num_docs)
             .enumerate()
             .collect::<Vec<(usize, u32)>>();
 
@@ -3,6 +3,7 @@ use core::Segment;
 use core::SegmentReader;
 use core::SerializableSegment;
 use docset::DocSet;
+use fastfield::BytesFastFieldReader;
 use fastfield::DeleteBitSet;
 use fastfield::FastFieldReader;
 use fastfield::FastFieldSerializer;
@@ -72,7 +73,7 @@ fn compute_min_max_val(
         // some deleted documents,
         // we need to recompute the max / min
         (0..max_doc)
-            .filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
+            .filter(|doc_id| delete_bitset.is_alive(*doc_id))
             .map(|doc_id| u64_reader.get(doc_id))
             .minmax()
             .into_option()
@@ -239,7 +240,10 @@ impl IndexMerger {
         let mut max_value = u64::min_value();
 
         for reader in &self.readers {
-            let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?;
+            let u64_reader: FastFieldReader<u64> = reader
+                .fast_fields()
+                .u64_lenient(field)
+                .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
             if let Some((seg_min_val, seg_max_val)) =
                 compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
             {
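`u64_lenient` works because i64 values are stored through an order-preserving i64-to-u64 mapping, so min/max computed on the u64 representation stay valid during a merge. A sketch of one standard such mapping (flipping the sign bit); this illustrates the idea and is not necessarily tantivy's exact implementation:

```rust
// Order-preserving i64 <-> u64 mapping: flip the sign bit.
// Illustrative only; the mapping tantivy actually uses may differ in detail.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    assert_eq!(i64_to_u64(i64::min_value()), 0u64); // smallest i64 maps to smallest u64
    assert!(i64_to_u64(-1) < i64_to_u64(0));        // order is preserved across zero
    assert_eq!(u64_to_i64(i64_to_u64(-42)), -42);   // round-trips exactly
}
```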
@@ -282,24 +286,28 @@ impl IndexMerger {
         fast_field_serializer: &mut FastFieldSerializer,
     ) -> Result<()> {
         let mut total_num_vals = 0u64;
+        let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
 
         // In the first pass, we compute the total number of vals.
         //
         // This is required by the bitpacker, as it needs to know
         // what should be the bit length used for bitpacking.
         for reader in &self.readers {
-            let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
+            let u64s_reader = reader.fast_fields()
+                .u64s_lenient(field)
+                .expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
 
             if let Some(delete_bitset) = reader.delete_bitset() {
                 for doc in 0u32..reader.max_doc() {
-                    if !delete_bitset.is_deleted(doc) {
-                        let start = idx_reader.get(doc);
-                        let end = idx_reader.get(doc + 1);
-                        total_num_vals += end - start;
+                    if delete_bitset.is_alive(doc) {
+                        let num_vals = u64s_reader.num_vals(doc) as u64;
+                        total_num_vals += num_vals;
                     }
                 }
             } else {
-                total_num_vals += idx_reader.max_value();
+                total_num_vals += u64s_reader.total_num_vals();
             }
+            u64s_readers.push(u64s_reader);
         }
 
         // We can now create our `idx` serializer, and in a second pass,
@@ -307,13 +315,10 @@ impl IndexMerger {
         let mut serialize_idx =
             fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
         let mut idx = 0;
-        for reader in &self.readers {
-            let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
-            for doc in reader.doc_ids_alive() {
+        for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
+            for doc in segment_reader.doc_ids_alive() {
                 serialize_idx.add_val(idx)?;
-                let start = idx_reader.get(doc);
-                let end = idx_reader.get(doc + 1);
-                idx += end - start;
+                idx += u64s_reader.num_vals(doc) as u64;
             }
         }
         serialize_idx.add_val(idx)?;
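To make the two passes concrete: the `idx` column written above is a cumulative offset per live document, with one trailing sentinel, so that doc `d`'s values occupy `vals[idx[d]..idx[d + 1]]`. A tiny worked example under assumed value counts:

```rust
// Worked example of the offsets ("idx") column, assuming three live documents
// holding respectively 2, 3 and 1 values.
fn main() {
    let num_vals_per_doc = [2u64, 3, 1];
    let mut offsets = Vec::with_capacity(num_vals_per_doc.len() + 1);
    let mut idx = 0u64;
    for &num_vals in &num_vals_per_doc {
        offsets.push(idx); // plays the role of `serialize_idx.add_val(idx)?`
        idx += num_vals;
    }
    offsets.push(idx); // trailing sentinel, the final `serialize_idx.add_val(idx)?`
    assert_eq!(offsets, vec![0, 2, 5, 6]);
    // Doc 1's three values live at vals[2..5].
}
```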
@@ -344,8 +349,10 @@ impl IndexMerger {
         for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
             let term_ordinal_mapping: &[TermOrdinal] =
                 term_ordinal_mappings.get_segment(segment_ord);
-            let ff_reader: MultiValueIntFastFieldReader<u64> =
-                segment_reader.multi_fast_field_reader(field)?;
+            let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
+                .fast_fields()
+                .u64s(field)
+                .expect("Could not find multivalued u64 fast value reader.");
             // TODO optimize if no deletes
             for doc in segment_reader.doc_ids_alive() {
                 ff_reader.get_vals(doc, &mut vals);
@@ -377,6 +384,8 @@ impl IndexMerger {
 
         let mut vals = Vec::with_capacity(100);
 
+        let mut ff_readers = Vec::new();
+
         // Our values are bitpacked and we need to know what should be
         // our bitwidth and our minimum value before serializing any values.
         //
@@ -385,7 +394,10 @@ impl IndexMerger {
         // maximum value and initialize our Serializer.
         for reader in &self.readers {
             let ff_reader: MultiValueIntFastFieldReader<u64> =
-                reader.multi_fast_field_reader(field)?;
+                reader.fast_fields().u64s_lenient(field).expect(
+                    "Failed to find multivalued fast field reader. This is a bug in \
+                     tantivy. Please report.",
+                );
             for doc in reader.doc_ids_alive() {
                 ff_reader.get_vals(doc, &mut vals);
                 for &val in &vals {
@@ -393,6 +405,7 @@ impl IndexMerger {
                     max_value = cmp::max(val, max_value);
                 }
             }
+            ff_readers.push(ff_reader);
             // TODO optimize when no deletes
         }
 
@@ -405,9 +418,7 @@ impl IndexMerger {
         {
             let mut serialize_vals = fast_field_serializer
                 .new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
-            for reader in &self.readers {
-                let ff_reader: MultiValueIntFastFieldReader<u64> =
-                    reader.multi_fast_field_reader(field)?;
+            for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
                 // TODO optimize if no deletes
                 for doc in reader.doc_ids_alive() {
                     ff_reader.get_vals(doc, &mut vals);
@@ -426,19 +437,53 @@ impl IndexMerger {
         field: Field,
         fast_field_serializer: &mut FastFieldSerializer,
     ) -> Result<()> {
-        self.write_fast_field_idx(field, fast_field_serializer)?;
+        let mut total_num_vals = 0u64;
+        let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
+
+        for reader in &self.readers {
+            let bytes_reader = reader.fast_fields().bytes(field).expect(
+                "Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
+            );
+            if let Some(delete_bitset) = reader.delete_bitset() {
+                for doc in 0u32..reader.max_doc() {
+                    if delete_bitset.is_alive(doc) {
+                        let num_vals = bytes_reader.get_bytes(doc).len() as u64;
+                        total_num_vals += num_vals;
+                    }
+                }
+            } else {
+                total_num_vals += bytes_reader.total_num_bytes() as u64;
+            }
+            bytes_readers.push(bytes_reader);
+        }
+
+        {
+            // We can now create our `idx` serializer, and in a second pass,
+            // can effectively push the different indexes.
+            let mut serialize_idx =
+                fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
+            let mut idx = 0;
+            for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
+                for doc in segment_reader.doc_ids_alive() {
+                    serialize_idx.add_val(idx)?;
+                    idx += bytes_reader.get_bytes(doc).len() as u64;
+                }
+            }
+            serialize_idx.add_val(idx)?;
+            serialize_idx.close_field()?;
+        }
+
         let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
-        for reader in &self.readers {
-            let bytes_reader = reader.bytes_fast_field_reader(field)?;
+        for segment_reader in &self.readers {
+            let bytes_reader = segment_reader.fast_fields().bytes(field)
+                .expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
             // TODO: optimize if no deletes
-            for doc in reader.doc_ids_alive() {
-                let val = bytes_reader.get_val(doc);
+            for doc in segment_reader.doc_ids_alive() {
+                let val = bytes_reader.get_bytes(doc);
                 serialize_vals.write_all(val)?;
             }
         }
         serialize_vals.flush()?;
 
         Ok(())
     }
 
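The merged bytes fast field therefore ends up as two columns: the offsets column at idx 0 and a single concatenated byte buffer at idx 1. A sketch of how a reader can resolve `get_bytes(doc)` against that layout; the struct and field names here are assumptions for illustration only:

```rust
// Sketch of the two-column layout serialized above (names are hypothetical).
struct BytesColumn {
    offsets: Vec<u64>, // offsets.len() == num_docs + 1, ends with a sentinel
    data: Vec<u8>,     // every document's bytes, concatenated in doc order
}

impl BytesColumn {
    fn get_bytes(&self, doc: usize) -> &[u8] {
        let start = self.offsets[doc] as usize;
        let end = self.offsets[doc + 1] as usize;
        &self.data[start..end]
    }
}

fn main() {
    // Three docs storing b"foo", b"" and b"ba".
    let column = BytesColumn {
        offsets: vec![0, 3, 3, 5],
        data: b"fooba".to_vec(),
    };
    assert_eq!(column.get_bytes(0), &b"foo"[..]);
    assert_eq!(column.get_bytes(1), &b""[..]);
    assert_eq!(column.get_bytes(2), &b"ba"[..]);
}
```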
@@ -979,14 +1024,16 @@ mod tests {
 
         let score_field_reader = searcher
             .segment_reader(0)
-            .fast_field_reader::<u64>(score_field)
+            .fast_fields()
+            .u64(score_field)
             .unwrap();
         assert_eq!(score_field_reader.min_value(), 4000);
         assert_eq!(score_field_reader.max_value(), 7000);
 
         let score_field_reader = searcher
             .segment_reader(1)
-            .fast_field_reader::<u64>(score_field)
+            .fast_fields()
+            .u64(score_field)
             .unwrap();
         assert_eq!(score_field_reader.min_value(), 1);
         assert_eq!(score_field_reader.max_value(), 3);

@@ -1037,7 +1084,8 @@ mod tests {
         );
         let score_field_reader = searcher
             .segment_reader(0)
-            .fast_field_reader::<u64>(score_field)
+            .fast_fields()
+            .u64(score_field)
             .unwrap();
         assert_eq!(score_field_reader.min_value(), 3);
         assert_eq!(score_field_reader.max_value(), 7000);

@@ -1083,7 +1131,8 @@ mod tests {
         );
         let score_field_reader = searcher
             .segment_reader(0)
-            .fast_field_reader::<u64>(score_field)
+            .fast_fields()
+            .u64(score_field)
             .unwrap();
         assert_eq!(score_field_reader.min_value(), 3);
         assert_eq!(score_field_reader.max_value(), 7000);

@@ -1135,7 +1184,8 @@ mod tests {
         );
         let score_field_reader = searcher
             .segment_reader(0)
-            .fast_field_reader::<u64>(score_field)
+            .fast_fields()
+            .u64(score_field)
             .unwrap();
         assert_eq!(score_field_reader.min_value(), 6000);
         assert_eq!(score_field_reader.max_value(), 7000);
@@ -1381,7 +1431,7 @@ mod tests {
 
         {
             let segment = searcher.segment_reader(0u32);
-            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+            let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
 
             ff_reader.get_vals(0, &mut vals);
             assert_eq!(&vals, &[1, 2]);

@@ -1416,7 +1466,7 @@ mod tests {
 
         {
             let segment = searcher.segment_reader(1u32);
-            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+            let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
             ff_reader.get_vals(0, &mut vals);
             assert_eq!(&vals, &[28, 27]);
 
@@ -1426,7 +1476,7 @@ mod tests {
 
         {
             let segment = searcher.segment_reader(2u32);
-            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+            let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
             ff_reader.get_vals(0, &mut vals);
             assert_eq!(&vals, &[20]);
         }

@@ -1459,7 +1509,7 @@ mod tests {
                 .collect::<Vec<_>>()
         );
         let segment = searcher.segment_reader(0u32);
-        let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+        let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
 
         ff_reader.get_vals(0, &mut vals);
         assert_eq!(&vals, &[1, 2]);
@@ -6,20 +6,14 @@ pub struct PreparedCommit<'a> {
     index_writer: &'a mut IndexWriter,
     payload: Option<String>,
     opstamp: u64,
-    soft: bool,
 }
 
 impl<'a> PreparedCommit<'a> {
-    pub(crate) fn new(
-        index_writer: &'a mut IndexWriter,
-        opstamp: u64,
-        soft: bool,
-    ) -> PreparedCommit {
+    pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
         PreparedCommit {
             index_writer,
             payload: None,
             opstamp,
-            soft,
         }
     }
 
@@ -39,7 +33,7 @@ impl<'a> PreparedCommit<'a> {
         info!("committing {}", self.opstamp);
         self.index_writer
             .segment_updater()
-            .commit(self.opstamp, self.payload, self.soft)?;
+            .commit(self.opstamp, self.payload)?;
         Ok(self.opstamp)
     }
 }
@@ -22,7 +22,6 @@ pub struct SegmentEntry {
     meta: SegmentMeta,
     delete_bitset: Option<BitSet>,
     delete_cursor: DeleteCursor,
-    opstamp: u64,
 }
 
 impl SegmentEntry {
@@ -31,20 +30,14 @@ impl SegmentEntry {
         segment_meta: SegmentMeta,
         delete_cursor: DeleteCursor,
         delete_bitset: Option<BitSet>,
-        opstamp: u64,
     ) -> SegmentEntry {
         SegmentEntry {
             meta: segment_meta,
             delete_bitset,
             delete_cursor,
-            opstamp,
         }
     }
 
-    pub fn opstamp(&self) -> u64 {
-        self.opstamp
-    }
-
     /// Return a reference to the segment entry deleted bitset.
     ///
     /// `DocId` in this bitset are flagged as deleted.

@@ -53,8 +46,7 @@ impl SegmentEntry {
     }
 
     /// Set the `SegmentMeta` for this segment.
-    pub fn set_meta(&mut self, opstamp: u64, segment_meta: SegmentMeta) {
-        self.opstamp = opstamp;
+    pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
         self.meta = segment_meta;
     }
 
@@ -11,47 +11,11 @@ use std::path::PathBuf;
 use std::sync::RwLock;
 use std::sync::{RwLockReadGuard, RwLockWriteGuard};
 use Result as TantivyResult;
-use std::sync::Arc;
-use std::collections::HashMap;
-
-/// Provides a read-only view of the available segments.
-#[derive(Clone)]
-pub struct AvailableSegments {
-    registers: Arc<RwLock<SegmentRegisters>>,
-}
-
-impl AvailableSegments {
-    pub fn committed(&self) -> Vec<SegmentMeta> {
-        self.registers
-            .read()
-            .unwrap()
-            .committed
-            .segment_metas()
-    }
-
-    pub fn soft_committed(&self) -> Vec<SegmentMeta> {
-        self.registers
-            .read()
-            .unwrap()
-            .soft_committed
-            .segment_metas()
-    }
-}
-
 
+#[derive(Default)]
 struct SegmentRegisters {
-    uncommitted: HashMap<SegmentId, SegmentEntry>,
+    uncommitted: SegmentRegister,
     committed: SegmentRegister,
-    /// soft commits can advance committed segment to a future delete
-    /// opstamp.
-    ///
-    /// In that case the same `SegmentId` can appear in both `committed`
-    /// and in `committed_in_the_future`.
-    ///
-    /// We do not consider these segments for merges.
-    soft_committed: SegmentRegister,
-    /// `DeleteCursor`, positionned on the soft commit.
-    delete_cursor: DeleteCursor,
 }
 
 /// The segment manager stores the list of segments
@@ -59,8 +23,9 @@ struct SegmentRegisters {
 ///
 /// It guarantees the atomicity of the
 /// changes (merges especially)
+#[derive(Default)]
 pub struct SegmentManager {
-    registers: Arc<RwLock<SegmentRegisters>>
+    registers: RwLock<SegmentRegisters>,
 }
 
 impl Debug for SegmentManager {
@@ -81,17 +46,11 @@ pub fn get_mergeable_segments(
     let registers_lock = segment_manager.read();
     (
         registers_lock
-            .soft_committed
+            .committed
             .get_mergeable_segments(in_merge_segment_ids),
         registers_lock
             .uncommitted
-            .values()
-            .map(|segment_entry| segment_entry.meta())
-            .filter(|segment_meta| {
-                !in_merge_segment_ids.contains(&segment_meta.id())
-            })
-            .cloned()
-            .collect::<Vec<_>>()
+            .get_mergeable_segments(in_merge_segment_ids),
     )
 }
 
@@ -99,22 +58,21 @@ impl SegmentManager {
     pub fn from_segments(
         segment_metas: Vec<SegmentMeta>,
         delete_cursor: &DeleteCursor,
-        opstamp: u64,
     ) -> SegmentManager {
         SegmentManager {
-            registers: Arc::new(RwLock::new(SegmentRegisters {
-                uncommitted: HashMap::default(),
-                committed: SegmentRegister::new(segment_metas.clone(), opstamp),
-                soft_committed: SegmentRegister::new(segment_metas, opstamp),
-                delete_cursor: delete_cursor.clone(),
-            }))
+            registers: RwLock::new(SegmentRegisters {
+                uncommitted: SegmentRegister::default(),
+                committed: SegmentRegister::new(segment_metas, delete_cursor),
+            }),
         }
     }
 
-    pub fn available_segments_view(&self) -> AvailableSegments {
-        AvailableSegments {
-            registers: self.registers.clone()
-        }
+    /// Returns all of the segment entries (committed or uncommitted)
+    pub fn segment_entries(&self) -> Vec<SegmentEntry> {
+        let registers_lock = self.read();
+        let mut segment_entries = registers_lock.uncommitted.segment_entries();
+        segment_entries.extend(registers_lock.committed.segment_entries());
+        segment_entries
     }
 
     /// List the files that are useful to the index.
@@ -150,76 +108,44 @@ impl SegmentManager {
         let mut registers_lock = self.write();
         registers_lock
             .committed
-            .segment_metas()
+            .segment_entries()
             .iter()
-            .filter(|segment_meta| segment_meta.num_docs() == 0)
-            .for_each(|segment_meta| {
+            .filter(|segment| segment.meta().num_docs() == 0)
+            .for_each(|segment| {
                 registers_lock
                     .committed
-                    .remove_segment(&segment_meta.id())
-            });
-        registers_lock
-            .soft_committed
-            .segment_metas()
-            .iter()
-            .filter(|segment_meta| segment_meta.num_docs() == 0)
-            .for_each(|segment_meta| {
-                registers_lock
-                    .committed
-                    .remove_segment(&segment_meta.id())
+                    .remove_segment(&segment.segment_id())
             });
     }
 
-    /// Returns all of the segment entries (soft committed or uncommitted)
-    pub fn segment_entries(&self) -> Vec<SegmentEntry> {
-        let registers_lock = self.read();
-        let mut segment_entries: Vec<SegmentEntry> = registers_lock.uncommitted.values().cloned().collect();
-        segment_entries.extend(registers_lock.soft_committed.segment_entries(&registers_lock.delete_cursor).into_iter());
-        segment_entries
-    }
-
-
-    pub fn commit(&self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
+    pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
         let mut registers_lock = self.write();
+        registers_lock.committed.clear();
         registers_lock.uncommitted.clear();
-        registers_lock
-            .committed
-            .set_commit(opstamp, segment_entries.clone());
-        registers_lock
-            .soft_committed
-            .set_commit(opstamp, segment_entries);
-        registers_lock.delete_cursor.skip_to(opstamp);
-    }
-
-    pub fn soft_commit(&self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
-        let mut registers_lock = self.write();
-        registers_lock.uncommitted.clear();
-        registers_lock
-            .soft_committed
-            .set_commit(opstamp, segment_entries);
-        registers_lock.delete_cursor.skip_to(opstamp);
+        for segment_entry in segment_entries {
+            registers_lock.committed.add_segment_entry(segment_entry);
+        }
     }
 
-    /// Gets the list of segment_entries associated to a list of `segment_ids`.
-    /// This method is used when starting a merge operations.
+    /// Marks a list of segments as in merge.
     ///
     /// Returns an error if some segments are missing, or if
-    /// the `segment_ids` are not either all soft_committed or all
+    /// the `segment_ids` are not either all committed or all
     /// uncommitted.
     pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
         let registers_lock = self.read();
         let mut segment_entries = vec![];
-        if segment_ids.iter().all(|segment_id| registers_lock.uncommitted.contains_key(segment_id)) {
+        if registers_lock.uncommitted.contains_all(segment_ids) {
             for segment_id in segment_ids {
                 let segment_entry = registers_lock.uncommitted
                     .get(segment_id)
                     .expect("Segment id not found {}. Should never happen because of the contains all if-block.");
-                segment_entries.push(segment_entry.clone());
+                segment_entries.push(segment_entry);
             }
-        } else if registers_lock.soft_committed.contains_all(segment_ids) {
+        } else if registers_lock.committed.contains_all(segment_ids) {
             for segment_id in segment_ids {
-                let segment_entry = registers_lock.soft_committed
-                    .get(segment_id, &registers_lock.delete_cursor)
+                let segment_entry = registers_lock.committed
+                    .get(segment_id)
                     .expect("Segment id not found {}. Should never happen because of the contains all if-block.");
                 segment_entries.push(segment_entry);
             }
@@ -234,32 +160,35 @@ impl SegmentManager {
 
     pub fn add_segment(&self, segment_entry: SegmentEntry) {
         let mut registers_lock = self.write();
-        registers_lock
-            .uncommitted
-            .insert(segment_entry.segment_id(), segment_entry);
+        registers_lock.uncommitted.add_segment_entry(segment_entry);
     }
 
     pub fn end_merge(
         &self,
         before_merge_segment_ids: &[SegmentId],
-        after_merge_segment_entry: SegmentEntry
+        after_merge_segment_entry: SegmentEntry,
     ) {
         let mut registers_lock = self.write();
-        if before_merge_segment_ids.iter().all(|seg_id|
-            registers_lock
-                .uncommitted
-                .contains_key(seg_id))
-        {
-            for segment_id in before_merge_segment_ids {
-                registers_lock.uncommitted.remove(&segment_id);
-            }
-            registers_lock.uncommitted.insert(after_merge_segment_entry.segment_id(),
-                                              after_merge_segment_entry);
-        } else {
-            registers_lock.committed.receive_merge(&before_merge_segment_ids, &after_merge_segment_entry);
-            registers_lock.soft_committed.receive_merge(&before_merge_segment_ids, &after_merge_segment_entry)
-        }
+        let target_register: &mut SegmentRegister = {
+            if registers_lock
+                .uncommitted
+                .contains_all(before_merge_segment_ids)
+            {
+                &mut registers_lock.uncommitted
+            } else if registers_lock
+                .committed
+                .contains_all(before_merge_segment_ids)
+            {
+                &mut registers_lock.committed
+            } else {
+                warn!("couldn't find segment in SegmentManager");
+                return;
+            }
+        };
+        for segment_id in before_merge_segment_ids {
+            target_register.remove_segment(segment_id);
+        }
+        target_register.add_segment_entry(after_merge_segment_entry);
     }
 
     pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
@@ -16,8 +16,7 @@ use std::fmt::{self, Debug, Formatter};
 /// merge candidates.
 #[derive(Default)]
 pub struct SegmentRegister {
-    segment_states: HashMap<SegmentId, SegmentMeta>,
-    opstamp_constraint: u64,
+    segment_states: HashMap<SegmentId, SegmentEntry>,
 }
 
 impl Debug for SegmentRegister {
@@ -42,28 +41,23 @@ impl SegmentRegister {
     ) -> Vec<SegmentMeta> {
         self.segment_states
             .values()
-            .filter(|segment_meta| !in_merge_segment_ids.contains(&segment_meta.id()))
-            .cloned()
+            .filter(|segment_entry| !in_merge_segment_ids.contains(&segment_entry.segment_id()))
+            .map(|segment_entry| segment_entry.meta().clone())
             .collect()
     }
 
+    pub fn segment_entries(&self) -> Vec<SegmentEntry> {
+        self.segment_states.values().cloned().collect()
+    }
+
     pub fn segment_metas(&self) -> Vec<SegmentMeta> {
-        let mut segment_metas: Vec<SegmentMeta> = self
+        let mut segment_ids: Vec<SegmentMeta> = self
             .segment_states
             .values()
-            .cloned()
+            .map(|segment_entry| segment_entry.meta().clone())
             .collect();
-        segment_metas.sort_by_key(|meta| meta.id());
-        segment_metas
-    }
-
-    pub fn segment_entries(&self, delete_cursor: &DeleteCursor) -> Vec<SegmentEntry> {
-        self.segment_states
-            .values()
-            .map(|segment_meta| {
-                SegmentEntry::new(segment_meta.clone(), delete_cursor.clone(), None, self.opstamp_constraint)
-            })
-            .collect()
+        segment_ids.sort_by_key(SegmentMeta::id);
+        segment_ids
     }
 
     pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
@@ -72,77 +66,27 @@ impl SegmentRegister {
             .all(|segment_id| self.segment_states.contains_key(segment_id))
     }
 
-    pub fn receive_merge(&mut self,
-                         before_merge_segment_ids: &[SegmentId],
-                         after_merge_segment_entry: &SegmentEntry) {
-        if after_merge_segment_entry.opstamp() != self.opstamp_constraint {
-            return;
-        }
-        if !self.contains_all(before_merge_segment_ids) {
-            return;
-        }
-        for segment_id in before_merge_segment_ids {
-            self.segment_states.remove(segment_id);
-        }
-        self.register_segment_entry(after_merge_segment_entry.clone());
-    }
-
-    /// Registers a `SegmentEntry`.
-    ///
-    /// If a segment entry associated to this `SegmentId` is already there,
-    /// override it with the new `SegmentEntry`.
-    pub fn register_segment_entry(&mut self, segment_entry: SegmentEntry) {
-        if self.opstamp_constraint != segment_entry.opstamp() {
-            panic!(format!(
-                "Invalid segment. Expect opstamp {}, got {}.",
-                self.opstamp_constraint,
-                segment_entry.opstamp()
-            ));
-        }
-        if segment_entry.meta().num_docs() == 0 {
-            return;
-        }
+    pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
         let segment_id = segment_entry.segment_id();
-        // Check that we are ok with deletes.
-        self.segment_states.insert(segment_id, segment_entry.meta().clone());
-    }
-
-    pub fn set_commit(&mut self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
-        self.segment_states.clear();
-        self.opstamp_constraint = opstamp;
-        for segment_entry in segment_entries {
-            self.register_segment_entry(segment_entry);
-        }
+        self.segment_states.insert(segment_id, segment_entry);
     }
 
     pub fn remove_segment(&mut self, segment_id: &SegmentId) {
-        self.segment_states.remove(&segment_id);
+        self.segment_states.remove(segment_id);
     }
 
-    pub fn get(&self, segment_id: &SegmentId, delete_cursor: &DeleteCursor) -> Option<SegmentEntry> {
-        self.segment_states
-            .get(&segment_id)
-            .map(|segment_meta|
-                SegmentEntry::new(
-                    segment_meta.clone(),
-                    delete_cursor.clone(),
-                    None,
-                    self.opstamp_constraint
-                ))
+    pub fn get(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
+        self.segment_states.get(segment_id).cloned()
    }
 
-    pub fn new(
-        segment_metas: Vec<SegmentMeta>,
-        opstamp: u64,
-    ) -> SegmentRegister {
+    pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
         let mut segment_states = HashMap::new();
         for segment_meta in segment_metas {
-            segment_states.insert(segment_meta.id(), segment_meta);
-        }
-        SegmentRegister {
-            segment_states,
-            opstamp_constraint: opstamp,
+            let segment_id = segment_meta.id();
+            let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
+            segment_states.insert(segment_id, segment_entry);
         }
+        SegmentRegister { segment_states }
     }
 }
 
@@ -171,22 +115,22 @@ mod tests {
         let segment_id_merged = SegmentId::generate_random();
 
         {
-            let segment_meta = SegmentMeta::new(segment_id_a, 1u32);
-            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None, 0u64);
-            segment_register.register_segment_entry(segment_entry);
+            let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
+            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
+            segment_register.add_segment_entry(segment_entry);
         }
         assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
         {
-            let segment_meta = SegmentMeta::new(segment_id_b, 2u32);
-            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None, 0u64);
-            segment_register.register_segment_entry(segment_entry);
+            let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
+            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
+            segment_register.add_segment_entry(segment_entry);
         }
+        segment_register.remove_segment(&segment_id_a);
+        segment_register.remove_segment(&segment_id_b);
         {
-            let segment_meta_merged = SegmentMeta::new(segment_id_merged, 3u32);
-            let segment_entry =
-                SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None, 0u64);
-            segment_register.receive_merge(&[segment_id_a, segment_id_b], &segment_entry);
-            segment_register.register_segment_entry(segment_entry);
+            let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
+            let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
+            segment_register.add_segment_entry(segment_entry);
         }
         assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
     }
@@ -125,7 +125,7 @@ fn perform_merge(
 
     let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
 
-    let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None, target_opstamp);
+    let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
     Ok(after_merge_segment_entry)
 }
 
@@ -155,11 +155,8 @@ impl SegmentUpdater {
         stamper: Stamper,
         delete_cursor: &DeleteCursor,
     ) -> Result<SegmentUpdater> {
-
-        let index_meta = index.load_metas()?;
         let segments = index.searchable_segment_metas()?;
-        let opstamp = index_meta.opstamp;
-        let segment_manager = SegmentManager::from_segments(segments, delete_cursor, opstamp);
+        let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
         let pool = CpuPoolBuilder::new()
             .name_prefix("segment_updater")
             .pool_size(1)
@@ -283,30 +280,14 @@ impl SegmentUpdater {
             .garbage_collect(|| self.0.segment_manager.list_files());
     }
 
-    pub fn commit(&self, opstamp: u64, payload: Option<String>, soft: bool) -> Result<()> {
+    pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
         self.run_async(move |segment_updater| {
             if segment_updater.is_alive() {
                 let segment_entries = segment_updater
                     .purge_deletes(opstamp)
                     .expect("Failed purge deletes");
-                if soft {
-                    // Soft commit.
-                    //
-                    // The list `segment_entries` above is what we might want to use as searchable
-                    // segment. However, we do not want to mark them as committed, and we want
-                    // to keep the current set of committed segment.
-                    segment_updater.0.segment_manager.soft_commit(opstamp, segment_entries);
-                    // ... We do not save the meta file.
-                } else {
-                    // Hard_commit. We register the new segment entries as committed.
-                    segment_updater
-                        .0
-                        .segment_manager
-                        .commit(opstamp, segment_entries);
-                    // TODO error handling.
-                    segment_updater.save_metas(opstamp, payload);
-                    segment_updater.0.index.directory().flush().unwrap();
-                }
+                segment_updater.0.segment_manager.commit(segment_entries);
+                segment_updater.save_metas(opstamp, payload);
                 segment_updater.garbage_collect_files_exec();
                 segment_updater.consider_merge_options();
             }
@@ -5,6 +5,7 @@ use fastfield::FastFieldsWriter;
 use fieldnorm::FieldNormsWriter;
 use indexer::segment_serializer::SegmentSerializer;
 use postings::MultiFieldPostingsWriter;
+use schema::FieldEntry;
 use schema::FieldType;
 use schema::Schema;
 use schema::Term;

@@ -53,7 +54,7 @@ impl SegmentWriter {
         schema
             .fields()
             .iter()
-            .map(|field_entry| field_entry.field_type())
+            .map(FieldEntry::field_type)
             .map(|field_type| match *field_type {
                 FieldType::Str(ref text_options) => text_options
                     .get_indexing_options()
24
src/lib.rs
24
src/lib.rs
@@ -876,28 +876,28 @@ mod tests {
         let searcher = reader.searcher();
         let segment_reader: &SegmentReader = searcher.segment_reader(0);
         {
-            let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
-            assert!(fast_field_reader_res.is_err());
+            let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
+            assert!(fast_field_reader_opt.is_none());
         }
         {
-            let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
-            assert!(fast_field_reader_res.is_err());
+            let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
+            assert!(fast_field_reader_opt.is_none());
         }
         {
-            let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
-            assert!(fast_field_reader_res.is_err());
+            let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
+            assert!(fast_field_reader_opt.is_none());
         }
         {
-            let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
-            assert!(fast_field_reader_res.is_ok());
-            let fast_field_reader = fast_field_reader_res.unwrap();
+            let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
+            assert!(fast_field_reader_opt.is_some());
+            let fast_field_reader = fast_field_reader_opt.unwrap();
             assert_eq!(fast_field_reader.get(0), 4i64)
         }
 
         {
-            let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
-            assert!(fast_field_reader_res.is_ok());
-            let fast_field_reader = fast_field_reader_res.unwrap();
+            let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
+            assert!(fast_field_reader_opt.is_some());
+            let fast_field_reader = fast_field_reader_opt.unwrap();
             assert_eq!(fast_field_reader.get(0), 4i64)
         }
     }
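The hunk above captures the new fast-field access pattern: readers are obtained from `segment_reader.fast_fields()` and the typed accessors return `Option` instead of the old `Result`. A minimal sketch, assuming an index with a fast i64 field (the `rating` field is illustrative):

```rust
use tantivy::schema::{Schema, FAST};
use tantivy::{Document, Index};

fn main() {
    let mut schema_builder = Schema::builder();
    let rating = schema_builder.add_i64_field("rating", FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    let mut document = Document::default();
    document.add_i64(rating, 4);
    writer.add_document(document);
    writer.commit().unwrap();

    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    let segment_reader = searcher.segment_reader(0);
    // `u64(...)` / `i64(...)` return `None` when the field is not a fast
    // field of that type, instead of an `Err`.
    let rating_reader = segment_reader.fast_fields().i64(rating).unwrap();
    assert_eq!(rating_reader.get(0), 4i64);
}
```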
@@ -1,3 +1,5 @@
+use postings::compression::AlignedBuffer;
+
 /// This modules define the logic used to search for a doc in a given
 /// block. (at most 128 docs)
 ///
@@ -6,7 +8,7 @@
 
 #[cfg(target_arch = "x86_64")]
 mod sse2 {
-    use postings::compression::COMPRESSION_BLOCK_SIZE;
+    use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
     use std::arch::x86_64::__m128i as DataType;
     use std::arch::x86_64::_mm_add_epi32 as op_add;
     use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
@@ -23,9 +25,9 @@ mod sse2 {
     ///
     /// There is no early exit here. We simply count the
     /// number of elements that are `< target`.
-    pub fn linear_search_sse2_128(arr: &[u32], target: u32) -> usize {
+    pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
         unsafe {
-            let ptr = arr.as_ptr() as *const DataType;
+            let ptr = arr as *const AlignedBuffer as *const DataType;
             let vkey = set1(target as i32);
             let mut cnt = set0();
             // We work over 4 `__m128i` at a time.
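For readers less familiar with the SSE2 trick: the function counts, without branching, how many of the 128 block values are strictly less than `target`; that count is exactly the index of the first element `>= target`. A scalar sketch of the same contract (illustrative only, mirroring the test data below):

```rust
const COMPRESSION_BLOCK_SIZE: usize = 128;

// Scalar equivalent of `linear_search_sse2_128`: count the values < target.
fn linear_search_scalar_128(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn main() {
    let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
    for el in 0u32..128u32 {
        // Same strictly increasing sequence as the test in the next hunk.
        block[el as usize] = (el * 2 + 1) << 18;
    }
    let target = block[64] + 1;
    // Elements 0..=64 are < target, so the answer is 65.
    assert_eq!(linear_search_scalar_128(&block, target), 65);
}
```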
@@ -47,14 +49,16 @@ mod sse2 {
     #[cfg(test)]
     mod test {
         use super::linear_search_sse2_128;
+        use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
 
         #[test]
        fn test_linear_search_sse2_128_u32() {
-            for i in 0..23 {
-                dbg!(i);
-                let arr: Vec<u32> = (0..128).map(|el| el * 2 + 1 << 18).collect();
-                assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
+            let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
+            for el in 0u32..128u32 {
+                block[el as usize] = el * 2 + 1 << 18;
             }
+            let target = block[64] + 1;
+            assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
         }
     }
 }
@@ -127,17 +131,21 @@ impl BlockSearcher {
     /// then we use a different implementation that does an exhaustive linear search over
     /// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
     /// of branch.
-    pub fn search_in_block(&self, block_docs: &[u32], start: usize, target: u32) -> usize {
+    pub(crate) fn search_in_block(
+        self,
+        block_docs: &AlignedBuffer,
+        len: usize,
+        start: usize,
+        target: u32,
+    ) -> usize {
         #[cfg(target_arch = "x86_64")]
         {
             use postings::compression::COMPRESSION_BLOCK_SIZE;
-            if *self == BlockSearcher::SSE2 {
-                if block_docs.len() == COMPRESSION_BLOCK_SIZE {
-                    return sse2::linear_search_sse2_128(block_docs, target);
-                }
+            if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
+                return sse2::linear_search_sse2_128(block_docs, target);
             }
         }
-        start + galloping(&block_docs[start..], target)
+        start + galloping(&block_docs.0[start..len], target)
     }
 }
 
@@ -158,6 +166,7 @@ mod tests {
     use super::exponential_search;
     use super::linear_search;
     use super::BlockSearcher;
+    use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
 
     #[test]
     fn test_linear_search() {
@@ -186,8 +195,19 @@ mod tests {
 
     fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
         let cursor = search_in_block_trivial_but_slow(block, target);
+        assert!(block.len() < COMPRESSION_BLOCK_SIZE);
+        let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
+        output_buffer[..block.len()].copy_from_slice(block);
         for i in 0..cursor {
-            assert_eq!(block_searcher.search_in_block(block, i, target), cursor);
+            assert_eq!(
+                block_searcher.search_in_block(
+                    &AlignedBuffer(output_buffer),
+                    block.len(),
+                    i,
+                    target
+                ),
+                cursor
+            );
         }
     }
@@ -46,11 +46,11 @@ impl BlockEncoder {
 /// We ensure that the OutputBuffer is align on 128 bits
 /// in order to run SSE2 linear search on it.
 #[repr(align(128))]
-struct OutputBuffer([u32; COMPRESSION_BLOCK_SIZE + 1]);
+pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
 
 pub struct BlockDecoder {
     bitpacker: BitPacker4x,
-    output: OutputBuffer,
+    output: AlignedBuffer,
     pub output_len: usize,
 }
 
@@ -60,11 +60,9 @@ impl BlockDecoder {
     }
 
     pub fn with_val(val: u32) -> BlockDecoder {
-        let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
-        output[COMPRESSION_BLOCK_SIZE] = 0u32;
         BlockDecoder {
             bitpacker: BitPacker4x::new(),
-            output: OutputBuffer(output),
+            output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
             output_len: 0,
         }
     }
@@ -91,6 +89,11 @@ impl BlockDecoder {
         &self.output.0[..self.output_len]
     }
 
+    #[inline]
+    pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
+        (&self.output, self.output_len)
+    }
+
     #[inline]
     pub fn output(&self, idx: usize) -> u32 {
         self.output.0[idx]
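`#[repr(align(128))]` is what lets the SSE2 search cast the buffer straight to `__m128i` without alignment checks (the attribute in fact guarantees 128-byte alignment, which is stronger than the 128 bits the comment asks for). A standalone sketch of the guarantee, with names mirroring the hunk but not tied to the crate:

```rust
const COMPRESSION_BLOCK_SIZE: usize = 128;

// 128-byte alignment: any pointer to this struct is a valid, aligned
// pointer for 16-byte (and wider) SIMD loads.
#[repr(align(128))]
struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);

fn main() {
    let buffer = AlignedBuffer([0u32; COMPRESSION_BLOCK_SIZE]);
    let addr = &buffer as *const AlignedBuffer as usize;
    assert_eq!(addr % 128, 0);
}
```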
@@ -55,13 +55,15 @@ pub mod tests {
     use fieldnorm::FieldNormReader;
     use indexer::operation::AddOperation;
     use indexer::SegmentWriter;
+    use merge_policy::NoMergePolicy;
     use query::Scorer;
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
-    use schema::Field;
-    use schema::IndexRecordOption;
     use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
+    use schema::{Field, TextOptions};
+    use schema::{IndexRecordOption, TextFieldIndexing};
     use std::iter;
+    use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
     use DocId;
     use Score;
 
@@ -160,6 +162,52 @@ pub mod tests {
         }
     }
 
+    #[test]
+    pub fn test_drop_token_that_are_too_long() {
+        let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
+        let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
+        exceeding_token_text.push_str(" hello");
+        let mut schema_builder = Schema::builder();
+        let text_options = TextOptions::default().set_indexing_options(
+            TextFieldIndexing::default()
+                .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+                .set_tokenizer("simple_no_truncation"),
+        );
+        let text_field = schema_builder.add_text_field("text", text_options);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema.clone());
+        index
+            .tokenizers()
+            .register("simple_no_truncation", SimpleTokenizer);
+        let reader = index.reader().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        index_writer.set_merge_policy(Box::new(NoMergePolicy));
+        {
+            index_writer.add_document(doc!(text_field=>exceeding_token_text));
+            index_writer.commit().unwrap();
+            reader.reload().unwrap();
+            let searcher = reader.searcher();
+            let segment_reader = searcher.segment_reader(0u32);
+            let inverted_index = segment_reader.inverted_index(text_field);
+            assert_eq!(inverted_index.terms().num_terms(), 1);
+            let mut bytes = vec![];
+            assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+            assert_eq!(&bytes, b"hello");
+        }
+        {
+            index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
+            index_writer.commit().unwrap();
+            reader.reload().unwrap();
+            let searcher = reader.searcher();
+            let segment_reader = searcher.segment_reader(1u32);
+            let inverted_index = segment_reader.inverted_index(text_field);
+            assert_eq!(inverted_index.terms().num_terms(), 1);
+            let mut bytes = vec![];
+            assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+            assert_eq!(&bytes[..], ok_token_text.as_bytes());
+        }
+    }
+
     #[test]
     pub fn test_position_and_fieldnorm1() {
         let mut positions = Vec::new();
@@ -12,8 +12,8 @@ use std::io;
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use termdict::TermOrdinal;
-use tokenizer::Token;
 use tokenizer::TokenStream;
+use tokenizer::{Token, MAX_TOKEN_LEN};
 use DocId;
 use Result;
 
@@ -210,8 +210,18 @@ pub trait PostingsWriter {
     ) -> u32 {
         let mut term = Term::for_field(field);
         let mut sink = |token: &Token| {
-            term.set_text(token.text.as_str());
-            self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            // We skip all tokens with a len greater than u16.
+            if token.text.len() <= MAX_TOKEN_LEN {
+                term.set_text(token.text.as_str());
+                self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            } else {
+                info!(
+                    "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
+                     MAX_TOKEN_LEN in the documentation for more information.",
+                    token.text.len(),
+                    MAX_TOKEN_LEN
+                );
+            }
         };
         token_stream.process(&mut sink)
     }
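The practical consequence of this hunk: an over-long token no longer derails indexing, it is logged and skipped while the rest of the document is indexed normally. A hedged sketch of the behavior (`MAX_TOKEN_LEN` mirrors the constant added to src/tokenizer/mod.rs later in this diff; `keep_token` is a stand-in for the indexing sink):

```rust
const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;

// Stand-in for the indexing sink: keep tokens that fit, drop the rest.
fn keep_token(token_text: &str) -> bool {
    token_text.len() <= MAX_TOKEN_LEN
}

fn main() {
    let ok_token: String = std::iter::repeat('A').take(MAX_TOKEN_LEN).collect();
    let exceeding_token: String = std::iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
    assert!(keep_token(&ok_token));
    // Silently discarded (with an info!-level log line) instead of panicking.
    assert!(!keep_token(&exceeding_token));
}
```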
@@ -4,7 +4,7 @@ use common::{BinarySerializable, VInt};
 use docset::{DocSet, SkipResult};
 use owned_read::OwnedRead;
 use positions::PositionReader;
-use postings::compression::compressed_block_size;
+use postings::compression::{compressed_block_size, AlignedBuffer};
 use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
 use postings::serializer::PostingsSerializer;
 use postings::BlockSearcher;
@@ -130,9 +130,11 @@ impl DocSet for SegmentPostings {
     // next needs to be called a first time to point to the correct element.
     #[inline]
     fn advance(&mut self) -> bool {
-        if self.position_computer.is_some() {
+        if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
             let term_freq = self.term_freq() as usize;
-            self.position_computer.as_mut().unwrap().add_skip(term_freq);
+            if let Some(position_computer) = self.position_computer.as_mut() {
+                position_computer.add_skip(term_freq);
+            }
         }
         self.cur += 1;
         if self.cur >= self.block_cursor.block_len() {
@@ -167,7 +169,6 @@ impl DocSet for SegmentPostings {
 
         // skip blocks until one that might contain the target
        // check if we need to go to the next block
-        let need_positions = self.position_computer.is_some();
         let mut sum_freqs_skipped: u32 = 0;
         if !self
             .block_cursor
@@ -181,7 +182,7 @@ impl DocSet for SegmentPostings {
             // we are not in the right block.
             //
             // First compute all of the freqs skipped from the current block.
-            if need_positions {
+            if self.position_computer.is_some() {
                 sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
                 match self.block_cursor.skip_to(target) {
                     BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
@@ -200,24 +201,21 @@ impl DocSet for SegmentPostings {
             self.cur = 0;
         }
 
+        let cur = self.cur;
+
         // we're in the right block now, start with an exponential search
-        let block_docs = self.block_cursor.docs();
+        let (output, len) = self.block_cursor.docs_aligned();
         let new_cur = self
             .block_searcher
-            .search_in_block(&block_docs, self.cur, target);
-        if need_positions {
-            sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
-                .iter()
-                .sum::<u32>();
-            self.position_computer
-                .as_mut()
-                .unwrap()
-                .add_skip(sum_freqs_skipped as usize);
+            .search_in_block(&output, len, cur, target);
+        if let Some(position_computer) = self.position_computer.as_mut() {
+            sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
+            position_computer.add_skip(sum_freqs_skipped as usize);
         }
         self.cur = new_cur;
 
         // `doc` is now the first element >= `target`
-        let doc = block_docs[new_cur];
+        let doc = output.0[new_cur];
         debug_assert!(doc >= target);
         if doc == target {
             SkipResult::Reached
@@ -227,12 +225,16 @@ impl DocSet for SegmentPostings {
     }
 
     /// Return the current document's `DocId`.
+    ///
+    /// # Panics
+    ///
+    /// Will panics if called without having called advance before.
     #[inline]
     fn doc(&self) -> DocId {
         let docs = self.block_cursor.docs();
         debug_assert!(
             self.cur < docs.len(),
-            "Have you forgotten to call `.advance()` at least once before calling .doc()."
+            "Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
         );
         docs[self.cur]
     }
@@ -264,17 +266,33 @@ impl HasLen for SegmentPostings {
 }
 
 impl Postings for SegmentPostings {
+    /// Returns the frequency associated to the current document.
+    /// If the schema is set up so that no frequency have been encoded,
+    /// this method should always return 1.
+    ///
+    /// # Panics
+    ///
+    /// Will panics if called without having called advance before.
     fn term_freq(&self) -> u32 {
+        debug_assert!(
+            // Here we do not use the len of `freqs()`
+            // because it is actually ok to request for the freq of doc
+            // even if no frequency were encoded for the field.
+            //
+            // In that case we hit the block just as if the frequency had been
+            // decoded. The block is simply prefilled by the value 1.
+            self.cur < COMPRESSION_BLOCK_SIZE,
+            "Have you forgotten to call `.advance()` at least once before calling \
+             `.term_freq()`."
+        );
         self.block_cursor.freq(self.cur)
     }
 
     fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
-        if self.position_computer.is_some() {
-            output.resize(self.term_freq() as usize, 0u32);
-            self.position_computer
-                .as_mut()
-                .unwrap()
-                .positions_with_offset(offset, &mut output[..])
+        let term_freq = self.term_freq() as usize;
+        if let Some(position_comp) = self.position_computer.as_mut() {
+            output.resize(term_freq, 0u32);
+            position_comp.positions_with_offset(offset, &mut output[..]);
         } else {
             output.clear();
         }
@@ -396,6 +414,10 @@ impl BlockSegmentPostings {
         self.doc_decoder.output_array()
     }
 
+    pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
+        self.doc_decoder.output_aligned()
+    }
+
     /// Return the document at index `idx` of the block.
     #[inline]
     pub fn doc(&self, idx: usize) -> u32 {
@@ -592,6 +614,7 @@ mod tests {
     use common::HasLen;
     use core::Index;
     use docset::DocSet;
+    use postings::postings::Postings;
     use schema::IndexRecordOption;
     use schema::Schema;
     use schema::Term;
@@ -608,6 +631,18 @@ mod tests {
         assert_eq!(postings.len(), 0);
     }
 
+    #[test]
+    #[should_panic(expected = "Have you forgotten to call `.advance()`")]
+    fn test_panic_if_doc_called_before_advance() {
+        SegmentPostings::empty().doc();
+    }
+
+    #[test]
+    #[should_panic(expected = "Have you forgotten to call `.advance()`")]
+    fn test_panic_if_freq_called_before_advance() {
+        SegmentPostings::empty().term_freq();
+    }
+
     #[test]
     fn test_empty_block_segment_postings() {
         let mut postings = BlockSegmentPostings::empty();
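The new assertions formalize the cursor contract: a freshly created postings list points before its first document, so `.advance()` must be called (and return true) before `.doc()` or `.term_freq()` may be read. A self-contained sketch of the intended calling pattern; the traits and `VecPostings` here are illustrative stand-ins, not the crate's types:

```rust
// Minimal stand-ins for the DocSet/Postings contract (illustrative only).
trait DocSet {
    fn advance(&mut self) -> bool;
    fn doc(&self) -> u32;
}
trait Postings: DocSet {
    fn term_freq(&self) -> u32;
}

// Toy postings over (doc, freq) pairs; the cursor starts *before* the first entry.
struct VecPostings {
    entries: Vec<(u32, u32)>,
    cursor: Option<usize>,
}
impl DocSet for VecPostings {
    fn advance(&mut self) -> bool {
        let next = self.cursor.map(|c| c + 1).unwrap_or(0);
        self.cursor = Some(next);
        next < self.entries.len()
    }
    fn doc(&self) -> u32 {
        // Panics (like the new debug_assert) if advance() was never called.
        self.entries[self.cursor.expect("call .advance() first")].0
    }
}
impl Postings for VecPostings {
    fn term_freq(&self) -> u32 {
        self.entries[self.cursor.expect("call .advance() first")].1
    }
}

fn main() {
    let mut postings = VecPostings {
        entries: vec![(2, 1), (5, 3)],
        cursor: None,
    };
    let mut out = Vec::new();
    // Correct pattern: advance once before reading the cursor.
    while postings.advance() {
        out.push((postings.doc(), postings.term_freq()));
    }
    assert_eq!(out, vec![(2, 1), (5, 3)]);
}
```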
@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
 use DocId;
 use Result;
 
-/// `PostingsSerializer` is in charge of serializing
+/// `InvertedIndexSerializer` is in charge of serializing
 /// postings on disk, in the
 /// * `.idx` (inverted index)
 /// * `.pos` (positions file)
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
 }
 
 impl InvertedIndexSerializer {
-    /// Open a new `PostingsSerializer` for the given segment
+    /// Open a new `InvertedIndexSerializer` for the given segment
     fn create(
         terms_write: CompositeWrite<WritePtr>,
         postings_write: CompositeWrite<WritePtr>,
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
         let positions_idx = self
             .positions_serializer_opt
             .as_ref()
-            .map(|positions_serializer| positions_serializer.positions_idx())
+            .map(PositionSerializer::positions_idx)
             .unwrap_or(0u64);
         TermInfo {
             doc_freq: 0,
@@ -214,6 +214,102 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOtherDocSet> {
     }
 }
 
+// `ahead` is assumed to be initialized (ahead.advance() has been called at least once,
+// and this returned true).
+//
+// If behind is either uninitialized or `ahead.doc() > behind.doc()`.
+fn next_in_intersection<'a, TScorer: Scorer>(
+    ahead: &'a mut TScorer,
+    behind: &'a mut TScorer,
+) -> Option<DocId> {
+    let candidate = ahead.doc();
+    match behind.skip_next(candidate) {
+        SkipResult::Reached => Some(candidate),
+        SkipResult::OverStep => {
+            // yeah for tail-recursion
+            next_in_intersection(behind, ahead)
+        }
+        SkipResult::End => None,
+    }
+}
+
+enum SkipResultComplex {
+    Reached,
+    Overstep { other_ord: usize, candidate: DocId },
+    End,
+}
+
+fn skip_several_scorers<TDocSet: DocSet>(
+    others: &mut [TDocSet],
+    except_candidate_ord: usize,
+    target: DocId,
+) -> SkipResultComplex {
+    for (ord, docset) in others.iter_mut().enumerate() {
+        // `candidate_ord` is already at the
+        // right position.
+        //
+        // Calling `skip_next` would advance this docset
+        // and miss it.
+        if ord == except_candidate_ord {
+            continue;
+        }
+        match docset.skip_next(target) {
+            SkipResult::Reached => {}
+            SkipResult::OverStep => {
+                return SkipResultComplex::Overstep {
+                    other_ord: ord,
+                    candidate: docset.doc(),
+                };
+            }
+            SkipResult::End => {
+                return SkipResultComplex::End;
+            }
+        }
+    }
+    SkipResultComplex::Reached
+}
+
+fn for_each<'a, TScorer: Scorer, TOtherscorer: Scorer>(
+    left: &'a mut TScorer,
+    right: &'a mut TScorer,
+    others: &'a mut [TOtherscorer],
+    callback: &mut FnMut(DocId, Score),
+) {
+    let mut other_candidate_ord: usize = usize::max_value();
+    if !left.advance() {
+        return;
+    }
+    while let Some(candidate) = next_in_intersection(left, right) {
+        // test the remaining scorers
+        match skip_several_scorers(others, other_candidate_ord, candidate) {
+            SkipResultComplex::Reached => {
+                let intersection_score: Score = left.score()
+                    + right.score()
+                    + others.iter_mut().map(|other| other.score()).sum::<Score>();
+                callback(candidate, intersection_score);
+                if !left.advance() {
+                    return;
+                }
+            }
+            SkipResultComplex::Overstep {
+                other_ord,
+                candidate,
+            } => match left.skip_next(candidate) {
+                SkipResult::End => {
+                    return;
+                }
+                SkipResult::Reached => {
+                    other_candidate_ord = other_ord;
+                }
+                SkipResult::OverStep => other_candidate_ord = usize::max_value(),
+            },
+            SkipResultComplex::End => {
+                return;
+            }
+        }
+    }
+}
+
 impl<TScorer, TOtherScorer> Scorer for Intersection<TScorer, TOtherScorer>
 where
     TScorer: Scorer,
@@ -224,6 +320,10 @@ where
             + self.right.score()
             + self.others.iter_mut().map(Scorer::score).sum::<Score>()
     }
+
+    fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) {
+        for_each(&mut self.left, &mut self.right, &mut self.others, callback);
+    }
 }
 
 #[cfg(test)]
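The helper added above is a leapfrog intersection: whichever scorer is ahead proposes a candidate, the other `skip_next`s to it, and the roles swap whenever the follower oversteps. A self-contained sketch of the same idea over plain sorted slices (illustrative, not the crate's code):

```rust
// Leapfrog intersection of two sorted, deduplicated doc-id lists.
fn intersect(mut a: &[u32], mut b: &[u32]) -> Vec<u32> {
    let mut out = Vec::new();
    while let (Some(&x), Some(&y)) = (a.first(), b.first()) {
        if x == y {
            out.push(x);
            a = &a[1..];
            b = &b[1..];
        } else if x < y {
            // "skip_next": advance `a` until it reaches or oversteps y.
            let skip = a.iter().take_while(|&&v| v < y).count();
            a = &a[skip..];
        } else {
            // Roles swapped: now `b` chases `a`'s candidate.
            let skip = b.iter().take_while(|&&v| v < x).count();
            b = &b[skip..];
        }
    }
    out
}

fn main() {
    assert_eq!(intersect(&[1, 3, 5, 9], &[3, 4, 5, 10]), vec![3, 5]);
}
```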
@@ -4,6 +4,7 @@ use error::TantivyError;
 use query::bm25::BM25Weight;
 use query::Query;
 use query::Weight;
+use schema::IndexRecordOption;
 use schema::{Field, Term};
 use std::collections::BTreeSet;
 use Result;
@@ -83,7 +84,7 @@ impl Query for PhraseQuery {
         let has_positions = field_entry
             .field_type()
             .get_index_record_option()
-            .map(|index_record_option| index_record_option.has_positions())
+            .map(IndexRecordOption::has_positions)
             .unwrap_or(false);
         if !has_positions {
             let field_name = field_entry.name();
@@ -1,6 +1,7 @@
 #![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))]
 #![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))]
 
+use super::query_grammar;
 use super::user_input_ast::*;
 use combine::char::*;
 use combine::error::StreamError;
@@ -22,7 +23,7 @@ parser! {
 parser! {
     fn word[I]()(I) -> String
     where [I: Stream<Item = char>] {
-        many1(satisfy(|c: char| c.is_alphanumeric()))
+        many1(satisfy(char::is_alphanumeric))
             .and_then(|s: String| {
                 match s.as_str() {
                     "OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
@@ -62,7 +63,7 @@ parser! {
     fn negative_number[I]()(I) -> String
     where [I: Stream<Item = char>]
     {
-        (char('-'), many1(satisfy(|c: char| c.is_numeric())))
+        (char('-'), many1(satisfy(char::is_numeric)))
             .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
     }
 }
@@ -184,7 +185,7 @@ parser! {
             }
         )
     )
-    .map(|el| el.into_dnf())
+    .map(query_grammar::Element::into_dnf)
    .map(|fnd| {
         if fnd.len() == 1 {
             UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
@@ -16,6 +16,9 @@ pub trait Scorer: downcast_rs::Downcast + DocSet + 'static {
 
     /// Iterates through all of the document matched by the DocSet
     /// `DocSet` and push the scored documents to the collector.
+    ///
+    /// This method assumes that the Scorer is brand new, and `.advance()`
+    /// and `.skip()` haven't been called yet.
     fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) {
         while self.advance() {
             callback(self.doc(), self.score());
@@ -98,4 +98,20 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_term_query_count_when_there_are_deletes() {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_with_num_threads(1, 5_000_000).unwrap();
+        index_writer.add_document(doc!(text_field=>"a b"));
+        index_writer.add_document(doc!(text_field=>"a c"));
+        index_writer.delete_term(Term::from_field_text(text_field, "b"));
+        index_writer.commit().unwrap();
+        let term_a = Term::from_field_text(text_field, "a");
+        let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
+        let reader = index.reader().unwrap();
+        assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
+    }
 }
@@ -39,15 +39,15 @@ impl Weight for TermWeight {
     }
 
     fn count(&self, reader: &SegmentReader) -> Result<u32> {
-        if reader.num_deleted_docs() == 0 {
+        if let Some(delete_bitset) = reader.delete_bitset() {
+            Ok(self.scorer(reader)?.count(delete_bitset))
+        } else {
             let field = self.term.field();
             Ok(reader
                 .inverted_index(field)
                 .get_term_info(&self.term)
                 .map(|term_info| term_info.doc_freq)
                 .unwrap_or(0))
-        } else {
-            Ok(self.scorer(reader)?.count())
         }
     }
 }
@@ -96,7 +96,7 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
 
 impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
     fn refill(&mut self) -> bool {
-        if let Some(min_doc) = self.docsets.iter_mut().map(|docset| docset.doc()).min() {
+        if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
             self.offset = min_doc;
             self.cursor = 0;
             refill(
@@ -145,7 +145,7 @@ where
         }
     }
 
-    fn count(&mut self) -> u32 {
+    fn count_including_deleted(&mut self) -> u32 {
         let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS]
             .iter()
             .map(|bitset| bitset.len())
@@ -163,6 +163,8 @@ where
         count
     }
 
+    // TODO implement `count` efficiently.
+
     fn skip_next(&mut self, target: DocId) -> SkipResult {
         if !self.advance() {
             return SkipResult::End;
@@ -258,6 +260,23 @@ where
     fn score(&mut self) -> Score {
         self.score
     }
+
+    fn for_each(&mut self, callback: &mut FnMut(DocId, Score)) {
+        // TODO how do we deal with the fact that people may have called .advance() before.
+        while self.refill() {
+            let offset = self.offset;
+            for cursor in 0..HORIZON_NUM_TINYBITSETS {
+                while let Some(val) = self.bitsets[cursor].pop_lowest() {
+                    let delta = val + (cursor as u32) * 64;
+                    let doc = offset + delta;
+                    let score_combiner = &mut self.scores[delta as usize];
+                    let score = score_combiner.score();
+                    callback(doc, score);
+                    score_combiner.clear();
+                }
+            }
+        }
+    }
 }
 
 #[cfg(test)]
@@ -300,7 +319,7 @@ mod tests {
             count += 1;
         }
         assert!(!union_expected.advance());
-        assert_eq!(count, make_union().count());
+        assert_eq!(count, make_union().count_including_deleted());
     }
 
     #[test]
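The union scorer buffers matches in a "horizon" of tiny 64-bit bitsets; a set bit `val` of bitset `cursor` decodes to `doc = offset + cursor * 64 + val`, which is exactly what the new `for_each` walks. A toy sketch of that decoding (the horizon size here is illustrative; the real constant is not shown in this diff):

```rust
const HORIZON_NUM_TINYBITSETS: usize = 32; // illustrative: a 32 * 64 = 2048-doc window

// Decode every set bit of the horizon back into a doc id, in order.
fn decode_horizon(offset: u32, bitsets: &[u64; HORIZON_NUM_TINYBITSETS]) -> Vec<u32> {
    let mut docs = Vec::new();
    for (cursor, &word) in bitsets.iter().enumerate() {
        let mut bits = word;
        while bits != 0 {
            let val = bits.trailing_zeros(); // the "pop_lowest()" of a TinySet
            bits &= bits - 1; // clear the lowest set bit
            docs.push(offset + (cursor as u32) * 64 + val);
        }
    }
    docs
}

fn main() {
    let mut bitsets = [0u64; HORIZON_NUM_TINYBITSETS];
    bitsets[0] |= 1 << 3; // doc offset + 3
    bitsets[1] |= 1 << 0; // doc offset + 64
    assert_eq!(decode_horizon(1000, &bitsets), vec![1003, 1064]);
}
```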
@@ -13,6 +13,11 @@ pub trait Weight: Send + Sync + 'static {
 
     /// Returns the number documents within the given `SegmentReader`.
     fn count(&self, reader: &SegmentReader) -> Result<u32> {
-        Ok(self.scorer(reader)?.count())
+        let mut scorer = self.scorer(reader)?;
+        if let Some(delete_bitset) = reader.delete_bitset() {
+            Ok(scorer.count(delete_bitset))
+        } else {
+            Ok(scorer.count_including_deleted())
+        }
     }
 }
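The bugfix pattern shared by both `count` implementations above: when a segment carries deletes, counting must filter every matched doc through the delete bitset instead of trusting the raw postings statistics. A reduced sketch of the two branches (`alive` stands in for the delete bitset):

```rust
// `alive` is Some(..) only when the segment has deleted documents.
fn count_matches(matched_docs: &[u32], alive: Option<&dyn Fn(u32) -> bool>) -> u32 {
    match alive {
        // Segment with deletes: check each candidate against the bitset.
        Some(is_alive) => matched_docs.iter().filter(|&&doc| is_alive(doc)).count() as u32,
        // No deletes: every match counts.
        None => matched_docs.len() as u32,
    }
}

fn main() {
    let matches = [0u32, 1, 2];
    let deleted_doc = 1u32;
    let is_alive = |doc: u32| doc != deleted_doc;
    assert_eq!(count_matches(&matches, Some(&is_alive)), 2);
    assert_eq!(count_matches(&matches, None), 3);
}
```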
@@ -10,7 +10,6 @@ use Index;
 use Result;
 use Searcher;
 use SegmentReader;
-use schema::Schema;
 
 /// Defines when a new version of the index should be reloaded.
 ///
@@ -159,11 +158,6 @@ pub struct IndexReader {
 }
 
 impl IndexReader {
-
-    pub fn schema(&self) -> Schema {
-        self.inner.index.schema()
-    }
-
     /// Update searchers so that they reflect the state of the last
     /// `.commit()`.
     ///
@@ -128,7 +128,7 @@ impl Document {
         self.field_values
             .iter()
             .filter(|field_value| field_value.field() == field)
-            .map(|field_value| field_value.value())
+            .map(FieldValue::value)
             .collect()
     }
 
@@ -137,7 +137,7 @@ impl Document {
         self.field_values
             .iter()
             .find(|field_value| field_value.field() == field)
-            .map(|field_value| field_value.value())
+            .map(FieldValue::value)
     }
 }
 
@@ -4,6 +4,7 @@ use schema::{IntOptions, TextOptions};
 
 use schema::Facet;
 use schema::IndexRecordOption;
+use schema::TextFieldIndexing;
 use schema::Value;
 use serde_json::Value as JsonValue;
 
@@ -94,7 +95,7 @@ impl FieldType {
         match *self {
             FieldType::Str(ref text_options) => text_options
                 .get_indexing_options()
-                .map(|indexing_options| indexing_options.index_option()),
+                .map(TextFieldIndexing::index_option),
             FieldType::U64(ref int_options)
             | FieldType::I64(ref int_options)
             | FieldType::Date(ref int_options) => {
@@ -130,7 +130,16 @@ impl SchemaBuilder {
         self.add_field(field_entry)
     }
 
-    /// Adds a fast bytes field to the schema
+    /// Adds a fast bytes field to the schema.
+    ///
+    /// Bytes field are not searchable and are only used
+    /// as fast field, to associate any kind of payload
+    /// to a document.
+    ///
+    /// For instance, learning-to-rank often requires to access
+    /// some document features at scoring time.
+    /// These can be serializing and stored as a bytes field to
+    /// get access rapidly when scoring each document.
     pub fn add_bytes_field(&mut self, field_name: &str) -> Field {
         let field_entry = FieldEntry::new_bytes(field_name.to_string());
         self.add_field(field_entry)
@@ -224,7 +233,7 @@ impl Schema {
         let field_name = self.get_field_name(field);
         let values: Vec<Value> = field_values
             .into_iter()
-            .map(|field_val| field_val.value())
+            .map(FieldValue::value)
             .cloned()
             .collect();
         field_map.insert(field_name.to_string(), values);
@@ -1,6 +1,7 @@
 use htmlescape::encode_minimal;
 use query::Query;
 use schema::Field;
+use schema::Value;
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
@@ -303,7 +304,7 @@ impl SnippetGenerator {
         let text: String = doc
             .get_all(self.field)
             .into_iter()
-            .flat_map(|val| val.text())
+            .flat_map(Value::text)
             .collect::<Vec<&str>>()
             .join(" ");
         self.snippet(&text)
@@ -227,7 +227,7 @@ pub struct PerFieldSpaceUsage {
 
 impl PerFieldSpaceUsage {
     pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
-        let total = fields.values().map(|x| x.total()).sum();
+        let total = fields.values().map(FieldUsage::total).sum();
         PerFieldSpaceUsage { fields, total }
     }
 
src/tokenizer/ascii_folding_filter.rs (new file, 4064 lines): diff suppressed because it is too large
@@ -44,18 +44,17 @@ where
     }
 
     fn advance(&mut self) -> bool {
-        if self.tail.advance() {
-            if self.token_mut().text.is_ascii() {
-                // fast track for ascii.
-                self.token_mut().text.make_ascii_lowercase();
-            } else {
-                to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
-                mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
-            }
-            true
-        } else {
-            false
+        if !self.tail.advance() {
+            return false;
         }
+        if self.token_mut().text.is_ascii() {
+            // fast track for ascii.
+            self.token_mut().text.make_ascii_lowercase();
+        } else {
+            to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
+            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
+        }
+        true
     }
 }
 
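The same mechanical rewrite (invert the condition, return early, un-indent the happy path) is applied to several `advance` implementations in this diff. In miniature:

```rust
// Before: the happy path is nested one level deep.
fn advance_nested(has_next: bool) -> bool {
    if has_next {
        // ... do the work ...
        true
    } else {
        false
    }
}

// After: a guard clause keeps the happy path flat.
fn advance_guard(has_next: bool) -> bool {
    if !has_next {
        return false;
    }
    // ... do the work ...
    true
}

fn main() {
    assert_eq!(advance_nested(true), advance_guard(true));
    assert_eq!(advance_nested(false), advance_guard(false));
}
```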
@@ -97,6 +97,8 @@
 //! If you built your schema programmatically, a complete example
 //! could like this for instance.
 //!
+//! Note that tokens with a len greater or equal to [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html).
+//!
 //! # Example
 //!
 //! ```
@@ -129,6 +131,7 @@
 //! ```
 //!
 mod alphanum_only;
+mod ascii_folding_filter;
 mod facet_tokenizer;
 mod lower_caser;
 mod ngram_tokenizer;
@@ -142,6 +145,7 @@ mod tokenizer;
 mod tokenizer_manager;
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
+pub use self::ascii_folding_filter::AsciiFoldingFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
 pub use self::lower_caser::LowerCaser;
 pub use self::ngram_tokenizer::NgramTokenizer;
@@ -157,6 +161,13 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
+/// Maximum authorized len (in bytes) for a token.
+///
+/// Tokenizer are in charge of not emitting tokens larger than this value.
+/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
+/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
+pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
+
 #[cfg(test)]
 pub mod tests {
     use super::{
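The constant works out to 65531 bytes; the 4 bytes of headroom presumably leave room for the field id prefix in serialized term keys, though only the value itself is pinned by the doc comment. A quick check of the arithmetic:

```rust
fn main() {
    // MAX_TOKEN_LEN = u16::max_value() - 4 = 65_535 - 4
    let max_token_len: usize = u16::max_value() as usize - 4;
    assert_eq!(max_token_len, 65_531);
}
```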
@@ -228,27 +239,27 @@ pub mod tests {
     fn test_non_en_tokenizer() {
         let tokenizer_manager = TokenizerManager::default();
         tokenizer_manager.register(
-            "es_stem",
+            "el_stem",
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
-                .filter(Stemmer::new(Language::Spanish)),
+                .filter(Stemmer::new(Language::Greek)),
         );
-        let en_tokenizer = tokenizer_manager.get("es_stem").unwrap();
+        let en_tokenizer = tokenizer_manager.get("el_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
             let mut add_token = |token: &Token| {
                 tokens.push(token.clone());
             };
             en_tokenizer
-                .token_stream("Hola, feliz contribuyente!")
+                .token_stream("Καλημέρα, χαρούμενε φορολογούμενε!")
                 .process(&mut add_token);
         }
 
         assert_eq!(tokens.len(), 3);
-        assert_token(&tokens[0], 0, "hola", 0, 4);
-        assert_token(&tokens[1], 1, "feliz", 6, 11);
-        assert_token(&tokens[2], 2, "contribuyent", 12, 25);
+        assert_token(&tokens[0], 0, "καλημερ", 0, 16);
+        assert_token(&tokens[1], 1, "χαρουμεν", 18, 36);
+        assert_token(&tokens[2], 2, "φορολογουμεν", 37, 63);
     }
 
     #[test]
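The offsets in the Greek assertions are byte offsets, not character offsets: every Greek letter encodes to two bytes in UTF-8, so the 8-character word "Καλημέρα" spans bytes 0..16. A quick check:

```rust
fn main() {
    let text = "Καλημέρα, χαρούμενε φορολογούμενε!";
    // Every Greek letter is 2 bytes in UTF-8.
    assert_eq!("Καλημέρα".len(), 16);
    // The second word starts after "Καλημέρα" (16 bytes), a comma, and a space.
    assert_eq!(&text[18..36], "χαρούμενε");
}
```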
@@ -29,12 +29,9 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
 
 impl TokenStream for RawTokenStream {
     fn advance(&mut self) -> bool {
-        if self.has_token {
-            self.has_token = false;
-            true
-        } else {
-            false
-        }
+        let result = self.has_token;
+        self.has_token = false;
+        result
     }
 
     fn token(&self) -> &Token {
@@ -91,7 +91,6 @@ where
                 return true;
             }
         }
-
         false
     }
 }
@@ -38,23 +38,16 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
     fn advance(&mut self) -> bool {
         self.token.text.clear();
         self.token.position = self.token.position.wrapping_add(1);
-        loop {
-            match self.chars.next() {
-                Some((offset_from, c)) => {
-                    if c.is_alphanumeric() {
-                        let offset_to = self.search_token_end();
-                        self.token.offset_from = offset_from;
-                        self.token.offset_to = offset_to;
-                        self.token.text.push_str(&self.text[offset_from..offset_to]);
-                        return true;
-                    }
-                }
-                None => {
-                    return false;
-                }
-            }
+        while let Some((offset_from, c)) = self.chars.next() {
+            if c.is_alphanumeric() {
+                let offset_to = self.search_token_end();
+                self.token.offset_from = offset_from;
+                self.token.offset_to = offset_to;
+                self.token.text.push_str(&self.text[offset_from..offset_to]);
+                return true;
+            }
         }
+        false
     }
 
     fn token(&self) -> &Token {
@@ -2,7 +2,6 @@
 
 use super::{Token, TokenFilter, TokenStream};
 use rust_stemmers::{self, Algorithm};
-use std::sync::Arc;
 
 /// Available stemmer languages.
 #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)]
@@ -57,14 +56,14 @@ impl Language {
 /// Tokens are expected to be lowercased beforehand.
 #[derive(Clone)]
 pub struct Stemmer {
-    stemmer_algorithm: Arc<Algorithm>,
+    stemmer_algorithm: Algorithm,
 }
 
 impl Stemmer {
     /// Creates a new Stemmer `TokenFilter` for a given language algorithm.
     pub fn new(language: Language) -> Stemmer {
         Stemmer {
-            stemmer_algorithm: Arc::new(language.algorithm()),
+            stemmer_algorithm: language.algorithm(),
         }
     }
 }
@@ -83,7 +82,7 @@ where
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
+        let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream::wrap(inner_stemmer, token_stream)
     }
 }
@@ -109,15 +108,14 @@ where
     }
 
     fn advance(&mut self) -> bool {
-        if self.tail.advance() {
-            // TODO remove allocation
-            let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
-            self.token_mut().text.clear();
-            self.token_mut().text.push_str(&stemmed_str);
-            true
-        } else {
-            false
+        if !self.tail.advance() {
+            return false;
         }
+        // TODO remove allocation
+        let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
+        self.token_mut().text.clear();
+        self.token_mut().text.push_str(&stemmed_str);
+        true
     }
 }
 
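Two things happen in the stemmer hunks: the hard-coded `Algorithm::English` bug is fixed so the configured language is actually used at transform time, and the `Arc` wrapper is dropped because `Algorithm` is a plain `Copy` enum, which makes the indirection pure overhead. In miniature (local enum, illustrative only):

```rust
use std::sync::Arc;

#[derive(Debug, Clone, Copy, PartialEq)]
enum Algorithm {
    English,
    Greek,
}

fn main() {
    // A Copy enum is cheaper to store and pass by value...
    let by_value: Algorithm = Algorithm::Greek;
    // ...than behind an Arc, which costs an allocation and refcount traffic.
    let by_arc: Arc<Algorithm> = Arc::new(Algorithm::Greek);
    assert_eq!(by_value, *by_arc);
    assert_ne!(by_value, Algorithm::English);
}
```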
@@ -104,7 +104,6 @@ where
                 return true;
             }
         }
-
         false
     }
 }
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::ops::Deref;
 use std::sync::{Arc, RwLock};
 use tokenizer::box_tokenizer;
 use tokenizer::stemmer::Language;
@@ -46,7 +47,8 @@ impl TokenizerManager {
             .read()
             .expect("Acquiring the lock should never fail")
             .get(tokenizer_name)
-            .map(|boxed_tokenizer| boxed_tokenizer.boxed_clone())
+            .map(Deref::deref)
+            .map(BoxedTokenizer::boxed_clone)
     }
 }
 