mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 10:02:55 +00:00
Compare commits
15 Commits
issue/526b
...
issue/554
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
94c73363e4 | ||
|
|
96f194635f | ||
|
|
444662485f | ||
|
|
943c25d0f8 | ||
|
|
5c0b2a4579 | ||
|
|
9870a9258d | ||
|
|
7102b363f5 | ||
|
|
66b4615e4e | ||
|
|
da46913839 | ||
|
|
3df037961f | ||
|
|
8ffae47854 | ||
|
|
1a90a1f3b0 | ||
|
|
dac50c6aeb | ||
|
|
31b22c5acc | ||
|
|
8e50921363 |
@@ -61,6 +61,9 @@ before_script:
|
|||||||
script:
|
script:
|
||||||
- bash ci/script.sh
|
- bash ci/script.sh
|
||||||
|
|
||||||
|
after_success:
|
||||||
|
- cargo doc-upload
|
||||||
|
|
||||||
before_deploy:
|
before_deploy:
|
||||||
- sh ci/before_deploy.sh
|
- sh ci/before_deploy.sh
|
||||||
|
|
||||||
|
|||||||
30
CHANGELOG.md
30
CHANGELOG.md
@@ -1,13 +1,39 @@
|
|||||||
Tantivy 0.10.0
|
Tantivy 0.10.0
|
||||||
====================
|
=====================
|
||||||
|
|
||||||
|
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
|
||||||
|
|
||||||
|
- Added an ASCII folding filter (@drusellers)
|
||||||
|
- Bugfix in `query.count` in presence of deletes (@pmasurel)
|
||||||
|
|
||||||
Minor
|
Minor
|
||||||
---------
|
---------
|
||||||
- Small simplification of the code.
|
- Small simplification of the code.
|
||||||
Calling .freq() or .doc() when .advance() has never
|
Calling .freq() or .doc() when .advance() has never been called
|
||||||
on segment postings should panic from now on.
|
on segment postings should panic from now on.
|
||||||
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
||||||
|
- Fast fields are now preloaded when the `SegmentReader` is created.
|
||||||
|
- `IndexMeta` is now public. (@hntd187)
|
||||||
|
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `
|
||||||
|
Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
|
||||||
|
only require a read lock. (@pmasurel)
|
||||||
|
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
|
||||||
|
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
|
||||||
|
|
||||||
|
## How to update?
|
||||||
|
|
||||||
|
Your existing indexes are usable as is, but you may need some
|
||||||
|
trivial updates.
|
||||||
|
|
||||||
|
### Fast fields
|
||||||
|
|
||||||
|
Fast fields used to be accessed directly from the `SegmentReader`.
|
||||||
|
The API changed, you are now required to acquire your fast field reader via the
|
||||||
|
`segment_reader.fast_fields()`, and use one of the typed method:
|
||||||
|
- `.u64()`, `.i64()` if your field is single-valued ;
|
||||||
|
- `.u64s()`, `.i64s()` if your field is multi-valued ;
|
||||||
|
- `.bytes()` if your field is bytes fast field.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Tantivy 0.9.0
|
Tantivy 0.9.0
|
||||||
|
|||||||
@@ -18,8 +18,8 @@ use tantivy::fastfield::FastFieldReader;
|
|||||||
use tantivy::query::QueryParser;
|
use tantivy::query::QueryParser;
|
||||||
use tantivy::schema::Field;
|
use tantivy::schema::Field;
|
||||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||||
use tantivy::Index;
|
|
||||||
use tantivy::SegmentReader;
|
use tantivy::SegmentReader;
|
||||||
|
use tantivy::{Index, TantivyError};
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct Stats {
|
struct Stats {
|
||||||
@@ -75,9 +75,18 @@ impl Collector for StatsCollector {
|
|||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
_segment_local_id: u32,
|
_segment_local_id: u32,
|
||||||
segment: &SegmentReader,
|
segment_reader: &SegmentReader,
|
||||||
) -> tantivy::Result<StatsSegmentCollector> {
|
) -> tantivy::Result<StatsSegmentCollector> {
|
||||||
let fast_field_reader = segment.fast_field_reader(self.field)?;
|
let fast_field_reader = segment_reader
|
||||||
|
.fast_fields()
|
||||||
|
.u64(self.field)
|
||||||
|
.ok_or_else(|| {
|
||||||
|
let field_name = segment_reader.schema().get_field_name(self.field);
|
||||||
|
TantivyError::SchemaError(format!(
|
||||||
|
"Field {:?} is not a u64 fast field.",
|
||||||
|
field_name
|
||||||
|
))
|
||||||
|
})?;
|
||||||
Ok(StatsSegmentCollector {
|
Ok(StatsSegmentCollector {
|
||||||
fast_field_reader,
|
fast_field_reader,
|
||||||
stats: Stats::default(),
|
stats: Stats::default(),
|
||||||
|
|||||||
107
examples/multiple_producer.rs
Normal file
107
examples/multiple_producer.rs
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
// # Indexing from different threads.
|
||||||
|
//
|
||||||
|
// It is fairly common to have to index from different threads.
|
||||||
|
// Tantivy forbids to create more than one `IndexWriter` at a time.
|
||||||
|
//
|
||||||
|
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
|
||||||
|
// indexing threads will not help. However, it can still be useful for some applications.
|
||||||
|
//
|
||||||
|
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
|
||||||
|
// your application, it is reasonable to have multiple threads.
|
||||||
|
//
|
||||||
|
// Another very common reason to want to index from multiple threads, is implementing a webserver
|
||||||
|
// with CRUD capabilities. The server framework will most likely handle request from
|
||||||
|
// different threads.
|
||||||
|
//
|
||||||
|
// The recommended way to address both of these use case is to wrap your `IndexWriter` into a
|
||||||
|
// `Arc<RwLock<IndexWriter>>`.
|
||||||
|
//
|
||||||
|
// While this is counterintuitive, adding and deleting documents do not require mutability
|
||||||
|
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
|
||||||
|
//
|
||||||
|
// The example below does not represent an actual real-life use case (who would spawn thread to
|
||||||
|
// index a single document?), but aims at demonstrating the mechanism that makes indexing
|
||||||
|
// from several threads possible.
|
||||||
|
|
||||||
|
extern crate tempdir;
|
||||||
|
|
||||||
|
// ---
|
||||||
|
// Importing tantivy...
|
||||||
|
#[macro_use]
|
||||||
|
extern crate tantivy;
|
||||||
|
use std::sync::{Arc, RwLock};
|
||||||
|
use std::thread;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tantivy::schema::{Schema, STORED, TEXT};
|
||||||
|
use tantivy::Opstamp;
|
||||||
|
use tantivy::{Index, IndexWriter};
|
||||||
|
|
||||||
|
fn main() -> tantivy::Result<()> {
|
||||||
|
// # Defining the schema
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||||
|
let body = schema_builder.add_text_field("body", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
|
||||||
|
|
||||||
|
// # First indexing thread.
|
||||||
|
let index_writer_clone_1 = index_writer.clone();
|
||||||
|
thread::spawn(move || {
|
||||||
|
// we index 100 times the document... for the sake of the example.
|
||||||
|
for i in 0..100 {
|
||||||
|
let opstamp = {
|
||||||
|
// A read lock is sufficient here.
|
||||||
|
let index_writer_rlock = index_writer_clone_1.read().unwrap();
|
||||||
|
index_writer_rlock.add_document(
|
||||||
|
doc!(
|
||||||
|
title => "Of Mice and Men",
|
||||||
|
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||||
|
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||||
|
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||||
|
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||||
|
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||||
|
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||||
|
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||||
|
limbs and branches that arch over the pool"
|
||||||
|
))
|
||||||
|
};
|
||||||
|
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||||
|
thread::sleep(Duration::from_millis(20));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// # Second indexing thread.
|
||||||
|
let index_writer_clone_2 = index_writer.clone();
|
||||||
|
// For convenience, tantivy also comes with a macro to
|
||||||
|
// reduce the boilerplate above.
|
||||||
|
thread::spawn(move || {
|
||||||
|
// we index 100 times the document... for the sake of the example.
|
||||||
|
for i in 0..100 {
|
||||||
|
// A read lock is sufficient here.
|
||||||
|
let opstamp = {
|
||||||
|
let index_writer_rlock = index_writer_clone_2.read().unwrap();
|
||||||
|
index_writer_rlock.add_document(doc!(
|
||||||
|
title => "Manufacturing consent",
|
||||||
|
body => "Some great book description..."
|
||||||
|
))
|
||||||
|
};
|
||||||
|
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||||
|
thread::sleep(Duration::from_millis(10));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// # In the main thread, we commit 10 times, once every 500ms.
|
||||||
|
for _ in 0..10 {
|
||||||
|
let opstamp: Opstamp = {
|
||||||
|
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||||
|
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||||
|
index_writer_wlock.commit().unwrap()
|
||||||
|
};
|
||||||
|
println!("committed with opstamp {}", opstamp);
|
||||||
|
thread::sleep(Duration::from_millis(500));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@ use Result;
|
|||||||
use Score;
|
use Score;
|
||||||
use SegmentLocalId;
|
use SegmentLocalId;
|
||||||
use SegmentReader;
|
use SegmentReader;
|
||||||
|
use TantivyError;
|
||||||
|
|
||||||
struct Hit<'a> {
|
struct Hit<'a> {
|
||||||
count: u64,
|
count: u64,
|
||||||
@@ -264,7 +265,10 @@ impl Collector for FacetCollector {
|
|||||||
_: SegmentLocalId,
|
_: SegmentLocalId,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
) -> Result<FacetSegmentCollector> {
|
) -> Result<FacetSegmentCollector> {
|
||||||
let facet_reader = reader.facet_reader(self.field)?;
|
let field_name = reader.schema().get_field_name(self.field);
|
||||||
|
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
|
||||||
|
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
|
||||||
|
})?;
|
||||||
|
|
||||||
let mut collapse_mapping = Vec::new();
|
let mut collapse_mapping = Vec::new();
|
||||||
let mut counts = Vec::new();
|
let mut counts = Vec::new();
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ use super::Collector;
|
|||||||
use super::SegmentCollector;
|
use super::SegmentCollector;
|
||||||
use collector::Fruit;
|
use collector::Fruit;
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
|
use std::ops::Deref;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
@@ -199,7 +200,10 @@ impl<'a> Collector for MultiCollector<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn requires_scoring(&self) -> bool {
|
||||||
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
self.collector_wrappers
|
||||||
|
.iter()
|
||||||
|
.map(Deref::deref)
|
||||||
|
.any(Collector::requires_scoring)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
||||||
|
|||||||
@@ -114,11 +114,15 @@ impl Collector for FastFieldTestCollector {
|
|||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
_: SegmentLocalId,
|
_: SegmentLocalId,
|
||||||
reader: &SegmentReader,
|
segment_reader: &SegmentReader,
|
||||||
) -> Result<FastFieldSegmentCollector> {
|
) -> Result<FastFieldSegmentCollector> {
|
||||||
|
let reader = segment_reader
|
||||||
|
.fast_fields()
|
||||||
|
.u64(self.field)
|
||||||
|
.expect("Requested field is not a fast field.");
|
||||||
Ok(FastFieldSegmentCollector {
|
Ok(FastFieldSegmentCollector {
|
||||||
vals: Vec::new(),
|
vals: Vec::new(),
|
||||||
reader: reader.fast_field_reader(self.field)?,
|
reader,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -170,11 +174,14 @@ impl Collector for BytesFastFieldTestCollector {
|
|||||||
fn for_segment(
|
fn for_segment(
|
||||||
&self,
|
&self,
|
||||||
_segment_local_id: u32,
|
_segment_local_id: u32,
|
||||||
segment: &SegmentReader,
|
segment_reader: &SegmentReader,
|
||||||
) -> Result<BytesFastFieldSegmentCollector> {
|
) -> Result<BytesFastFieldSegmentCollector> {
|
||||||
Ok(BytesFastFieldSegmentCollector {
|
Ok(BytesFastFieldSegmentCollector {
|
||||||
vals: Vec::new(),
|
vals: Vec::new(),
|
||||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
reader: segment_reader
|
||||||
|
.fast_fields()
|
||||||
|
.bytes(self.field)
|
||||||
|
.expect("Field is not a bytes fast field."),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -191,7 +198,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
|
|||||||
type Fruit = Vec<u8>;
|
type Fruit = Vec<u8>;
|
||||||
|
|
||||||
fn collect(&mut self, doc: u32, _score: f32) {
|
fn collect(&mut self, doc: u32, _score: f32) {
|
||||||
let data = self.reader.get_val(doc);
|
let data = self.reader.get_bytes(doc);
|
||||||
self.vals.extend(data);
|
self.vals.extend(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -98,11 +98,11 @@ where
|
|||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn for_segment(
|
pub(crate) fn for_segment<F: PartialOrd>(
|
||||||
&self,
|
&self,
|
||||||
segment_id: SegmentLocalId,
|
segment_id: SegmentLocalId,
|
||||||
_: &SegmentReader,
|
_: &SegmentReader,
|
||||||
) -> Result<TopSegmentCollector<T>> {
|
) -> Result<TopSegmentCollector<F>> {
|
||||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ use collector::SegmentCollector;
|
|||||||
use fastfield::FastFieldReader;
|
use fastfield::FastFieldReader;
|
||||||
use fastfield::FastValue;
|
use fastfield::FastValue;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
|
use std::marker::PhantomData;
|
||||||
use DocAddress;
|
use DocAddress;
|
||||||
use Result;
|
use Result;
|
||||||
use SegmentLocalId;
|
use SegmentLocalId;
|
||||||
use SegmentReader;
|
use SegmentReader;
|
||||||
|
use TantivyError;
|
||||||
|
|
||||||
/// The Top Field Collector keeps track of the K documents
|
/// The Top Field Collector keeps track of the K documents
|
||||||
/// sorted by a fast field in the index
|
/// sorted by a fast field in the index
|
||||||
@@ -106,8 +108,15 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
|
|||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
) -> Result<TopFieldSegmentCollector<T>> {
|
) -> Result<TopFieldSegmentCollector<T>> {
|
||||||
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
||||||
let reader = reader.fast_field_reader(self.field)?;
|
let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
|
||||||
Ok(TopFieldSegmentCollector { collector, reader })
|
let field_name = reader.schema().get_field_name(self.field);
|
||||||
|
TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
|
||||||
|
})?;
|
||||||
|
Ok(TopFieldSegmentCollector {
|
||||||
|
collector,
|
||||||
|
reader,
|
||||||
|
_type: PhantomData,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn requires_scoring(&self) -> bool {
|
||||||
@@ -122,9 +131,10 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
|
pub struct TopFieldSegmentCollector<T> {
|
||||||
collector: TopSegmentCollector<T>,
|
collector: TopSegmentCollector<u64>,
|
||||||
reader: FastFieldReader<T>,
|
reader: FastFieldReader<u64>,
|
||||||
|
_type: PhantomData<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||||
@@ -138,7 +148,11 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn harvest(self) -> Vec<(T, DocAddress)> {
|
fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||||
self.collector.harvest()
|
self.collector
|
||||||
|
.harvest()
|
||||||
|
.into_iter()
|
||||||
|
.map(|(val, doc_address)| (T::from_u64(val), doc_address))
|
||||||
|
.collect()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -235,7 +249,7 @@ mod tests {
|
|||||||
.for_segment(0, segment)
|
.for_segment(0, segment)
|
||||||
.map(|_| ())
|
.map(|_| ())
|
||||||
.unwrap_err(),
|
.unwrap_err(),
|
||||||
TantivyError::FastFieldError(_)
|
TantivyError::SchemaError(_)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -340,7 +340,7 @@ impl Index {
|
|||||||
Ok(self
|
Ok(self
|
||||||
.searchable_segment_metas()?
|
.searchable_segment_metas()?
|
||||||
.iter()
|
.iter()
|
||||||
.map(|segment_meta| segment_meta.id())
|
.map(SegmentMeta::id)
|
||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ use core::SegmentMeta;
|
|||||||
use schema::Schema;
|
use schema::Schema;
|
||||||
use serde_json;
|
use serde_json;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
use Opstamp;
|
||||||
|
|
||||||
/// Meta information about the `Index`.
|
/// Meta information about the `Index`.
|
||||||
///
|
///
|
||||||
@@ -15,7 +16,7 @@ use std::fmt;
|
|||||||
pub struct IndexMeta {
|
pub struct IndexMeta {
|
||||||
pub segments: Vec<SegmentMeta>,
|
pub segments: Vec<SegmentMeta>,
|
||||||
pub schema: Schema,
|
pub schema: Schema,
|
||||||
pub opstamp: u64,
|
pub opstamp: Opstamp,
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub payload: Option<String>,
|
pub payload: Option<String>,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ impl Searcher {
|
|||||||
) -> Searcher {
|
) -> Searcher {
|
||||||
let store_readers = segment_readers
|
let store_readers = segment_readers
|
||||||
.iter()
|
.iter()
|
||||||
.map(|segment_reader| segment_reader.get_store_reader())
|
.map(SegmentReader::get_store_reader)
|
||||||
.collect();
|
.collect();
|
||||||
Searcher {
|
Searcher {
|
||||||
schema,
|
schema,
|
||||||
@@ -218,7 +218,7 @@ impl fmt::Debug for Searcher {
|
|||||||
let segment_ids = self
|
let segment_ids = self
|
||||||
.segment_readers
|
.segment_readers
|
||||||
.iter()
|
.iter()
|
||||||
.map(|segment_reader| segment_reader.segment_id())
|
.map(SegmentReader::segment_id)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
write!(f, "Searcher({:?})", segment_ids)
|
write!(f, "Searcher({:?})", segment_ids)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use schema::Schema;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::result;
|
use std::result;
|
||||||
|
use Opstamp;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
/// A segment is a piece of the index.
|
/// A segment is a piece of the index.
|
||||||
@@ -50,7 +51,7 @@ impl Segment {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[doc(hidden)]
|
#[doc(hidden)]
|
||||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
|
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||||
Segment {
|
Segment {
|
||||||
index: self.index,
|
index: self.index,
|
||||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use serde;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
use Opstamp;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
|
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
|
||||||
@@ -13,7 +14,7 @@ lazy_static! {
|
|||||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
struct DeleteMeta {
|
struct DeleteMeta {
|
||||||
num_deleted_docs: u32,
|
num_deleted_docs: u32,
|
||||||
opstamp: u64,
|
opstamp: Opstamp,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `SegmentMeta` contains simple meta information about a segment.
|
/// `SegmentMeta` contains simple meta information about a segment.
|
||||||
@@ -136,9 +137,9 @@ impl SegmentMeta {
|
|||||||
self.max_doc() - self.num_deleted_docs()
|
self.max_doc() - self.num_deleted_docs()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the opstamp of the last delete operation
|
/// Returns the `Opstamp` of the last delete operation
|
||||||
/// taken in account in this segment.
|
/// taken in account in this segment.
|
||||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||||
self.tracked
|
self.tracked
|
||||||
.deletes
|
.deletes
|
||||||
.as_ref()
|
.as_ref()
|
||||||
@@ -152,7 +153,7 @@ impl SegmentMeta {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[doc(hidden)]
|
#[doc(hidden)]
|
||||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
|
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||||
let delete_meta = DeleteMeta {
|
let delete_meta = DeleteMeta {
|
||||||
num_deleted_docs,
|
num_deleted_docs,
|
||||||
opstamp,
|
opstamp,
|
||||||
|
|||||||
@@ -5,14 +5,10 @@ use core::Segment;
|
|||||||
use core::SegmentComponent;
|
use core::SegmentComponent;
|
||||||
use core::SegmentId;
|
use core::SegmentId;
|
||||||
use directory::ReadOnlySource;
|
use directory::ReadOnlySource;
|
||||||
use error::TantivyError;
|
|
||||||
use fastfield::DeleteBitSet;
|
use fastfield::DeleteBitSet;
|
||||||
use fastfield::FacetReader;
|
use fastfield::FacetReader;
|
||||||
use fastfield::FastFieldReader;
|
use fastfield::FastFieldReaders;
|
||||||
use fastfield::{self, FastFieldNotAvailableError};
|
|
||||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
|
||||||
use fieldnorm::FieldNormReader;
|
use fieldnorm::FieldNormReader;
|
||||||
use schema::Cardinality;
|
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use schema::FieldType;
|
use schema::FieldType;
|
||||||
use schema::Schema;
|
use schema::Schema;
|
||||||
@@ -51,7 +47,7 @@ pub struct SegmentReader {
|
|||||||
postings_composite: CompositeFile,
|
postings_composite: CompositeFile,
|
||||||
positions_composite: CompositeFile,
|
positions_composite: CompositeFile,
|
||||||
positions_idx_composite: CompositeFile,
|
positions_idx_composite: CompositeFile,
|
||||||
fast_fields_composite: CompositeFile,
|
fast_fields_readers: Arc<FastFieldReaders>,
|
||||||
fieldnorms_composite: CompositeFile,
|
fieldnorms_composite: CompositeFile,
|
||||||
|
|
||||||
store_source: ReadOnlySource,
|
store_source: ReadOnlySource,
|
||||||
@@ -105,93 +101,21 @@ impl SegmentReader {
|
|||||||
///
|
///
|
||||||
/// # Panics
|
/// # Panics
|
||||||
/// May panic if the index is corrupted.
|
/// May panic if the index is corrupted.
|
||||||
pub fn fast_field_reader<Item: FastValue>(
|
pub fn fast_fields(&self) -> &FastFieldReaders {
|
||||||
&self,
|
&self.fast_fields_readers
|
||||||
field: Field,
|
|
||||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
|
||||||
let field_entry = self.schema.get_field_entry(field);
|
|
||||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
|
||||||
{
|
|
||||||
self.fast_fields_composite
|
|
||||||
.open_read(field)
|
|
||||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
|
||||||
.map(FastFieldReader::open)
|
|
||||||
} else {
|
|
||||||
Err(FastFieldNotAvailableError::new(field_entry))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
|
||||||
&self,
|
|
||||||
field: Field,
|
|
||||||
idx: usize,
|
|
||||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
|
||||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
|
||||||
Ok(FastFieldReader::open(ff_source))
|
|
||||||
} else {
|
|
||||||
let field_entry = self.schema.get_field_entry(field);
|
|
||||||
Err(FastFieldNotAvailableError::new(field_entry))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
|
|
||||||
/// May panick if the field is not a multivalued fastfield of the type `Item`.
|
|
||||||
pub fn multi_fast_field_reader<Item: FastValue>(
|
|
||||||
&self,
|
|
||||||
field: Field,
|
|
||||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
|
||||||
let field_entry = self.schema.get_field_entry(field);
|
|
||||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
|
||||||
{
|
|
||||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
|
||||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
|
||||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
|
||||||
} else {
|
|
||||||
Err(FastFieldNotAvailableError::new(field_entry))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
|
||||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
|
||||||
let field_entry = self.schema.get_field_entry(field);
|
|
||||||
match *field_entry.field_type() {
|
|
||||||
FieldType::Bytes => {}
|
|
||||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
|
||||||
}
|
|
||||||
let idx_reader = self
|
|
||||||
.fast_fields_composite
|
|
||||||
.open_read_with_idx(field, 0)
|
|
||||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
|
||||||
.map(FastFieldReader::open)?;
|
|
||||||
let values = self
|
|
||||||
.fast_fields_composite
|
|
||||||
.open_read_with_idx(field, 1)
|
|
||||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
|
||||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
|
||||||
let field_entry = self.schema.get_field_entry(field);
|
let field_entry = self.schema.get_field_entry(field);
|
||||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||||
return Err(TantivyError::InvalidArgument(format!(
|
return None;
|
||||||
"The field {:?} is not a \
|
|
||||||
hierarchical facet.",
|
|
||||||
field_entry
|
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
let term_ords_reader = self.fast_fields().u64s(field)?;
|
||||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
let termdict_source = self.termdict_composite.open_read(field)?;
|
||||||
TantivyError::InvalidArgument(format!(
|
|
||||||
"The field \"{}\" is a hierarchical \
|
|
||||||
but this segment does not seem to have the field term \
|
|
||||||
dictionary.",
|
|
||||||
field_entry.name()
|
|
||||||
))
|
|
||||||
})?;
|
|
||||||
let termdict = TermDictionary::from_source(&termdict_source);
|
let termdict = TermDictionary::from_source(&termdict_source);
|
||||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||||
Ok(facet_reader)
|
Some(facet_reader)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Accessor to the segment's `Field norms`'s reader.
|
/// Accessor to the segment's `Field norms`'s reader.
|
||||||
@@ -247,8 +171,12 @@ impl SegmentReader {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let schema = segment.schema();
|
||||||
|
|
||||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||||
|
let fast_field_readers =
|
||||||
|
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||||
|
|
||||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||||
@@ -260,14 +188,13 @@ impl SegmentReader {
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
let schema = segment.schema();
|
|
||||||
Ok(SegmentReader {
|
Ok(SegmentReader {
|
||||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||||
max_doc: segment.meta().max_doc(),
|
max_doc: segment.meta().max_doc(),
|
||||||
num_docs: segment.meta().num_docs(),
|
num_docs: segment.meta().num_docs(),
|
||||||
termdict_composite,
|
termdict_composite,
|
||||||
postings_composite,
|
postings_composite,
|
||||||
fast_fields_composite,
|
fast_fields_readers: fast_field_readers,
|
||||||
fieldnorms_composite,
|
fieldnorms_composite,
|
||||||
segment_id: segment.id(),
|
segment_id: segment.id(),
|
||||||
store_source,
|
store_source,
|
||||||
@@ -381,12 +308,12 @@ impl SegmentReader {
|
|||||||
self.postings_composite.space_usage(),
|
self.postings_composite.space_usage(),
|
||||||
self.positions_composite.space_usage(),
|
self.positions_composite.space_usage(),
|
||||||
self.positions_idx_composite.space_usage(),
|
self.positions_idx_composite.space_usage(),
|
||||||
self.fast_fields_composite.space_usage(),
|
self.fast_fields_readers.space_usage(),
|
||||||
self.fieldnorms_composite.space_usage(),
|
self.fieldnorms_composite.space_usage(),
|
||||||
self.get_store_reader().space_usage(),
|
self.get_store_reader().space_usage(),
|
||||||
self.delete_bitset_opt
|
self.delete_bitset_opt
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|x| x.space_usage())
|
.map(DeleteBitSet::space_usage)
|
||||||
.unwrap_or(0),
|
.unwrap_or(0),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,14 +48,14 @@ impl RetryPolicy {
|
|||||||
///
|
///
|
||||||
/// It is transparently associated to a lock file, that gets deleted
|
/// It is transparently associated to a lock file, that gets deleted
|
||||||
/// on `Drop.` The lock is released automatically on `Drop`.
|
/// on `Drop.` The lock is released automatically on `Drop`.
|
||||||
pub struct DirectoryLock(Box<Drop + Send + 'static>);
|
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
|
||||||
|
|
||||||
struct DirectoryLockGuard {
|
struct DirectoryLockGuard {
|
||||||
directory: Box<Directory>,
|
directory: Box<Directory>,
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
|
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||||
fn from(underlying: Box<T>) -> Self {
|
fn from(underlying: Box<T>) -> Self {
|
||||||
DirectoryLock(underlying)
|
DirectoryLock(underlying)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ impl InnerDirectory {
|
|||||||
self.fs
|
self.fs
|
||||||
.get(path)
|
.get(path)
|
||||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||||
.map(|el| el.clone())
|
.map(Clone::clone)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
|
fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
|
||||||
|
|||||||
@@ -23,14 +23,14 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let reader = searcher.segment_reader(0);
|
let segment_reader = searcher.segment_reader(0);
|
||||||
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
|
let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();
|
||||||
|
|
||||||
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
|
assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
|
||||||
assert!(bytes_reader.get_val(1).is_empty());
|
assert!(bytes_reader.get_bytes(1).is_empty());
|
||||||
assert_eq!(bytes_reader.get_val(2), &[255u8]);
|
assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
|
||||||
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
|
assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
|
||||||
let long = vec![0u8; 1000];
|
let long = vec![0u8; 1000];
|
||||||
assert_eq!(bytes_reader.get_val(4), long.as_slice());
|
assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ use DocId;
|
|||||||
///
|
///
|
||||||
/// Reading the value for a document is done by reading the start index for it,
|
/// Reading the value for a document is done by reading the start index for it,
|
||||||
/// and the start index for the next document, and keeping the bytes in between.
|
/// and the start index for the next document, and keeping the bytes in between.
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct BytesFastFieldReader {
|
pub struct BytesFastFieldReader {
|
||||||
idx_reader: FastFieldReader<u64>,
|
idx_reader: FastFieldReader<u64>,
|
||||||
values: OwningRef<ReadOnlySource, [u8]>,
|
values: OwningRef<ReadOnlySource, [u8]>,
|
||||||
@@ -28,10 +29,20 @@ impl BytesFastFieldReader {
|
|||||||
BytesFastFieldReader { idx_reader, values }
|
BytesFastFieldReader { idx_reader, values }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the bytes associated to the given `doc`
|
fn range(&self, doc: DocId) -> (usize, usize) {
|
||||||
pub fn get_val(&self, doc: DocId) -> &[u8] {
|
|
||||||
let start = self.idx_reader.get(doc) as usize;
|
let start = self.idx_reader.get(doc) as usize;
|
||||||
let stop = self.idx_reader.get(doc + 1) as usize;
|
let stop = self.idx_reader.get(doc + 1) as usize;
|
||||||
|
(start, stop)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the bytes associated to the given `doc`
|
||||||
|
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
|
||||||
|
let (start, stop) = self.range(doc);
|
||||||
&self.values[start..stop]
|
&self.values[start..stop]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the overall number of bytes in this bytes fast field.
|
||||||
|
pub fn total_num_bytes(&self) -> usize {
|
||||||
|
self.values.len()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -53,16 +53,18 @@ impl DeleteBitSet {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns whether the document has been marked as deleted.
|
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||||
|
pub fn is_alive(&self, doc: DocId) -> bool {
|
||||||
|
!self.is_deleted(doc)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true iff the document has been marked as deleted.
|
||||||
|
#[inline(always)]
|
||||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||||
if self.len == 0 {
|
let byte_offset = doc / 8u32;
|
||||||
false
|
let b: u8 = (*self.data)[byte_offset as usize];
|
||||||
} else {
|
let shift = (doc & 7u32) as u8;
|
||||||
let byte_offset = doc / 8u32;
|
b & (1u8 << shift) != 0
|
||||||
let b: u8 = (*self.data)[byte_offset as usize];
|
|
||||||
let shift = (doc & 7u32) as u8;
|
|
||||||
b & (1u8 << shift) != 0
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Summarize total space usage of this bitset.
|
/// Summarize total space usage of this bitset.
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ pub use self::error::{FastFieldNotAvailableError, Result};
|
|||||||
pub use self::facet_reader::FacetReader;
|
pub use self::facet_reader::FacetReader;
|
||||||
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
||||||
pub use self::reader::FastFieldReader;
|
pub use self::reader::FastFieldReader;
|
||||||
|
pub use self::readers::FastFieldReaders;
|
||||||
pub use self::serializer::FastFieldSerializer;
|
pub use self::serializer::FastFieldSerializer;
|
||||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||||
use common;
|
use common;
|
||||||
@@ -43,6 +44,7 @@ mod error;
|
|||||||
mod facet_reader;
|
mod facet_reader;
|
||||||
mod multivalued;
|
mod multivalued;
|
||||||
mod reader;
|
mod reader;
|
||||||
|
mod readers;
|
||||||
mod serializer;
|
mod serializer;
|
||||||
mod writer;
|
mod writer;
|
||||||
|
|
||||||
@@ -78,10 +80,6 @@ impl FastValue for u64 {
|
|||||||
*self
|
*self
|
||||||
}
|
}
|
||||||
|
|
||||||
fn as_u64(&self) -> u64 {
|
|
||||||
*self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||||
match *field_type {
|
match *field_type {
|
||||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||||
@@ -89,6 +87,10 @@ impl FastValue for u64 {
|
|||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn as_u64(&self) -> u64 {
|
||||||
|
*self
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FastValue for i64 {
|
impl FastValue for i64 {
|
||||||
|
|||||||
@@ -37,9 +37,7 @@ mod tests {
|
|||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
let segment_reader = searcher.segment_reader(0);
|
||||||
let mut vals = Vec::new();
|
let mut vals = Vec::new();
|
||||||
let multi_value_reader = segment_reader
|
let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
|
||||||
.multi_fast_field_reader::<u64>(field)
|
|
||||||
.unwrap();
|
|
||||||
{
|
{
|
||||||
multi_value_reader.get_vals(2, &mut vals);
|
multi_value_reader.get_vals(2, &mut vals);
|
||||||
assert_eq!(&vals, &[4u64]);
|
assert_eq!(&vals, &[4u64]);
|
||||||
@@ -198,9 +196,9 @@ mod tests {
|
|||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let reader = searcher.segment_reader(0);
|
let segment_reader = searcher.segment_reader(0);
|
||||||
let mut vals = Vec::new();
|
let mut vals = Vec::new();
|
||||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
|
||||||
{
|
{
|
||||||
multi_value_reader.get_vals(2, &mut vals);
|
multi_value_reader.get_vals(2, &mut vals);
|
||||||
assert_eq!(&vals, &[-4i64]);
|
assert_eq!(&vals, &[-4i64]);
|
||||||
|
|||||||
@@ -26,6 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
|
||||||
|
MultiValueIntFastFieldReader {
|
||||||
|
idx_reader: self.idx_reader,
|
||||||
|
vals_reader: self.vals_reader.into_u64_reader(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns `(start, stop)`, such that the values associated
|
/// Returns `(start, stop)`, such that the values associated
|
||||||
/// to the given document are `start..stop`.
|
/// to the given document are `start..stop`.
|
||||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||||
@@ -41,13 +48,24 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
|||||||
vals.resize(len, Item::default());
|
vals.resize(len, Item::default());
|
||||||
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the number of values associated with the document `DocId`.
|
||||||
|
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||||
|
let (start, stop) = self.range(doc);
|
||||||
|
(stop - start) as usize
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the overall number of values in this field .
|
||||||
|
pub fn total_num_vals(&self) -> u64 {
|
||||||
|
self.idx_reader.max_value()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use core::Index;
|
use core::Index;
|
||||||
use schema::{Document, Facet, Schema};
|
use schema::{Facet, Schema};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multifastfield_reader() {
|
fn test_multifastfield_reader() {
|
||||||
@@ -58,22 +76,12 @@ mod tests {
|
|||||||
let mut index_writer = index
|
let mut index_writer = index
|
||||||
.writer_with_num_threads(1, 30_000_000)
|
.writer_with_num_threads(1, 30_000_000)
|
||||||
.expect("Failed to create index writer.");
|
.expect("Failed to create index writer.");
|
||||||
{
|
index_writer.add_document(doc!(
|
||||||
let mut doc = Document::new();
|
facet_field => Facet::from("/category/cat2"),
|
||||||
doc.add_facet(facet_field, "/category/cat2");
|
facet_field => Facet::from("/category/cat1"),
|
||||||
doc.add_facet(facet_field, "/category/cat1");
|
));
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
|
||||||
}
|
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
|
||||||
{
|
|
||||||
let mut doc = Document::new();
|
|
||||||
doc.add_facet(facet_field, "/category/cat2");
|
|
||||||
index_writer.add_document(doc);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
let mut doc = Document::new();
|
|
||||||
doc.add_facet(facet_field, "/category/cat3");
|
|
||||||
index_writer.add_document(doc);
|
|
||||||
}
|
|
||||||
index_writer.commit().expect("Commit failed");
|
index_writer.commit().expect("Commit failed");
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
let segment_reader = searcher.segment_reader(0);
|
||||||
|
|||||||
@@ -50,6 +50,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
|
||||||
|
FastFieldReader {
|
||||||
|
bit_unpacker: self.bit_unpacker,
|
||||||
|
min_value_u64: self.min_value_u64,
|
||||||
|
max_value_u64: self.max_value_u64,
|
||||||
|
_phantom: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Return the value associated to the given document.
|
/// Return the value associated to the given document.
|
||||||
///
|
///
|
||||||
/// This accessor should return as fast as possible.
|
/// This accessor should return as fast as possible.
|
||||||
|
|||||||
191
src/fastfield/readers.rs
Normal file
191
src/fastfield/readers.rs
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
use common::CompositeFile;
|
||||||
|
use fastfield::BytesFastFieldReader;
|
||||||
|
use fastfield::MultiValueIntFastFieldReader;
|
||||||
|
use fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||||
|
use schema::{Cardinality, Field, FieldType, Schema};
|
||||||
|
use space_usage::PerFieldSpaceUsage;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use Result;
|
||||||
|
|
||||||
|
/// Provides access to all of the FastFieldReader.
|
||||||
|
///
|
||||||
|
/// Internally, `FastFieldReaders` have preloaded fast field readers,
|
||||||
|
/// and just wraps several `HashMap`.
|
||||||
|
pub struct FastFieldReaders {
|
||||||
|
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||||
|
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||||
|
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||||
|
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||||
|
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||||
|
fast_fields_composite: CompositeFile,
|
||||||
|
}
|
||||||
|
|
||||||
|
enum FastType {
|
||||||
|
I64,
|
||||||
|
U64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||||
|
match field_type {
|
||||||
|
FieldType::U64(options) => options
|
||||||
|
.get_fastfield_cardinality()
|
||||||
|
.map(|cardinality| (FastType::U64, cardinality)),
|
||||||
|
FieldType::I64(options) => options
|
||||||
|
.get_fastfield_cardinality()
|
||||||
|
.map(|cardinality| (FastType::I64, cardinality)),
|
||||||
|
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FastFieldReaders {
|
||||||
|
pub(crate) fn load_all(
|
||||||
|
schema: &Schema,
|
||||||
|
fast_fields_composite: &CompositeFile,
|
||||||
|
) -> Result<FastFieldReaders> {
|
||||||
|
let mut fast_field_readers = FastFieldReaders {
|
||||||
|
fast_field_i64: Default::default(),
|
||||||
|
fast_field_u64: Default::default(),
|
||||||
|
fast_field_i64s: Default::default(),
|
||||||
|
fast_field_u64s: Default::default(),
|
||||||
|
fast_bytes: Default::default(),
|
||||||
|
fast_fields_composite: fast_fields_composite.clone(),
|
||||||
|
};
|
||||||
|
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||||
|
let field = Field(field_id as u32);
|
||||||
|
let field_type = field_entry.field_type();
|
||||||
|
if field_type == &FieldType::Bytes {
|
||||||
|
let idx_reader = fast_fields_composite
|
||||||
|
.open_read_with_idx(field, 0)
|
||||||
|
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||||
|
.map(FastFieldReader::open)?;
|
||||||
|
let data = fast_fields_composite
|
||||||
|
.open_read_with_idx(field, 1)
|
||||||
|
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||||
|
fast_field_readers
|
||||||
|
.fast_bytes
|
||||||
|
.insert(field, BytesFastFieldReader::open(idx_reader, data));
|
||||||
|
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
|
||||||
|
match cardinality {
|
||||||
|
Cardinality::SingleValue => {
|
||||||
|
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
|
||||||
|
match fast_type {
|
||||||
|
FastType::U64 => {
|
||||||
|
let fast_field_reader = FastFieldReader::open(fast_field_data);
|
||||||
|
fast_field_readers
|
||||||
|
.fast_field_u64
|
||||||
|
.insert(field, fast_field_reader);
|
||||||
|
}
|
||||||
|
FastType::I64 => {
|
||||||
|
fast_field_readers.fast_field_i64.insert(
|
||||||
|
field,
|
||||||
|
FastFieldReader::open(fast_field_data.clone()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Cardinality::MultiValues => {
|
||||||
|
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
|
||||||
|
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
|
||||||
|
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
|
||||||
|
let idx_reader = FastFieldReader::open(fast_field_idx);
|
||||||
|
match fast_type {
|
||||||
|
FastType::I64 => {
|
||||||
|
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||||
|
let multivalued_int_fast_field =
|
||||||
|
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||||
|
fast_field_readers
|
||||||
|
.fast_field_i64s
|
||||||
|
.insert(field, multivalued_int_fast_field);
|
||||||
|
}
|
||||||
|
FastType::U64 => {
|
||||||
|
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||||
|
let multivalued_int_fast_field =
|
||||||
|
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||||
|
fast_field_readers
|
||||||
|
.fast_field_u64s
|
||||||
|
.insert(field, multivalued_int_fast_field);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(fast_field_readers)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||||
|
self.fast_fields_composite.space_usage()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the `u64` fast field reader reader associated to `field`.
|
||||||
|
///
|
||||||
|
/// If `field` is not a u64 fast field, this method returns `None`.
|
||||||
|
pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||||
|
self.fast_field_u64.get(&field).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If the field is a u64-fast field return the associated reader.
|
||||||
|
/// If the field is a i64-fast field, return the associated u64 reader. Values are
|
||||||
|
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping. ///
|
||||||
|
///
|
||||||
|
/// This method is useful when merging segment reader.
|
||||||
|
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||||
|
if let Some(u64_ff_reader) = self.u64(field) {
|
||||||
|
return Some(u64_ff_reader);
|
||||||
|
}
|
||||||
|
if let Some(i64_ff_reader) = self.i64(field) {
|
||||||
|
return Some(i64_ff_reader.into_u64_reader());
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||||
|
///
|
||||||
|
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||||
|
pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
|
||||||
|
self.fast_field_i64.get(&field).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
|
||||||
|
///
|
||||||
|
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
|
||||||
|
pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||||
|
self.fast_field_u64s.get(&field).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If the field is a u64s-fast field return the associated reader.
|
||||||
|
/// If the field is a i64s-fast field, return the associated u64s reader. Values are
|
||||||
|
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping.
|
||||||
|
///
|
||||||
|
/// This method is useful when merging segment reader.
|
||||||
|
pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||||
|
if let Some(u64s_ff_reader) = self.u64s(field) {
|
||||||
|
return Some(u64s_ff_reader);
|
||||||
|
}
|
||||||
|
if let Some(i64s_ff_reader) = self.i64s(field) {
|
||||||
|
return Some(i64s_ff_reader.into_u64s_reader());
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a `i64s` multi-valued fast field reader reader associated to `field`.
|
||||||
|
///
|
||||||
|
/// If `field` is not a i64 multi-valued fast field, this method returns `None`.
|
||||||
|
pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
|
||||||
|
self.fast_field_i64s.get(&field).cloned()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the `bytes` fast field reader associated to `field`.
|
||||||
|
///
|
||||||
|
/// If `field` is not a bytes fast field, returns `None`.
|
||||||
|
pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
|
||||||
|
self.fast_bytes.get(&field).cloned()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -2,6 +2,7 @@ use super::operation::DeleteOperation;
|
|||||||
use std::mem;
|
use std::mem;
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
|
use Opstamp;
|
||||||
|
|
||||||
// The DeleteQueue is similar in conceptually to a multiple
|
// The DeleteQueue is similar in conceptually to a multiple
|
||||||
// consumer single producer broadcast channel.
|
// consumer single producer broadcast channel.
|
||||||
@@ -184,7 +185,7 @@ impl DeleteCursor {
|
|||||||
/// queue are consume and the next get will return None.
|
/// queue are consume and the next get will return None.
|
||||||
/// - the next get will return the first operation with an
|
/// - the next get will return the first operation with an
|
||||||
/// `opstamp >= target_opstamp`.
|
/// `opstamp >= target_opstamp`.
|
||||||
pub fn skip_to(&mut self, target_opstamp: u64) {
|
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
||||||
// TODO Can be optimize as we work with block.
|
// TODO Can be optimize as we work with block.
|
||||||
while self.is_behind_opstamp(target_opstamp) {
|
while self.is_behind_opstamp(target_opstamp) {
|
||||||
self.advance();
|
self.advance();
|
||||||
@@ -192,7 +193,7 @@ impl DeleteCursor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
||||||
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
|
||||||
self.get()
|
self.get()
|
||||||
.map(|operation| operation.opstamp < target_opstamp)
|
.map(|operation| operation.opstamp < target_opstamp)
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use DocId;
|
use DocId;
|
||||||
|
use Opstamp;
|
||||||
|
|
||||||
// Doc to opstamp is used to identify which
|
// Doc to opstamp is used to identify which
|
||||||
// document should be deleted.
|
// document should be deleted.
|
||||||
@@ -23,7 +24,7 @@ pub enum DocToOpstampMapping {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||||
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
|
fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
|
||||||
DocToOpstampMapping::WithMap(Arc::new(opstamps))
|
DocToOpstampMapping::WithMap(Arc::new(opstamps))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -35,7 +36,7 @@ impl DocToOpstampMapping {
|
|||||||
//
|
//
|
||||||
// The edge case opstamp = some doc opstamp is in practise
|
// The edge case opstamp = some doc opstamp is in practise
|
||||||
// never called.
|
// never called.
|
||||||
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
|
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
|
||||||
match *self {
|
match *self {
|
||||||
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
||||||
match doc_opstamps.binary_search(&target_opstamp) {
|
match doc_opstamps.binary_search(&target_opstamp) {
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ use std::ops::Range;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::thread::JoinHandle;
|
use std::thread::JoinHandle;
|
||||||
|
use Opstamp;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||||
@@ -99,7 +100,7 @@ pub struct IndexWriter {
|
|||||||
delete_queue: DeleteQueue,
|
delete_queue: DeleteQueue,
|
||||||
|
|
||||||
stamper: Stamper,
|
stamper: Stamper,
|
||||||
committed_opstamp: u64,
|
committed_opstamp: Opstamp,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||||
@@ -177,7 +178,7 @@ pub fn compute_deleted_bitset(
|
|||||||
segment_reader: &SegmentReader,
|
segment_reader: &SegmentReader,
|
||||||
delete_cursor: &mut DeleteCursor,
|
delete_cursor: &mut DeleteCursor,
|
||||||
doc_opstamps: &DocToOpstampMapping,
|
doc_opstamps: &DocToOpstampMapping,
|
||||||
target_opstamp: u64,
|
target_opstamp: Opstamp,
|
||||||
) -> Result<bool> {
|
) -> Result<bool> {
|
||||||
let mut might_have_changed = false;
|
let mut might_have_changed = false;
|
||||||
|
|
||||||
@@ -219,7 +220,7 @@ pub fn compute_deleted_bitset(
|
|||||||
pub fn advance_deletes(
|
pub fn advance_deletes(
|
||||||
mut segment: Segment,
|
mut segment: Segment,
|
||||||
segment_entry: &mut SegmentEntry,
|
segment_entry: &mut SegmentEntry,
|
||||||
target_opstamp: u64,
|
target_opstamp: Opstamp,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
{
|
{
|
||||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||||
@@ -299,11 +300,11 @@ fn index_documents(
|
|||||||
// the worker thread.
|
// the worker thread.
|
||||||
assert!(num_docs > 0);
|
assert!(num_docs > 0);
|
||||||
|
|
||||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||||
|
|
||||||
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
||||||
|
|
||||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||||
|
|
||||||
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||||
@@ -494,7 +495,7 @@ impl IndexWriter {
|
|||||||
/// state as it was after the last commit.
|
/// state as it was after the last commit.
|
||||||
///
|
///
|
||||||
/// The opstamp at the last commit is returned.
|
/// The opstamp at the last commit is returned.
|
||||||
pub fn rollback(&mut self) -> Result<()> {
|
pub fn rollback(&mut self) -> Result<Opstamp> {
|
||||||
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
||||||
|
|
||||||
// marks the segment updater as killed. From now on, all
|
// marks the segment updater as killed. From now on, all
|
||||||
@@ -529,7 +530,7 @@ impl IndexWriter {
|
|||||||
// was dropped with the index_writer.
|
// was dropped with the index_writer.
|
||||||
for _ in document_receiver.clone() {}
|
for _ in document_receiver.clone() {}
|
||||||
|
|
||||||
Ok(())
|
Ok(self.committed_opstamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Prepares a commit.
|
/// Prepares a commit.
|
||||||
@@ -567,7 +568,7 @@ impl IndexWriter {
|
|||||||
info!("Preparing commit");
|
info!("Preparing commit");
|
||||||
|
|
||||||
// this will drop the current document channel
|
// this will drop the current document channel
|
||||||
// and recreate a new one channels.
|
// and recreate a new one.
|
||||||
self.recreate_document_channel();
|
self.recreate_document_channel();
|
||||||
|
|
||||||
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||||
@@ -601,7 +602,7 @@ impl IndexWriter {
|
|||||||
/// Commit returns the `opstamp` of the last document
|
/// Commit returns the `opstamp` of the last document
|
||||||
/// that made it in the commit.
|
/// that made it in the commit.
|
||||||
///
|
///
|
||||||
pub fn commit(&mut self) -> Result<u64> {
|
pub fn commit(&mut self) -> Result<Opstamp> {
|
||||||
self.prepare_commit()?.commit()
|
self.prepare_commit()?.commit()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -617,7 +618,7 @@ impl IndexWriter {
|
|||||||
///
|
///
|
||||||
/// Like adds, the deletion itself will be visible
|
/// Like adds, the deletion itself will be visible
|
||||||
/// only after calling `commit()`.
|
/// only after calling `commit()`.
|
||||||
pub fn delete_term(&mut self, term: Term) -> u64 {
|
pub fn delete_term(&self, term: Term) -> Opstamp {
|
||||||
let opstamp = self.stamper.stamp();
|
let opstamp = self.stamper.stamp();
|
||||||
let delete_operation = DeleteOperation { opstamp, term };
|
let delete_operation = DeleteOperation { opstamp, term };
|
||||||
self.delete_queue.push(delete_operation);
|
self.delete_queue.push(delete_operation);
|
||||||
@@ -631,7 +632,7 @@ impl IndexWriter {
|
|||||||
///
|
///
|
||||||
/// This is also the opstamp of the commit that is currently
|
/// This is also the opstamp of the commit that is currently
|
||||||
/// available for searchers.
|
/// available for searchers.
|
||||||
pub fn commit_opstamp(&self) -> u64 {
|
pub fn commit_opstamp(&self) -> Opstamp {
|
||||||
self.committed_opstamp
|
self.committed_opstamp
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -645,7 +646,7 @@ impl IndexWriter {
|
|||||||
///
|
///
|
||||||
/// Currently it represents the number of documents that
|
/// Currently it represents the number of documents that
|
||||||
/// have been added since the creation of the index.
|
/// have been added since the creation of the index.
|
||||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||||
let opstamp = self.stamper.stamp();
|
let opstamp = self.stamper.stamp();
|
||||||
let add_operation = AddOperation { opstamp, document };
|
let add_operation = AddOperation { opstamp, document };
|
||||||
let send_result = self.operation_sender.send(vec![add_operation]);
|
let send_result = self.operation_sender.send(vec![add_operation]);
|
||||||
@@ -662,7 +663,7 @@ impl IndexWriter {
|
|||||||
/// The total number of stamps generated by this method is `count + 1`;
|
/// The total number of stamps generated by this method is `count + 1`;
|
||||||
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
||||||
/// is for the batch itself.
|
/// is for the batch itself.
|
||||||
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
|
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||||
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||||
let last_opstamp = end - 1;
|
let last_opstamp = end - 1;
|
||||||
let stamps = Range {
|
let stamps = Range {
|
||||||
@@ -688,7 +689,7 @@ impl IndexWriter {
|
|||||||
/// Like adds and deletes (see `IndexWriter.add_document` and
|
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||||
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||||
/// visible to readers only after calling `commit()`.
|
/// visible to readers only after calling `commit()`.
|
||||||
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
|
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||||
let count = user_operations.len() as u64;
|
let count = user_operations.len() as u64;
|
||||||
if count == 0 {
|
if count == 0 {
|
||||||
return self.stamper.stamp();
|
return self.stamper.stamp();
|
||||||
@@ -739,7 +740,7 @@ mod tests {
|
|||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
let operations = vec![
|
let operations = vec![
|
||||||
UserOperation::Add(doc!(text_field=>"a")),
|
UserOperation::Add(doc!(text_field=>"a")),
|
||||||
UserOperation::Add(doc!(text_field=>"b")),
|
UserOperation::Add(doc!(text_field=>"b")),
|
||||||
@@ -801,7 +802,7 @@ mod tests {
|
|||||||
fn test_empty_operations_group() {
|
fn test_empty_operations_group() {
|
||||||
let schema_builder = schema::Schema::builder();
|
let schema_builder = schema::Schema::builder();
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
let index_writer = index.writer(3_000_000).unwrap();
|
||||||
let operations1 = vec![];
|
let operations1 = vec![];
|
||||||
let batch_opstamp1 = index_writer.run(operations1);
|
let batch_opstamp1 = index_writer.run(operations1);
|
||||||
assert_eq!(batch_opstamp1, 0u64);
|
assert_eq!(batch_opstamp1, 0u64);
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
|
|||||||
|
|
||||||
let mut size_sorted_tuples = segments
|
let mut size_sorted_tuples = segments
|
||||||
.iter()
|
.iter()
|
||||||
.map(|x| x.num_docs())
|
.map(SegmentMeta::num_docs)
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.collect::<Vec<(usize, u32)>>();
|
.collect::<Vec<(usize, u32)>>();
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use census::{Inventory, TrackedObject};
|
use census::{Inventory, TrackedObject};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
use Opstamp;
|
||||||
use SegmentId;
|
use SegmentId;
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@@ -17,8 +18,8 @@ impl MergeOperationInventory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A `MergeOperation` has two role.
|
/// A `MergeOperation` has two roles.
|
||||||
/// It carries all of the information required to describe a merge :
|
/// It carries all of the information required to describe a merge:
|
||||||
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
||||||
/// delete queue and reflect their deletes.
|
/// delete queue and reflect their deletes.
|
||||||
/// - `segment_ids` is the list of segment to be merged.
|
/// - `segment_ids` is the list of segment to be merged.
|
||||||
@@ -35,14 +36,14 @@ pub struct MergeOperation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct InnerMergeOperation {
|
struct InnerMergeOperation {
|
||||||
target_opstamp: u64,
|
target_opstamp: Opstamp,
|
||||||
segment_ids: Vec<SegmentId>,
|
segment_ids: Vec<SegmentId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MergeOperation {
|
impl MergeOperation {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
inventory: &MergeOperationInventory,
|
inventory: &MergeOperationInventory,
|
||||||
target_opstamp: u64,
|
target_opstamp: Opstamp,
|
||||||
segment_ids: Vec<SegmentId>,
|
segment_ids: Vec<SegmentId>,
|
||||||
) -> MergeOperation {
|
) -> MergeOperation {
|
||||||
let inner_merge_operation = InnerMergeOperation {
|
let inner_merge_operation = InnerMergeOperation {
|
||||||
@@ -54,7 +55,7 @@ impl MergeOperation {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn target_opstamp(&self) -> u64 {
|
pub fn target_opstamp(&self) -> Opstamp {
|
||||||
self.inner.target_opstamp
|
self.inner.target_opstamp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use core::Segment;
|
|||||||
use core::SegmentReader;
|
use core::SegmentReader;
|
||||||
use core::SerializableSegment;
|
use core::SerializableSegment;
|
||||||
use docset::DocSet;
|
use docset::DocSet;
|
||||||
|
use fastfield::BytesFastFieldReader;
|
||||||
use fastfield::DeleteBitSet;
|
use fastfield::DeleteBitSet;
|
||||||
use fastfield::FastFieldReader;
|
use fastfield::FastFieldReader;
|
||||||
use fastfield::FastFieldSerializer;
|
use fastfield::FastFieldSerializer;
|
||||||
@@ -72,7 +73,7 @@ fn compute_min_max_val(
|
|||||||
// some deleted documents,
|
// some deleted documents,
|
||||||
// we need to recompute the max / min
|
// we need to recompute the max / min
|
||||||
(0..max_doc)
|
(0..max_doc)
|
||||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
|
||||||
.map(|doc_id| u64_reader.get(doc_id))
|
.map(|doc_id| u64_reader.get(doc_id))
|
||||||
.minmax()
|
.minmax()
|
||||||
.into_option()
|
.into_option()
|
||||||
@@ -239,7 +240,10 @@ impl IndexMerger {
|
|||||||
let mut max_value = u64::min_value();
|
let mut max_value = u64::min_value();
|
||||||
|
|
||||||
for reader in &self.readers {
|
for reader in &self.readers {
|
||||||
let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?;
|
let u64_reader: FastFieldReader<u64> = reader
|
||||||
|
.fast_fields()
|
||||||
|
.u64_lenient(field)
|
||||||
|
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||||
if let Some((seg_min_val, seg_max_val)) =
|
if let Some((seg_min_val, seg_max_val)) =
|
||||||
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
|
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
|
||||||
{
|
{
|
||||||
@@ -282,24 +286,28 @@ impl IndexMerger {
|
|||||||
fast_field_serializer: &mut FastFieldSerializer,
|
fast_field_serializer: &mut FastFieldSerializer,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut total_num_vals = 0u64;
|
let mut total_num_vals = 0u64;
|
||||||
|
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
|
||||||
|
|
||||||
// In the first pass, we compute the total number of vals.
|
// In the first pass, we compute the total number of vals.
|
||||||
//
|
//
|
||||||
// This is required by the bitpacker, as it needs to know
|
// This is required by the bitpacker, as it needs to know
|
||||||
// what should be the bit length use for bitpacking.
|
// what should be the bit length use for bitpacking.
|
||||||
for reader in &self.readers {
|
for reader in &self.readers {
|
||||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
let u64s_reader = reader.fast_fields()
|
||||||
|
.u64s_lenient(field)
|
||||||
|
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
|
||||||
|
|
||||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||||
for doc in 0u32..reader.max_doc() {
|
for doc in 0u32..reader.max_doc() {
|
||||||
if !delete_bitset.is_deleted(doc) {
|
if delete_bitset.is_alive(doc) {
|
||||||
let start = idx_reader.get(doc);
|
let num_vals = u64s_reader.num_vals(doc) as u64;
|
||||||
let end = idx_reader.get(doc + 1);
|
total_num_vals += num_vals;
|
||||||
total_num_vals += end - start;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
total_num_vals += idx_reader.max_value();
|
total_num_vals += u64s_reader.total_num_vals();
|
||||||
}
|
}
|
||||||
|
u64s_readers.push(u64s_reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We can now create our `idx` serializer, and in a second pass,
|
// We can now create our `idx` serializer, and in a second pass,
|
||||||
@@ -307,13 +315,10 @@ impl IndexMerger {
|
|||||||
let mut serialize_idx =
|
let mut serialize_idx =
|
||||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||||
let mut idx = 0;
|
let mut idx = 0;
|
||||||
for reader in &self.readers {
|
for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
|
||||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
for doc in segment_reader.doc_ids_alive() {
|
||||||
for doc in reader.doc_ids_alive() {
|
|
||||||
serialize_idx.add_val(idx)?;
|
serialize_idx.add_val(idx)?;
|
||||||
let start = idx_reader.get(doc);
|
idx += u64s_reader.num_vals(doc) as u64;
|
||||||
let end = idx_reader.get(doc + 1);
|
|
||||||
idx += end - start;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
serialize_idx.add_val(idx)?;
|
serialize_idx.add_val(idx)?;
|
||||||
@@ -344,8 +349,10 @@ impl IndexMerger {
|
|||||||
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
|
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
|
||||||
let term_ordinal_mapping: &[TermOrdinal] =
|
let term_ordinal_mapping: &[TermOrdinal] =
|
||||||
term_ordinal_mappings.get_segment(segment_ord);
|
term_ordinal_mappings.get_segment(segment_ord);
|
||||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
|
||||||
segment_reader.multi_fast_field_reader(field)?;
|
.fast_fields()
|
||||||
|
.u64s(field)
|
||||||
|
.expect("Could not find multivalued u64 fast value reader.");
|
||||||
// TODO optimize if no deletes
|
// TODO optimize if no deletes
|
||||||
for doc in segment_reader.doc_ids_alive() {
|
for doc in segment_reader.doc_ids_alive() {
|
||||||
ff_reader.get_vals(doc, &mut vals);
|
ff_reader.get_vals(doc, &mut vals);
|
||||||
@@ -377,6 +384,8 @@ impl IndexMerger {
|
|||||||
|
|
||||||
let mut vals = Vec::with_capacity(100);
|
let mut vals = Vec::with_capacity(100);
|
||||||
|
|
||||||
|
let mut ff_readers = Vec::new();
|
||||||
|
|
||||||
// Our values are bitpacked and we need to know what should be
|
// Our values are bitpacked and we need to know what should be
|
||||||
// our bitwidth and our minimum value before serializing any values.
|
// our bitwidth and our minimum value before serializing any values.
|
||||||
//
|
//
|
||||||
@@ -385,7 +394,10 @@ impl IndexMerger {
|
|||||||
// maximum value and initialize our Serializer.
|
// maximum value and initialize our Serializer.
|
||||||
for reader in &self.readers {
|
for reader in &self.readers {
|
||||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||||
reader.multi_fast_field_reader(field)?;
|
reader.fast_fields().u64s_lenient(field).expect(
|
||||||
|
"Failed to find multivalued fast field reader. This is a bug in \
|
||||||
|
tantivy. Please report.",
|
||||||
|
);
|
||||||
for doc in reader.doc_ids_alive() {
|
for doc in reader.doc_ids_alive() {
|
||||||
ff_reader.get_vals(doc, &mut vals);
|
ff_reader.get_vals(doc, &mut vals);
|
||||||
for &val in &vals {
|
for &val in &vals {
|
||||||
@@ -393,6 +405,7 @@ impl IndexMerger {
|
|||||||
max_value = cmp::max(val, max_value);
|
max_value = cmp::max(val, max_value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ff_readers.push(ff_reader);
|
||||||
// TODO optimize when no deletes
|
// TODO optimize when no deletes
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -405,9 +418,7 @@ impl IndexMerger {
|
|||||||
{
|
{
|
||||||
let mut serialize_vals = fast_field_serializer
|
let mut serialize_vals = fast_field_serializer
|
||||||
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
|
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
|
||||||
for reader in &self.readers {
|
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
|
||||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
|
||||||
reader.multi_fast_field_reader(field)?;
|
|
||||||
// TODO optimize if no deletes
|
// TODO optimize if no deletes
|
||||||
for doc in reader.doc_ids_alive() {
|
for doc in reader.doc_ids_alive() {
|
||||||
ff_reader.get_vals(doc, &mut vals);
|
ff_reader.get_vals(doc, &mut vals);
|
||||||
@@ -426,19 +437,53 @@ impl IndexMerger {
|
|||||||
field: Field,
|
field: Field,
|
||||||
fast_field_serializer: &mut FastFieldSerializer,
|
fast_field_serializer: &mut FastFieldSerializer,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.write_fast_field_idx(field, fast_field_serializer)?;
|
let mut total_num_vals = 0u64;
|
||||||
|
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
|
||||||
|
|
||||||
|
for reader in &self.readers {
|
||||||
|
let bytes_reader = reader.fast_fields().bytes(field).expect(
|
||||||
|
"Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
|
||||||
|
);
|
||||||
|
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||||
|
for doc in 0u32..reader.max_doc() {
|
||||||
|
if delete_bitset.is_alive(doc) {
|
||||||
|
let num_vals = bytes_reader.get_bytes(doc).len() as u64;
|
||||||
|
total_num_vals += num_vals;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
total_num_vals += bytes_reader.total_num_bytes() as u64;
|
||||||
|
}
|
||||||
|
bytes_readers.push(bytes_reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// We can now create our `idx` serializer, and in a second pass,
|
||||||
|
// can effectively push the different indexes.
|
||||||
|
let mut serialize_idx =
|
||||||
|
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||||
|
let mut idx = 0;
|
||||||
|
for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
|
||||||
|
for doc in segment_reader.doc_ids_alive() {
|
||||||
|
serialize_idx.add_val(idx)?;
|
||||||
|
idx += bytes_reader.get_bytes(doc).len() as u64;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
serialize_idx.add_val(idx)?;
|
||||||
|
serialize_idx.close_field()?;
|
||||||
|
}
|
||||||
|
|
||||||
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
|
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
|
||||||
for reader in &self.readers {
|
for segment_reader in &self.readers {
|
||||||
let bytes_reader = reader.bytes_fast_field_reader(field)?;
|
let bytes_reader = segment_reader.fast_fields().bytes(field)
|
||||||
|
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
|
||||||
// TODO: optimize if no deletes
|
// TODO: optimize if no deletes
|
||||||
for doc in reader.doc_ids_alive() {
|
for doc in segment_reader.doc_ids_alive() {
|
||||||
let val = bytes_reader.get_val(doc);
|
let val = bytes_reader.get_bytes(doc);
|
||||||
serialize_vals.write_all(val)?;
|
serialize_vals.write_all(val)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
serialize_vals.flush()?;
|
serialize_vals.flush()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -979,14 +1024,16 @@ mod tests {
|
|||||||
|
|
||||||
let score_field_reader = searcher
|
let score_field_reader = searcher
|
||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_fields()
|
||||||
|
.u64(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 4000);
|
assert_eq!(score_field_reader.min_value(), 4000);
|
||||||
assert_eq!(score_field_reader.max_value(), 7000);
|
assert_eq!(score_field_reader.max_value(), 7000);
|
||||||
|
|
||||||
let score_field_reader = searcher
|
let score_field_reader = searcher
|
||||||
.segment_reader(1)
|
.segment_reader(1)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_fields()
|
||||||
|
.u64(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 1);
|
assert_eq!(score_field_reader.min_value(), 1);
|
||||||
assert_eq!(score_field_reader.max_value(), 3);
|
assert_eq!(score_field_reader.max_value(), 3);
|
||||||
@@ -1037,7 +1084,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
let score_field_reader = searcher
|
let score_field_reader = searcher
|
||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_fields()
|
||||||
|
.u64(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 3);
|
assert_eq!(score_field_reader.min_value(), 3);
|
||||||
assert_eq!(score_field_reader.max_value(), 7000);
|
assert_eq!(score_field_reader.max_value(), 7000);
|
||||||
@@ -1083,7 +1131,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
let score_field_reader = searcher
|
let score_field_reader = searcher
|
||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_fields()
|
||||||
|
.u64(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 3);
|
assert_eq!(score_field_reader.min_value(), 3);
|
||||||
assert_eq!(score_field_reader.max_value(), 7000);
|
assert_eq!(score_field_reader.max_value(), 7000);
|
||||||
@@ -1135,7 +1184,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
let score_field_reader = searcher
|
let score_field_reader = searcher
|
||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_fields()
|
||||||
|
.u64(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 6000);
|
assert_eq!(score_field_reader.min_value(), 6000);
|
||||||
assert_eq!(score_field_reader.max_value(), 7000);
|
assert_eq!(score_field_reader.max_value(), 7000);
|
||||||
@@ -1381,7 +1431,7 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
let segment = searcher.segment_reader(0u32);
|
let segment = searcher.segment_reader(0u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||||
|
|
||||||
ff_reader.get_vals(0, &mut vals);
|
ff_reader.get_vals(0, &mut vals);
|
||||||
assert_eq!(&vals, &[1, 2]);
|
assert_eq!(&vals, &[1, 2]);
|
||||||
@@ -1416,7 +1466,7 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
let segment = searcher.segment_reader(1u32);
|
let segment = searcher.segment_reader(1u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||||
ff_reader.get_vals(0, &mut vals);
|
ff_reader.get_vals(0, &mut vals);
|
||||||
assert_eq!(&vals, &[28, 27]);
|
assert_eq!(&vals, &[28, 27]);
|
||||||
|
|
||||||
@@ -1426,7 +1476,7 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
let segment = searcher.segment_reader(2u32);
|
let segment = searcher.segment_reader(2u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||||
ff_reader.get_vals(0, &mut vals);
|
ff_reader.get_vals(0, &mut vals);
|
||||||
assert_eq!(&vals, &[20]);
|
assert_eq!(&vals, &[20]);
|
||||||
}
|
}
|
||||||
@@ -1459,7 +1509,7 @@ mod tests {
|
|||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
);
|
);
|
||||||
let segment = searcher.segment_reader(0u32);
|
let segment = searcher.segment_reader(0u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||||
|
|
||||||
ff_reader.get_vals(0, &mut vals);
|
ff_reader.get_vals(0, &mut vals);
|
||||||
assert_eq!(&vals, &[1, 2]);
|
assert_eq!(&vals, &[1, 2]);
|
||||||
|
|||||||
@@ -1,17 +1,18 @@
|
|||||||
use schema::Document;
|
use schema::Document;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
|
use Opstamp;
|
||||||
|
|
||||||
/// Timestamped Delete operation.
|
/// Timestamped Delete operation.
|
||||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||||
pub struct DeleteOperation {
|
pub struct DeleteOperation {
|
||||||
pub opstamp: u64,
|
pub opstamp: Opstamp,
|
||||||
pub term: Term,
|
pub term: Term,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Timestamped Add operation.
|
/// Timestamped Add operation.
|
||||||
#[derive(Eq, PartialEq, Debug)]
|
#[derive(Eq, PartialEq, Debug)]
|
||||||
pub struct AddOperation {
|
pub struct AddOperation {
|
||||||
pub opstamp: u64,
|
pub opstamp: Opstamp,
|
||||||
pub document: Document,
|
pub document: Document,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +1,16 @@
|
|||||||
use super::IndexWriter;
|
use super::IndexWriter;
|
||||||
|
use Opstamp;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
/// A prepared commit
|
/// A prepared commit
|
||||||
pub struct PreparedCommit<'a> {
|
pub struct PreparedCommit<'a> {
|
||||||
index_writer: &'a mut IndexWriter,
|
index_writer: &'a mut IndexWriter,
|
||||||
payload: Option<String>,
|
payload: Option<String>,
|
||||||
opstamp: u64,
|
opstamp: Opstamp,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> PreparedCommit<'a> {
|
impl<'a> PreparedCommit<'a> {
|
||||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
|
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit {
|
||||||
PreparedCommit {
|
PreparedCommit {
|
||||||
index_writer,
|
index_writer,
|
||||||
payload: None,
|
payload: None,
|
||||||
@@ -17,7 +18,7 @@ impl<'a> PreparedCommit<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn opstamp(&self) -> u64 {
|
pub fn opstamp(&self) -> Opstamp {
|
||||||
self.opstamp
|
self.opstamp
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,11 +26,11 @@ impl<'a> PreparedCommit<'a> {
|
|||||||
self.payload = Some(payload.to_string())
|
self.payload = Some(payload.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn abort(self) -> Result<()> {
|
pub fn abort(self) -> Result<Opstamp> {
|
||||||
self.index_writer.rollback()
|
self.index_writer.rollback()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn commit(self) -> Result<u64> {
|
pub fn commit(self) -> Result<Opstamp> {
|
||||||
info!("committing {}", self.opstamp);
|
info!("committing {}", self.opstamp);
|
||||||
self.index_writer
|
self.index_writer
|
||||||
.segment_updater()
|
.segment_updater()
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ impl SegmentRegister {
|
|||||||
.values()
|
.values()
|
||||||
.map(|segment_entry| segment_entry.meta().clone())
|
.map(|segment_entry| segment_entry.meta().clone())
|
||||||
.collect();
|
.collect();
|
||||||
segment_ids.sort_by_key(|meta| meta.id());
|
segment_ids.sort_by_key(SegmentMeta::id);
|
||||||
segment_ids
|
segment_ids
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ use std::sync::Arc;
|
|||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::thread::JoinHandle;
|
use std::thread::JoinHandle;
|
||||||
|
use Opstamp;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
/// Save the index meta file.
|
/// Save the index meta file.
|
||||||
@@ -224,7 +225,7 @@ impl SegmentUpdater {
|
|||||||
///
|
///
|
||||||
/// Tne method returns copies of the segment entries,
|
/// Tne method returns copies of the segment entries,
|
||||||
/// updated with the delete information.
|
/// updated with the delete information.
|
||||||
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
|
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
|
||||||
let mut segment_entries = self.0.segment_manager.segment_entries();
|
let mut segment_entries = self.0.segment_manager.segment_entries();
|
||||||
for segment_entry in &mut segment_entries {
|
for segment_entry in &mut segment_entries {
|
||||||
let segment = self.0.index.segment(segment_entry.meta().clone());
|
let segment = self.0.index.segment(segment_entry.meta().clone());
|
||||||
@@ -233,7 +234,7 @@ impl SegmentUpdater {
|
|||||||
Ok(segment_entries)
|
Ok(segment_entries)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
|
pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
|
||||||
if self.is_alive() {
|
if self.is_alive() {
|
||||||
let index = &self.0.index;
|
let index = &self.0.index;
|
||||||
let directory = index.directory();
|
let directory = index.directory();
|
||||||
@@ -280,7 +281,7 @@ impl SegmentUpdater {
|
|||||||
.garbage_collect(|| self.0.segment_manager.list_files());
|
.garbage_collect(|| self.0.segment_manager.list_files());
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
|
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
|
||||||
self.run_async(move |segment_updater| {
|
self.run_async(move |segment_updater| {
|
||||||
if segment_updater.is_alive() {
|
if segment_updater.is_alive() {
|
||||||
let segment_entries = segment_updater
|
let segment_entries = segment_updater
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use fastfield::FastFieldsWriter;
|
|||||||
use fieldnorm::FieldNormsWriter;
|
use fieldnorm::FieldNormsWriter;
|
||||||
use indexer::segment_serializer::SegmentSerializer;
|
use indexer::segment_serializer::SegmentSerializer;
|
||||||
use postings::MultiFieldPostingsWriter;
|
use postings::MultiFieldPostingsWriter;
|
||||||
|
use schema::FieldEntry;
|
||||||
use schema::FieldType;
|
use schema::FieldType;
|
||||||
use schema::Schema;
|
use schema::Schema;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
@@ -15,6 +16,7 @@ use tokenizer::BoxedTokenizer;
|
|||||||
use tokenizer::FacetTokenizer;
|
use tokenizer::FacetTokenizer;
|
||||||
use tokenizer::{TokenStream, Tokenizer};
|
use tokenizer::{TokenStream, Tokenizer};
|
||||||
use DocId;
|
use DocId;
|
||||||
|
use Opstamp;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||||
@@ -28,7 +30,7 @@ pub struct SegmentWriter {
|
|||||||
segment_serializer: SegmentSerializer,
|
segment_serializer: SegmentSerializer,
|
||||||
fast_field_writers: FastFieldsWriter,
|
fast_field_writers: FastFieldsWriter,
|
||||||
fieldnorms_writer: FieldNormsWriter,
|
fieldnorms_writer: FieldNormsWriter,
|
||||||
doc_opstamps: Vec<u64>,
|
doc_opstamps: Vec<Opstamp>,
|
||||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,7 +55,7 @@ impl SegmentWriter {
|
|||||||
schema
|
schema
|
||||||
.fields()
|
.fields()
|
||||||
.iter()
|
.iter()
|
||||||
.map(|field_entry| field_entry.field_type())
|
.map(FieldEntry::field_type)
|
||||||
.map(|field_type| match *field_type {
|
.map(|field_type| match *field_type {
|
||||||
FieldType::Str(ref text_options) => text_options
|
FieldType::Str(ref text_options) => text_options
|
||||||
.get_indexing_options()
|
.get_indexing_options()
|
||||||
|
|||||||
@@ -1,70 +1,27 @@
|
|||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::atomic::Ordering;
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use Opstamp;
|
||||||
|
|
||||||
// AtomicU64 have not landed in stable.
|
/// Stamper provides Opstamps, which is just an auto-increment id to label
|
||||||
// For the moment let's just use AtomicUsize on
|
/// an operation.
|
||||||
// x86/64 bit platform, and a mutex on other platform.
|
///
|
||||||
#[cfg(target_arch = "x86_64")]
|
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
|
||||||
mod archicture_impl {
|
|
||||||
|
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
pub struct AtomicU64Ersatz(AtomicUsize);
|
|
||||||
|
|
||||||
impl AtomicU64Ersatz {
|
|
||||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
|
||||||
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
|
||||||
self.0.fetch_add(val as usize, order) as u64
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(not(target_arch = "x86_64"))]
|
|
||||||
mod archicture_impl {
|
|
||||||
|
|
||||||
use std::sync::atomic::Ordering;
|
|
||||||
/// Under other architecture, we rely on a mutex.
|
|
||||||
use std::sync::RwLock;
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
pub struct AtomicU64Ersatz(RwLock<u64>);
|
|
||||||
|
|
||||||
impl AtomicU64Ersatz {
|
|
||||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
|
||||||
AtomicU64Ersatz(RwLock::new(first_opstamp))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
|
|
||||||
let mut lock = self.0.write().unwrap();
|
|
||||||
let previous_val = *lock;
|
|
||||||
*lock = previous_val + incr;
|
|
||||||
previous_val
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
use self::archicture_impl::AtomicU64Ersatz;
|
|
||||||
|
|
||||||
#[derive(Clone, Default)]
|
#[derive(Clone, Default)]
|
||||||
pub struct Stamper(Arc<AtomicU64Ersatz>);
|
pub struct Stamper(Arc<AtomicU64>);
|
||||||
|
|
||||||
impl Stamper {
|
impl Stamper {
|
||||||
pub fn new(first_opstamp: u64) -> Stamper {
|
pub fn new(first_opstamp: Opstamp) -> Stamper {
|
||||||
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
|
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn stamp(&self) -> u64 {
|
pub fn stamp(&self) -> Opstamp {
|
||||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Given a desired count `n`, `stamps` returns an iterator that
|
/// Given a desired count `n`, `stamps` returns an iterator that
|
||||||
/// will supply `n` number of u64 stamps.
|
/// will supply `n` number of u64 stamps.
|
||||||
pub fn stamps(&self, n: u64) -> Range<u64> {
|
pub fn stamps(&self, n: u64) -> Range<Opstamp> {
|
||||||
let start = self.0.fetch_add(n, Ordering::SeqCst);
|
let start = self.0.fetch_add(n, Ordering::SeqCst);
|
||||||
Range {
|
Range {
|
||||||
start,
|
start,
|
||||||
@@ -92,4 +49,5 @@ mod test {
|
|||||||
assert_eq!(stamper.stamps(3u64), (12..15));
|
assert_eq!(stamper.stamps(3u64), (12..15));
|
||||||
assert_eq!(stamper.stamp(), 15u64);
|
assert_eq!(stamper.stamp(), 15u64);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
36
src/lib.rs
36
src/lib.rs
@@ -226,7 +226,7 @@ mod docset;
|
|||||||
pub use self::docset::{DocSet, SkipResult};
|
pub use self::docset::{DocSet, SkipResult};
|
||||||
|
|
||||||
pub use core::SegmentComponent;
|
pub use core::SegmentComponent;
|
||||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta, IndexMeta};
|
||||||
pub use core::{InvertedIndexReader, SegmentReader};
|
pub use core::{InvertedIndexReader, SegmentReader};
|
||||||
pub use directory::Directory;
|
pub use directory::Directory;
|
||||||
pub use indexer::IndexWriter;
|
pub use indexer::IndexWriter;
|
||||||
@@ -254,6 +254,16 @@ pub mod merge_policy {
|
|||||||
/// as they are added in the segment.
|
/// as they are added in the segment.
|
||||||
pub type DocId = u32;
|
pub type DocId = u32;
|
||||||
|
|
||||||
|
/// A u64 assigned to every operation incrementally
|
||||||
|
///
|
||||||
|
/// All operations modifying the index receives an monotonic Opstamp.
|
||||||
|
/// The resulting state of the index is consistent with the opstamp ordering.
|
||||||
|
///
|
||||||
|
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
|
||||||
|
/// with an opstamp `<= 32_423`. A delete operation with opstamp n will no affect a document added
|
||||||
|
/// with opstamp `n+1`.
|
||||||
|
pub type Opstamp = u64;
|
||||||
|
|
||||||
/// A f32 that represents the relevance of the document to the query
|
/// A f32 that represents the relevance of the document to the query
|
||||||
///
|
///
|
||||||
/// This is modelled internally as a `f32`. The
|
/// This is modelled internally as a `f32`. The
|
||||||
@@ -876,28 +886,28 @@ mod tests {
|
|||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||||
{
|
{
|
||||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
|
||||||
assert!(fast_field_reader_res.is_err());
|
assert!(fast_field_reader_opt.is_none());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
|
let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
|
||||||
assert!(fast_field_reader_res.is_err());
|
assert!(fast_field_reader_opt.is_none());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
|
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
|
||||||
assert!(fast_field_reader_res.is_err());
|
assert!(fast_field_reader_opt.is_none());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||||
assert!(fast_field_reader_res.is_ok());
|
assert!(fast_field_reader_opt.is_some());
|
||||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||||
assert!(fast_field_reader_res.is_ok());
|
assert!(fast_field_reader_opt.is_some());
|
||||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -214,6 +214,13 @@ pub trait PostingsWriter {
|
|||||||
if token.text.len() <= MAX_TOKEN_LEN {
|
if token.text.len() <= MAX_TOKEN_LEN {
|
||||||
term.set_text(token.text.as_str());
|
term.set_text(token.text.as_str());
|
||||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||||
|
} else {
|
||||||
|
info!(
|
||||||
|
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
|
||||||
|
MAX_TOKEN_LEN in the documentation for more information.",
|
||||||
|
token.text.len(),
|
||||||
|
MAX_TOKEN_LEN
|
||||||
|
);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
token_stream.process(&mut sink)
|
token_stream.process(&mut sink)
|
||||||
|
|||||||
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
|
|||||||
let positions_idx = self
|
let positions_idx = self
|
||||||
.positions_serializer_opt
|
.positions_serializer_opt
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|positions_serializer| positions_serializer.positions_idx())
|
.map(PositionSerializer::positions_idx)
|
||||||
.unwrap_or(0u64);
|
.unwrap_or(0u64);
|
||||||
TermInfo {
|
TermInfo {
|
||||||
doc_freq: 0,
|
doc_freq: 0,
|
||||||
|
|||||||
@@ -205,4 +205,332 @@ mod tests {
|
|||||||
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
|
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
DoC 0
|
||||||
|
{
|
||||||
|
"_index": "test",
|
||||||
|
"_type": "_doc",
|
||||||
|
"_id": "0",
|
||||||
|
"matched": true,
|
||||||
|
"explanation": {
|
||||||
|
"value": 6.2610235,
|
||||||
|
"description": "max of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 6.1969156,
|
||||||
|
"description": "sum of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 6.1969156,
|
||||||
|
"description": "weight(text:оксана in 561) [PerFieldSimilarity], result of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 6.1969156,
|
||||||
|
"description": "score(freq=1.0), product of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 2.2,
|
||||||
|
"description": "boost",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 5.65998,
|
||||||
|
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 3,
|
||||||
|
"description": "n, number of documents containing term",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1004,
|
||||||
|
"description": "N, total number of documents with field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"value": 0.49766606,
|
||||||
|
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 1.0,
|
||||||
|
"description": "freq, occurrences of term within document",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1.2,
|
||||||
|
"description": "k1, term saturation parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 0.75,
|
||||||
|
"description": "b, length normalization parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 19.0,
|
||||||
|
"description": "dl, length of field",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 24.105577,
|
||||||
|
"description": "avgdl, average length of field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"value": 6.2610235,
|
||||||
|
"description": "sum of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 6.2610235,
|
||||||
|
"description": "weight(title:оксана in 561) [PerFieldSimilarity], result of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 6.2610235,
|
||||||
|
"description": "score(freq=1.0), product of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 2.2,
|
||||||
|
"description": "boost",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 5.4086657,
|
||||||
|
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 4,
|
||||||
|
"description": "n, number of documents containing term",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1004,
|
||||||
|
"description": "N, total number of documents with field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"value": 0.52617776,
|
||||||
|
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 1.0,
|
||||||
|
"description": "freq, occurrences of term within document",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1.2,
|
||||||
|
"description": "k1, term saturation parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 0.75,
|
||||||
|
"description": "b, length normalization parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 4.0,
|
||||||
|
"description": "dl, length of field",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 5.99502,
|
||||||
|
"description": "avgdl, average length of field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
doc 2
|
||||||
|
{
|
||||||
|
"_index": "test",
|
||||||
|
"_type": "_doc",
|
||||||
|
"_id": "2",
|
||||||
|
"matched": true,
|
||||||
|
"explanation": {
|
||||||
|
"value": 11.911896,
|
||||||
|
"description": "max of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 11.911896,
|
||||||
|
"description": "sum of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 5.4068284,
|
||||||
|
"description": "weight(title:оксана in 0) [PerFieldSimilarity], result of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 5.4068284,
|
||||||
|
"description": "score(freq=1.0), product of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 2.2,
|
||||||
|
"description": "boost",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 5.4086657,
|
||||||
|
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 4,
|
||||||
|
"description": "n, number of documents containing term",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1004,
|
||||||
|
"description": "N, total number of documents with field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"value": 0.45439103,
|
||||||
|
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 1.0,
|
||||||
|
"description": "freq, occurrences of term within document",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1.2,
|
||||||
|
"description": "k1, term saturation parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 0.75,
|
||||||
|
"description": "b, length normalization parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 6.0,
|
||||||
|
"description": "dl, length of field",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 5.99502,
|
||||||
|
"description": "avgdl, average length of field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"value": 6.505067,
|
||||||
|
"description": "weight(title:лифенко in 0) [PerFieldSimilarity], result of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 6.505067,
|
||||||
|
"description": "score(freq=1.0), product of:",
|
||||||
|
"details": [{
|
||||||
|
"value": 2.2,
|
||||||
|
"description": "boost",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 6.5072775,
|
||||||
|
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 1,
|
||||||
|
"description": "n, number of documents containing term",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1004,
|
||||||
|
"description": "N, total number of documents with field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}, {
|
||||||
|
"value": 0.45439103,
|
||||||
|
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
|
||||||
|
"details": [{
|
||||||
|
"value": 1.0,
|
||||||
|
"description": "freq, occurrences of term within document",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 1.2,
|
||||||
|
"description": "k1, term saturation parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 0.75,
|
||||||
|
"description": "b, length normalization parameter",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 6.0,
|
||||||
|
"description": "dl, length of field",
|
||||||
|
"details": []
|
||||||
|
}, {
|
||||||
|
"value": 5.99502,
|
||||||
|
"description": "avgdl, average length of field",
|
||||||
|
"details": []
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
// motivated by #554
|
||||||
|
#[test]
|
||||||
|
fn test_bm25_several_fields() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let title = schema_builder.add_text_field("title", TEXT);
|
||||||
|
let text = schema_builder.add_text_field("text", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
// tf = 1 0
|
||||||
|
title => "Законы притяжения Оксана Кулакова",
|
||||||
|
// tf = 1 0
|
||||||
|
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
|
||||||
|
));
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
// tf = 1 0
|
||||||
|
title => "Любимые русские пироги (Оксана Путан)",
|
||||||
|
// tf = 2 0
|
||||||
|
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
|
||||||
|
));
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
// tf = 1 1
|
||||||
|
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
|
||||||
|
// tf = 0 0
|
||||||
|
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
|
||||||
|
));
|
||||||
|
for _ in 0..1_000 {
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
title => "a b d e f g",
|
||||||
|
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
let query_parser = QueryParser::for_index(&index, vec![title, text]);
|
||||||
|
let query = query_parser
|
||||||
|
.parse_query("Оксана Лифенко")
|
||||||
|
.unwrap();
|
||||||
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
|
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
|
// let mut scores = vec![];
|
||||||
|
// while
|
||||||
|
println!("=====|");
|
||||||
|
scorer.advance();
|
||||||
|
dbg!("scorer.score()");
|
||||||
|
assert!(false);
|
||||||
|
|
||||||
|
// scores.push(scorer.score());
|
||||||
|
// assert_eq!(scores, &[0.8017307, 0.72233325, 1.0300813]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// motivated by #554
|
||||||
|
#[test]
|
||||||
|
fn test_bm25_several_fields_bbb() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let text = schema_builder.add_text_field("text", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
|
||||||
|
));
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
|
||||||
|
));
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
|
||||||
|
));
|
||||||
|
for _ in 0..100 {
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
let query_parser = QueryParser::for_index(&index, vec![text]);
|
||||||
|
let query = query_parser
|
||||||
|
.parse_query("Оксана Лифенко")
|
||||||
|
.unwrap();
|
||||||
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
|
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||||
|
let mut scores = vec![];
|
||||||
|
while scorer.advance() {
|
||||||
|
scores.push(scorer.score());
|
||||||
|
}
|
||||||
|
assert_eq!(scores, &[0.8017307, 0.72233325, 1.0300813]);
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ use error::TantivyError;
|
|||||||
use query::bm25::BM25Weight;
|
use query::bm25::BM25Weight;
|
||||||
use query::Query;
|
use query::Query;
|
||||||
use query::Weight;
|
use query::Weight;
|
||||||
|
use schema::IndexRecordOption;
|
||||||
use schema::{Field, Term};
|
use schema::{Field, Term};
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use Result;
|
use Result;
|
||||||
@@ -83,7 +84,7 @@ impl Query for PhraseQuery {
|
|||||||
let has_positions = field_entry
|
let has_positions = field_entry
|
||||||
.field_type()
|
.field_type()
|
||||||
.get_index_record_option()
|
.get_index_record_option()
|
||||||
.map(|index_record_option| index_record_option.has_positions())
|
.map(IndexRecordOption::has_positions)
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
if !has_positions {
|
if !has_positions {
|
||||||
let field_name = field_entry.name();
|
let field_name = field_entry.name();
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))]
|
#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))]
|
||||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))]
|
#![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))]
|
||||||
|
|
||||||
|
use super::query_grammar;
|
||||||
use super::user_input_ast::*;
|
use super::user_input_ast::*;
|
||||||
use combine::char::*;
|
use combine::char::*;
|
||||||
use combine::error::StreamError;
|
use combine::error::StreamError;
|
||||||
@@ -22,7 +23,7 @@ parser! {
|
|||||||
parser! {
|
parser! {
|
||||||
fn word[I]()(I) -> String
|
fn word[I]()(I) -> String
|
||||||
where [I: Stream<Item = char>] {
|
where [I: Stream<Item = char>] {
|
||||||
many1(satisfy(|c: char| c.is_alphanumeric()))
|
many1(satisfy(char::is_alphanumeric))
|
||||||
.and_then(|s: String| {
|
.and_then(|s: String| {
|
||||||
match s.as_str() {
|
match s.as_str() {
|
||||||
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
|
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
|
||||||
@@ -62,7 +63,7 @@ parser! {
|
|||||||
fn negative_number[I]()(I) -> String
|
fn negative_number[I]()(I) -> String
|
||||||
where [I: Stream<Item = char>]
|
where [I: Stream<Item = char>]
|
||||||
{
|
{
|
||||||
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
(char('-'), many1(satisfy(char::is_numeric)))
|
||||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -184,7 +185,7 @@ parser! {
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
.map(|el| el.into_dnf())
|
.map(query_grammar::Element::into_dnf)
|
||||||
.map(|fnd| {
|
.map(|fnd| {
|
||||||
if fnd.len() == 1 {
|
if fnd.len() == 1 {
|
||||||
UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
|
UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ fn refill<TScorer: Scorer, TScoreCombiner: ScoreCombiner>(
|
|||||||
|
|
||||||
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
|
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombiner> {
|
||||||
fn refill(&mut self) -> bool {
|
fn refill(&mut self) -> bool {
|
||||||
if let Some(min_doc) = self.docsets.iter_mut().map(|docset| docset.doc()).min() {
|
if let Some(min_doc) = self.docsets.iter().map(DocSet::doc).min() {
|
||||||
self.offset = min_doc;
|
self.offset = min_doc;
|
||||||
self.cursor = 0;
|
self.cursor = 0;
|
||||||
refill(
|
refill(
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ impl Document {
|
|||||||
self.field_values
|
self.field_values
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|field_value| field_value.field() == field)
|
.filter(|field_value| field_value.field() == field)
|
||||||
.map(|field_value| field_value.value())
|
.map(FieldValue::value)
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,7 +137,7 @@ impl Document {
|
|||||||
self.field_values
|
self.field_values
|
||||||
.iter()
|
.iter()
|
||||||
.find(|field_value| field_value.field() == field)
|
.find(|field_value| field_value.field() == field)
|
||||||
.map(|field_value| field_value.value())
|
.map(FieldValue::value)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ use schema::{IntOptions, TextOptions};
|
|||||||
|
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use schema::IndexRecordOption;
|
use schema::IndexRecordOption;
|
||||||
|
use schema::TextFieldIndexing;
|
||||||
use schema::Value;
|
use schema::Value;
|
||||||
use serde_json::Value as JsonValue;
|
use serde_json::Value as JsonValue;
|
||||||
|
|
||||||
@@ -94,7 +95,7 @@ impl FieldType {
|
|||||||
match *self {
|
match *self {
|
||||||
FieldType::Str(ref text_options) => text_options
|
FieldType::Str(ref text_options) => text_options
|
||||||
.get_indexing_options()
|
.get_indexing_options()
|
||||||
.map(|indexing_options| indexing_options.index_option()),
|
.map(TextFieldIndexing::index_option),
|
||||||
FieldType::U64(ref int_options)
|
FieldType::U64(ref int_options)
|
||||||
| FieldType::I64(ref int_options)
|
| FieldType::I64(ref int_options)
|
||||||
| FieldType::Date(ref int_options) => {
|
| FieldType::Date(ref int_options) => {
|
||||||
|
|||||||
@@ -130,7 +130,16 @@ impl SchemaBuilder {
|
|||||||
self.add_field(field_entry)
|
self.add_field(field_entry)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds a fast bytes field to the schema
|
/// Adds a fast bytes field to the schema.
|
||||||
|
///
|
||||||
|
/// Bytes field are not searchable and are only used
|
||||||
|
/// as fast field, to associate any kind of payload
|
||||||
|
/// to a document.
|
||||||
|
///
|
||||||
|
/// For instance, learning-to-rank often requires to access
|
||||||
|
/// some document features at scoring time.
|
||||||
|
/// These can be serializing and stored as a bytes field to
|
||||||
|
/// get access rapidly when scoring each document.
|
||||||
pub fn add_bytes_field(&mut self, field_name: &str) -> Field {
|
pub fn add_bytes_field(&mut self, field_name: &str) -> Field {
|
||||||
let field_entry = FieldEntry::new_bytes(field_name.to_string());
|
let field_entry = FieldEntry::new_bytes(field_name.to_string());
|
||||||
self.add_field(field_entry)
|
self.add_field(field_entry)
|
||||||
@@ -224,7 +233,7 @@ impl Schema {
|
|||||||
let field_name = self.get_field_name(field);
|
let field_name = self.get_field_name(field);
|
||||||
let values: Vec<Value> = field_values
|
let values: Vec<Value> = field_values
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|field_val| field_val.value())
|
.map(FieldValue::value)
|
||||||
.cloned()
|
.cloned()
|
||||||
.collect();
|
.collect();
|
||||||
field_map.insert(field_name.to_string(), values);
|
field_map.insert(field_name.to_string(), values);
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use htmlescape::encode_minimal;
|
use htmlescape::encode_minimal;
|
||||||
use query::Query;
|
use query::Query;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
|
use schema::Value;
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
@@ -303,7 +304,7 @@ impl SnippetGenerator {
|
|||||||
let text: String = doc
|
let text: String = doc
|
||||||
.get_all(self.field)
|
.get_all(self.field)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.flat_map(|val| val.text())
|
.flat_map(Value::text)
|
||||||
.collect::<Vec<&str>>()
|
.collect::<Vec<&str>>()
|
||||||
.join(" ");
|
.join(" ");
|
||||||
self.snippet(&text)
|
self.snippet(&text)
|
||||||
|
|||||||
@@ -227,7 +227,7 @@ pub struct PerFieldSpaceUsage {
|
|||||||
|
|
||||||
impl PerFieldSpaceUsage {
|
impl PerFieldSpaceUsage {
|
||||||
pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
|
pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
|
||||||
let total = fields.values().map(|x| x.total()).sum();
|
let total = fields.values().map(FieldUsage::total).sum();
|
||||||
PerFieldSpaceUsage { fields, total }
|
PerFieldSpaceUsage { fields, total }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
4064
src/tokenizer/ascii_folding_filter.rs
Normal file
4064
src/tokenizer/ascii_folding_filter.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -44,18 +44,17 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn advance(&mut self) -> bool {
|
fn advance(&mut self) -> bool {
|
||||||
if self.tail.advance() {
|
if !self.tail.advance() {
|
||||||
if self.token_mut().text.is_ascii() {
|
return false;
|
||||||
// fast track for ascii.
|
|
||||||
self.token_mut().text.make_ascii_lowercase();
|
|
||||||
} else {
|
|
||||||
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
|
||||||
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
|
||||||
}
|
|
||||||
true
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
}
|
||||||
|
if self.token_mut().text.is_ascii() {
|
||||||
|
// fast track for ascii.
|
||||||
|
self.token_mut().text.make_ascii_lowercase();
|
||||||
|
} else {
|
||||||
|
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||||
|
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
|
||||||
|
}
|
||||||
|
true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -131,6 +131,7 @@
|
|||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
mod alphanum_only;
|
mod alphanum_only;
|
||||||
|
mod ascii_folding_filter;
|
||||||
mod facet_tokenizer;
|
mod facet_tokenizer;
|
||||||
mod lower_caser;
|
mod lower_caser;
|
||||||
mod ngram_tokenizer;
|
mod ngram_tokenizer;
|
||||||
@@ -144,6 +145,7 @@ mod tokenizer;
|
|||||||
mod tokenizer_manager;
|
mod tokenizer_manager;
|
||||||
|
|
||||||
pub use self::alphanum_only::AlphaNumOnlyFilter;
|
pub use self::alphanum_only::AlphaNumOnlyFilter;
|
||||||
|
pub use self::ascii_folding_filter::AsciiFoldingFilter;
|
||||||
pub use self::facet_tokenizer::FacetTokenizer;
|
pub use self::facet_tokenizer::FacetTokenizer;
|
||||||
pub use self::lower_caser::LowerCaser;
|
pub use self::lower_caser::LowerCaser;
|
||||||
pub use self::ngram_tokenizer::NgramTokenizer;
|
pub use self::ngram_tokenizer::NgramTokenizer;
|
||||||
|
|||||||
@@ -29,12 +29,9 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
|
|||||||
|
|
||||||
impl TokenStream for RawTokenStream {
|
impl TokenStream for RawTokenStream {
|
||||||
fn advance(&mut self) -> bool {
|
fn advance(&mut self) -> bool {
|
||||||
if self.has_token {
|
let result = self.has_token;
|
||||||
self.has_token = false;
|
self.has_token = false;
|
||||||
true
|
result
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn token(&self) -> &Token {
|
fn token(&self) -> &Token {
|
||||||
|
|||||||
@@ -91,7 +91,6 @@ where
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,23 +38,16 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
|
|||||||
fn advance(&mut self) -> bool {
|
fn advance(&mut self) -> bool {
|
||||||
self.token.text.clear();
|
self.token.text.clear();
|
||||||
self.token.position = self.token.position.wrapping_add(1);
|
self.token.position = self.token.position.wrapping_add(1);
|
||||||
|
while let Some((offset_from, c)) = self.chars.next() {
|
||||||
loop {
|
if c.is_alphanumeric() {
|
||||||
match self.chars.next() {
|
let offset_to = self.search_token_end();
|
||||||
Some((offset_from, c)) => {
|
self.token.offset_from = offset_from;
|
||||||
if c.is_alphanumeric() {
|
self.token.offset_to = offset_to;
|
||||||
let offset_to = self.search_token_end();
|
self.token.text.push_str(&self.text[offset_from..offset_to]);
|
||||||
self.token.offset_from = offset_from;
|
return true;
|
||||||
self.token.offset_to = offset_to;
|
|
||||||
self.token.text.push_str(&self.text[offset_from..offset_to]);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
fn token(&self) -> &Token {
|
fn token(&self) -> &Token {
|
||||||
|
|||||||
@@ -108,15 +108,14 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn advance(&mut self) -> bool {
|
fn advance(&mut self) -> bool {
|
||||||
if self.tail.advance() {
|
if !self.tail.advance() {
|
||||||
// TODO remove allocation
|
return false;
|
||||||
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
|
|
||||||
self.token_mut().text.clear();
|
|
||||||
self.token_mut().text.push_str(&stemmed_str);
|
|
||||||
true
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
}
|
||||||
|
// TODO remove allocation
|
||||||
|
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
|
||||||
|
self.token_mut().text.clear();
|
||||||
|
self.token_mut().text.push_str(&stemmed_str);
|
||||||
|
true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -104,7 +104,6 @@ where
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::ops::Deref;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use tokenizer::box_tokenizer;
|
use tokenizer::box_tokenizer;
|
||||||
use tokenizer::stemmer::Language;
|
use tokenizer::stemmer::Language;
|
||||||
@@ -46,7 +47,8 @@ impl TokenizerManager {
|
|||||||
.read()
|
.read()
|
||||||
.expect("Acquiring the lock should never fail")
|
.expect("Acquiring the lock should never fail")
|
||||||
.get(tokenizer_name)
|
.get(tokenizer_name)
|
||||||
.map(|boxed_tokenizer| boxed_tokenizer.boxed_clone())
|
.map(Deref::deref)
|
||||||
|
.map(BoxedTokenizer::boxed_clone)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user