mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-03 07:42:54 +00:00
Compare commits
10 Commits
poll-meta
...
segment_wr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0cba858b15 | ||
|
|
3c320b5926 | ||
|
|
8506d33b4d | ||
|
|
dde56e3756 | ||
|
|
be9057085c | ||
|
|
d13d5b8662 | ||
|
|
94e5d328af | ||
|
|
c0ec7ebf38 | ||
|
|
97d808ef50 | ||
|
|
a6a78fa607 |
@@ -1,12 +1,8 @@
|
||||
Tantivy 0.12.0
|
||||
======================
|
||||
- Removing static dispatch in tokenizers for simplicity. (#762)
|
||||
==========================
|
||||
- By default IndexReader are in `Manual` mode.
|
||||
|
||||
## How to update?
|
||||
|
||||
Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
|
||||
minor changes. Check https://github.com/tantivy-search/tantivy/blob/master/examples/custom_tokenizer.rs
|
||||
to check for some code sample.
|
||||
|
||||
Tantivy 0.11.3
|
||||
=======================
|
||||
|
||||
@@ -18,7 +18,7 @@ byteorder = "1.0"
|
||||
crc32fast = "1.2.0"
|
||||
once_cell = "1.0"
|
||||
regex ={version = "1.3.0", default-features = false, features = ["std"]}
|
||||
tantivy-fst = "0.2"
|
||||
tantivy-fst = "0.1"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
// - import tokenized text straight from json,
|
||||
// - perform a search on documents with pre-tokenized text
|
||||
|
||||
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
|
||||
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
|
||||
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::TermQuery;
|
||||
|
||||
@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
|
||||
@@ -6,7 +6,6 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
@@ -62,34 +61,6 @@ impl fmt::Debug for TopDocs {
|
||||
}
|
||||
}
|
||||
|
||||
struct ScorerByFastFieldReader {
|
||||
ff_reader: FastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
fn score(&self, doc: DocId) -> u64 {
|
||||
self.ff_reader.get_u64(u64::from(doc))
|
||||
}
|
||||
}
|
||||
|
||||
struct ScorerByField {
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl CustomScorer<u64> for ScorerByField {
|
||||
type Child = ScorerByFastFieldReader;
|
||||
|
||||
fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child> {
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.ok_or_else(|| {
|
||||
crate::Error::SchemaError(format!("Field requested is not a i64/u64 fast field."))
|
||||
})?;
|
||||
Ok(ScorerByFastFieldReader { ff_reader })
|
||||
}
|
||||
}
|
||||
|
||||
impl TopDocs {
|
||||
/// Creates a top score collector, with a number of documents equal to "limit".
|
||||
///
|
||||
@@ -172,7 +143,14 @@ impl TopDocs {
|
||||
self,
|
||||
field: Field,
|
||||
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
|
||||
self.custom_score(ScorerByField { field })
|
||||
self.custom_score(move |segment_reader: &SegmentReader| {
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(field)
|
||||
.expect("Field requested is not a i64/u64 fast field.");
|
||||
//TODO error message missmatch actual behavior for i64
|
||||
move |doc: DocId| ff_reader.get(doc)
|
||||
})
|
||||
}
|
||||
|
||||
/// Ranks the documents using a custom score.
|
||||
@@ -594,6 +572,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Field requested is not a i64/u64 fast field")]
|
||||
fn test_field_not_fast_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
@@ -608,12 +587,7 @@ mod tests {
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||
let err = top_collector.for_segment(0, segment);
|
||||
if let Err(crate::Error::SchemaError(msg)) = err {
|
||||
assert_eq!(msg, "Field requested is not a i64/u64 fast field.");
|
||||
} else {
|
||||
assert!(false);
|
||||
}
|
||||
assert!(top_collector.for_segment(0, segment).is_ok());
|
||||
}
|
||||
|
||||
fn index(
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use crate::core::Executor;
|
||||
use crate::core::IndexMeta;
|
||||
@@ -20,7 +19,8 @@ use crate::reader::IndexReaderBuilder;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::tokenizer::BoxedTokenizer;
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::IndexWriter;
|
||||
use crate::Result;
|
||||
use num_cpus;
|
||||
@@ -172,11 +172,11 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Helper to access the tokenizer associated to a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<TextAnalyzer> {
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
|
||||
let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
@@ -330,9 +330,8 @@ impl Index {
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
|
||||
create_segment(self.clone(), segment_meta)
|
||||
pub(crate) fn segment(&self, segment_meta: SegmentMeta) -> Segment {
|
||||
Segment::for_index(self.clone(), segment_meta)
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
@@ -343,6 +342,13 @@ impl Index {
|
||||
self.segment(segment_meta)
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
pub(crate) fn new_segment_unpersisted(&self) -> Segment {
|
||||
let meta = self
|
||||
.inventory
|
||||
.new_segment_meta(SegmentId::generate_random(), 0);
|
||||
Segment::new_volatile(meta, self.schema())
|
||||
}
|
||||
/// Return a reference to the index directory.
|
||||
pub fn directory(&self) -> &ManagedDirectory {
|
||||
&self.directory
|
||||
@@ -465,7 +471,7 @@ mod tests {
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &index, &reader);
|
||||
test_index_on_commit_reload_policy_aux(field, index.clone(), &index, &reader);
|
||||
}
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
@@ -489,7 +495,7 @@ mod tests {
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &index, &reader);
|
||||
test_index_on_commit_reload_policy_aux(field, index.clone(), &index, &reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -531,12 +537,11 @@ mod tests {
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
|
||||
test_index_on_commit_reload_policy_aux(field, read_index, &write_index, &reader);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
|
||||
let mut reader_index = reader.index();
|
||||
fn test_index_on_commit_reload_policy_aux(field: Field, mut reader_index: Index, index: &Index, reader: &IndexReader) {
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _watch_handle = reader_index.directory_mut().watch(Box::new(move || {
|
||||
let _ = sender.send(());
|
||||
|
||||
@@ -3,21 +3,62 @@ use crate::core::Index;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::directory::error::{OpenReadError, OpenWriteError};
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::{Directory, ManagedDirectory, RAMDirectory};
|
||||
use crate::directory::{ReadOnlySource, WritePtr};
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
use failure::_core::ops::DerefMut;
|
||||
use std::fmt;
|
||||
use std::ops::Deref;
|
||||
use std::path::PathBuf;
|
||||
use std::result;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum SegmentDirectory {
|
||||
Persisted(ManagedDirectory),
|
||||
Volatile(RAMDirectory),
|
||||
}
|
||||
|
||||
impl SegmentDirectory {
|
||||
pub fn new_volatile() -> SegmentDirectory {
|
||||
SegmentDirectory::Volatile(RAMDirectory::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ManagedDirectory> for SegmentDirectory {
|
||||
fn from(directory: ManagedDirectory) -> Self {
|
||||
SegmentDirectory::Persisted(directory)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for SegmentDirectory {
|
||||
type Target = dyn Directory;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
SegmentDirectory::Volatile(dir) => dir,
|
||||
SegmentDirectory::Persisted(dir) => dir,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for SegmentDirectory {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
match self {
|
||||
SegmentDirectory::Volatile(dir) => dir,
|
||||
SegmentDirectory::Persisted(dir) => dir,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
#[derive(Clone)]
|
||||
pub struct Segment {
|
||||
index: Index,
|
||||
schema: Schema,
|
||||
meta: SegmentMeta,
|
||||
directory: SegmentDirectory,
|
||||
}
|
||||
|
||||
impl fmt::Debug for Segment {
|
||||
@@ -26,23 +67,56 @@ impl fmt::Debug for Segment {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
/// #[doc(hidden)]
|
||||
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment { index, meta }
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
/// Returns the index the segment belongs to.
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
/// Returns our index's schema.
|
||||
// TODO return a ref.
|
||||
pub fn schema(&self) -> Schema {
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
/// Returns our index's schema.
|
||||
pub fn schema(&self) -> Schema {
|
||||
self.index.schema()
|
||||
pub(crate) fn new_persisted(
|
||||
meta: SegmentMeta,
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
) -> Segment {
|
||||
Segment {
|
||||
meta,
|
||||
schema,
|
||||
directory: SegmentDirectory::from(directory),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new segment that embeds its own `RAMDirectory`.
|
||||
///
|
||||
/// That segment is entirely dissociated from the index directory.
|
||||
/// It will be persisted by a background thread in charge of IO.
|
||||
pub fn new_volatile(meta: SegmentMeta, schema: Schema) -> Segment {
|
||||
Segment {
|
||||
schema,
|
||||
meta,
|
||||
directory: SegmentDirectory::new_volatile(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment {
|
||||
directory: SegmentDirectory::Persisted(index.directory().clone()),
|
||||
schema: index.schema(),
|
||||
meta,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn persist(&mut self, mut dest_directory: ManagedDirectory) -> crate::Result<()> {
|
||||
if let SegmentDirectory::Persisted(_) = self.directory {
|
||||
// this segment is already persisted.
|
||||
return Ok(());
|
||||
}
|
||||
if let SegmentDirectory::Volatile(ram_directory) = &self.directory {
|
||||
ram_directory.persist(&mut dest_directory)?;
|
||||
}
|
||||
self.directory = SegmentDirectory::Persisted(dest_directory);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the segment meta-information
|
||||
@@ -56,7 +130,8 @@ impl Segment {
|
||||
/// as we finalize a fresh new segment.
|
||||
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
directory: self.directory,
|
||||
schema: self.schema,
|
||||
meta: self.meta.with_max_doc(max_doc),
|
||||
}
|
||||
}
|
||||
@@ -64,7 +139,8 @@ impl Segment {
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
directory: self.directory,
|
||||
schema: self.schema,
|
||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||
}
|
||||
}
|
||||
@@ -88,7 +164,7 @@ impl Segment {
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = self.index.directory().open_read(&path)?;
|
||||
let source = self.directory.open_read(&path)?;
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
@@ -98,7 +174,7 @@ impl Segment {
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = self.index.directory_mut().open_write(&path)?;
|
||||
let write = self.directory.open_write(&path)?;
|
||||
Ok(write)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,6 +57,68 @@ pub struct SegmentReader {
|
||||
}
|
||||
|
||||
impl SegmentReader {
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: &Segment) -> Result<SegmentReader> {
|
||||
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_source)?;
|
||||
|
||||
let store_source = segment.open_read(SegmentComponent::STORE)?;
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(&postings_source)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
CompositeFile::open(&source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let positions_idx_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
|
||||
CompositeFile::open(&source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_source,
|
||||
delete_bitset_opt,
|
||||
positions_composite,
|
||||
positions_idx_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the highest document id ever attributed in
|
||||
/// this segment + 1.
|
||||
/// Today, `tantivy` does not handle deletes, so it happens
|
||||
@@ -144,68 +206,6 @@ impl SegmentReader {
|
||||
StoreReader::from_source(self.store_source.clone())
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: &Segment) -> Result<SegmentReader> {
|
||||
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_source)?;
|
||||
|
||||
let store_source = segment.open_read(SegmentComponent::STORE)?;
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(&postings_source)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
CompositeFile::open(&source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let positions_idx_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
|
||||
CompositeFile::open(&source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
Some(DeleteBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_source,
|
||||
delete_bitset_opt,
|
||||
positions_composite,
|
||||
positions_idx_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a field reader associated to the field given in argument.
|
||||
/// If the field was not present in the index during indexing time,
|
||||
/// the InvertedIndexReader is empty.
|
||||
|
||||
@@ -141,19 +141,11 @@ impl MmapCache {
|
||||
}
|
||||
}
|
||||
|
||||
pub enum WatcherMode {
|
||||
Event,
|
||||
Poll
|
||||
}
|
||||
|
||||
struct WatcherWrapper {
|
||||
_watcher: Mutex<notify::RecommendedWatcher>,
|
||||
watcher_router: Arc<WatchCallbackList>,
|
||||
watcher_mode: WatcherMode,
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl WatcherWrapper {
|
||||
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
|
||||
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
|
||||
@@ -171,57 +163,33 @@ impl WatcherWrapper {
|
||||
})?;
|
||||
let watcher_router: Arc<WatchCallbackList> = Default::default();
|
||||
let watcher_router_clone = watcher_router.clone();
|
||||
let path_clone = path.clone();
|
||||
let meta_path = path_clone.join(*META_FILEPATH);
|
||||
thread::Builder::new()
|
||||
.name("meta-file-watch-thread".to_string())
|
||||
.spawn(move || {
|
||||
let mut old_content = String::new();
|
||||
let mode = WatcherMode::Event;
|
||||
loop {
|
||||
match mode {
|
||||
WatcherMode::Event => {
|
||||
match watcher_recv.recv().map(|evt| evt.path) {
|
||||
Ok(Some(changed_path)) => {
|
||||
// ... Actually subject to false positive.
|
||||
// We might want to be more accurate than this at one point.
|
||||
if let Some(filename) = changed_path.file_name() {
|
||||
if filename == *META_FILEPATH {
|
||||
let _ = watcher_router_clone.broadcast();
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
// not an event we are interested in.
|
||||
}
|
||||
Err(_e) => {
|
||||
// the watch send channel was dropped
|
||||
break;
|
||||
match watcher_recv.recv().map(|evt| evt.path) {
|
||||
Ok(Some(changed_path)) => {
|
||||
// ... Actually subject to false positive.
|
||||
// We might want to be more accurate than this at one point.
|
||||
if let Some(filename) = changed_path.file_name() {
|
||||
if filename == *META_FILEPATH {
|
||||
let _ = watcher_router_clone.broadcast();
|
||||
}
|
||||
}
|
||||
}
|
||||
WatcherMode::Poll => {
|
||||
let mut file = match File::open(&meta_path) {
|
||||
Err(why) => panic!("open: nope"),
|
||||
Ok(file) => file,
|
||||
};
|
||||
let mut new_content = String::new();
|
||||
match file.read_to_string(&mut new_content) {
|
||||
Err(why) => panic!("read: nope"),
|
||||
Ok(_) => {},
|
||||
}
|
||||
if old_content != new_content {
|
||||
let _ = watcher_router_clone.broadcast();
|
||||
old_content = new_content;
|
||||
}
|
||||
Ok(None) => {
|
||||
// not an event we are interested in.
|
||||
}
|
||||
};
|
||||
Err(_e) => {
|
||||
// the watch send channel was dropped
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
})?;
|
||||
Ok(WatcherWrapper {
|
||||
_watcher: Mutex::new(watcher),
|
||||
watcher_router,
|
||||
watcher_mode: WatcherMode::Event,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -144,6 +144,16 @@ impl RAMDirectory {
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
|
||||
pub fn persist(&self, dest: &mut dyn Directory) -> crate::Result<()> {
|
||||
let wlock = self.fs.write().unwrap();
|
||||
for (path, source) in wlock.fs.iter() {
|
||||
let mut dest_wrt = dest.open_write(path)?;
|
||||
dest_wrt.write_all(source.as_slice())?;
|
||||
dest_wrt.terminate()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
|
||||
@@ -8,30 +8,33 @@ use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::directory::TerminatingWrite;
|
||||
use crate::directory::{DirectoryLock, GarbageCollectionResult};
|
||||
use crate::directory::{TerminatingWrite, WatchCallbackList};
|
||||
use crate::docset::DocSet;
|
||||
use crate::error::TantivyError;
|
||||
use crate::fastfield::write_delete_bitset;
|
||||
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use crate::indexer::operation::DeleteOperation;
|
||||
use crate::indexer::segment_manager::SegmentRegisters;
|
||||
use crate::indexer::segment_register::SegmentRegister;
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::MergePolicy;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::reader::NRTReader;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::Opstamp;
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{IndexReader, Opstamp};
|
||||
use crossbeam::channel;
|
||||
use futures::executor::block_on;
|
||||
use futures::future::Future;
|
||||
use smallvec::smallvec;
|
||||
use smallvec::SmallVec;
|
||||
use smallvec::{smallvec, SmallVec};
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
@@ -68,6 +71,8 @@ pub struct IndexWriter {
|
||||
// lifetime of the lock with that of the IndexWriter.
|
||||
_directory_lock: Option<DirectoryLock>,
|
||||
|
||||
segment_registers: Arc<RwLock<SegmentRegisters>>,
|
||||
|
||||
index: Index,
|
||||
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
@@ -87,6 +92,8 @@ pub struct IndexWriter {
|
||||
|
||||
stamper: Stamper,
|
||||
committed_opstamp: Opstamp,
|
||||
|
||||
on_commit: WatchCallbackList,
|
||||
}
|
||||
|
||||
fn compute_deleted_bitset(
|
||||
@@ -133,7 +140,6 @@ fn compute_deleted_bitset(
|
||||
/// For instance, there was no delete operation between the state of the `segment_entry` and
|
||||
/// the `target_opstamp`, `segment_entry` is not updated.
|
||||
pub(crate) fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: Opstamp,
|
||||
) -> crate::Result<()> {
|
||||
@@ -142,28 +148,38 @@ pub(crate) fn advance_deletes(
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
|
||||
let delete_bitset_opt = segment_entry.take_delete_bitset();
|
||||
|
||||
// We avoid directly advancing the `SegmentEntry` delete cursor, because
|
||||
// we do not want to end up in an invalid state if the delete bitset
|
||||
// serialization fails.
|
||||
let mut delete_cursor = segment_entry.delete_cursor();
|
||||
|
||||
if delete_bitset_opt.is_none() && delete_cursor.get().is_none() {
|
||||
// There has been no `DeleteOperation` between the segment status and `target_opstamp`.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We open our current serialized segment to compute the new deleted bitset.
|
||||
let segment = segment_entry.segment().clone();
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_max_value(max_doc),
|
||||
};
|
||||
let mut delete_bitset: BitSet =
|
||||
delete_bitset_opt.unwrap_or_else(|| BitSet::with_max_value(max_doc));
|
||||
|
||||
let num_deleted_docs_before = segment.meta().num_deleted_docs();
|
||||
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
segment_entry.delete_cursor(),
|
||||
&mut delete_cursor,
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
)?;
|
||||
|
||||
// TODO optimize
|
||||
// TODO optimize... We are simply manipulating bitsets here.
|
||||
// We should be able to compute the union much faster.
|
||||
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
|
||||
for doc in 0u32..max_doc {
|
||||
if seg_delete_bitset.is_deleted(doc) {
|
||||
@@ -172,15 +188,23 @@ pub(crate) fn advance_deletes(
|
||||
}
|
||||
}
|
||||
|
||||
let num_deleted_docs = delete_bitset.len();
|
||||
if num_deleted_docs > 0 {
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
let num_deleted_docs = delete_bitset.len() as u32;
|
||||
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// We need to write a new delete file.
|
||||
let mut delete_file = segment
|
||||
.with_delete_meta(num_deleted_docs, target_opstamp)
|
||||
.open_write(SegmentComponent::DELETE)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
segment_entry.reset_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
}
|
||||
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
// Regardless of whether we did end up having to write a new file or not
|
||||
// we advance the `delete_cursor`. This is an optimisation. We want to ensure we do not
|
||||
// check that a given deleted term does not match any of our docs more than once.
|
||||
segment_entry.set_delete_cursor(delete_cursor);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -189,11 +213,12 @@ fn index_documents(
|
||||
segment: Segment,
|
||||
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
tokenizers: &TokenizerManager,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> crate::Result<bool> {
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(memory_budget, segment.clone(), tokenizers)?;
|
||||
let schema = segment.schema();
|
||||
|
||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||
for document_group in grouped_document_iterator {
|
||||
for doc in document_group {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
@@ -231,11 +256,7 @@ fn index_documents(
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(
|
||||
segment_with_max_doc.meta().clone(),
|
||||
delete_cursor,
|
||||
delete_bitset_opt,
|
||||
);
|
||||
let segment_entry = SegmentEntry::new(segment_with_max_doc, delete_cursor, delete_bitset_opt);
|
||||
block_on(segment_updater.schedule_add_segment(segment_entry))?;
|
||||
Ok(true)
|
||||
}
|
||||
@@ -307,16 +328,24 @@ impl IndexWriter {
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
let meta = index.load_metas()?;
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
let stamper = Stamper::new(meta.opstamp);
|
||||
|
||||
let commited_segments = SegmentRegister::new(
|
||||
index.directory(),
|
||||
&index.schema(),
|
||||
meta.segments,
|
||||
&delete_queue.cursor(),
|
||||
);
|
||||
let segment_registers = Arc::new(RwLock::new(SegmentRegisters::new(commited_segments)));
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
SegmentUpdater::create(segment_registers.clone(), index.clone(), stamper.clone())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
|
||||
segment_registers,
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
@@ -330,10 +359,12 @@ impl IndexWriter {
|
||||
|
||||
delete_queue,
|
||||
|
||||
committed_opstamp: current_opstamp,
|
||||
committed_opstamp: meta.opstamp,
|
||||
stamper,
|
||||
|
||||
worker_id: 0,
|
||||
|
||||
on_commit: Default::default(),
|
||||
};
|
||||
index_writer.start_workers()?;
|
||||
Ok(index_writer)
|
||||
@@ -373,13 +404,6 @@ impl IndexWriter {
|
||||
result
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn add_segment(&self, segment_meta: SegmentMeta) -> crate::Result<()> {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
block_on(self.segment_updater.schedule_add_segment(segment_entry))
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
///
|
||||
/// This method is useful only for users trying to do complex
|
||||
@@ -428,12 +452,13 @@ impl IndexWriter {
|
||||
// was dropped.
|
||||
return Ok(());
|
||||
}
|
||||
let segment = index.new_segment();
|
||||
let segment = index.new_segment_unpersisted();
|
||||
index_documents(
|
||||
mem_budget,
|
||||
segment,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
index.tokenizers(),
|
||||
delete_cursor.clone(),
|
||||
)?;
|
||||
}
|
||||
@@ -460,6 +485,21 @@ impl IndexWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO move me
|
||||
pub(crate) fn trigger_commit(&self) -> impl Future<Output = ()> {
|
||||
self.on_commit.broadcast()
|
||||
}
|
||||
|
||||
pub fn reader(&self, num_searchers: usize) -> crate::Result<IndexReader> {
|
||||
let nrt_reader = NRTReader::create(
|
||||
num_searchers,
|
||||
self.index.clone(),
|
||||
self.segment_registers.clone(),
|
||||
&self.on_commit,
|
||||
)?;
|
||||
Ok(IndexReader::NRT(nrt_reader))
|
||||
}
|
||||
|
||||
/// Detects and removes the files that are not used by the index anymore.
|
||||
pub fn garbage_collect_files(
|
||||
&self,
|
||||
@@ -603,7 +643,7 @@ impl IndexWriter {
|
||||
/// It is also possible to add a payload to the `commit`
|
||||
/// using this API.
|
||||
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
|
||||
pub fn prepare_commit(&mut self) -> crate::Result<PreparedCommit> {
|
||||
pub fn prepare_commit(&mut self, soft_commit: bool) -> crate::Result<PreparedCommit> {
|
||||
// Here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
@@ -631,7 +671,7 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
let commit_opstamp = self.stamper.stamp();
|
||||
let prepared_commit = PreparedCommit::new(self, commit_opstamp);
|
||||
let prepared_commit = PreparedCommit::new(self, commit_opstamp, soft_commit);
|
||||
info!("Prepared commit {}", commit_opstamp);
|
||||
Ok(prepared_commit)
|
||||
}
|
||||
@@ -651,7 +691,11 @@ impl IndexWriter {
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> crate::Result<Opstamp> {
|
||||
self.prepare_commit()?.commit()
|
||||
self.prepare_commit(false)?.commit()
|
||||
}
|
||||
|
||||
pub fn soft_commit(&mut self) -> crate::Result<Opstamp> {
|
||||
self.prepare_commit(true)?.commit()
|
||||
}
|
||||
|
||||
pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
|
||||
@@ -776,7 +820,6 @@ impl Drop for IndexWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::operation::UserOperation;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::directory::error::LockError;
|
||||
@@ -1009,7 +1052,8 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
{
|
||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||
let mut prepared_commit =
|
||||
index_writer.prepare_commit(false).expect("commit failed");
|
||||
prepared_commit.set_payload("first commit");
|
||||
prepared_commit.commit().expect("commit failed");
|
||||
}
|
||||
@@ -1042,7 +1086,8 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
{
|
||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||
let mut prepared_commit =
|
||||
index_writer.prepare_commit(false).expect("commit failed");
|
||||
prepared_commit.set_payload("first commit");
|
||||
prepared_commit.abort().expect("commit failed");
|
||||
}
|
||||
@@ -1217,7 +1262,43 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(idfield=>"myid"));
|
||||
let commit = index_writer.commit();
|
||||
assert!(commit.is_ok());
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_writer_reader() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let idfield = schema_builder.add_text_field("id", STRING);
|
||||
schema_builder.add_text_field("optfield", STRING);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(idfield=>"myid"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
let reader = index_writer.reader(2).unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1u64);
|
||||
index_writer.add_document(doc!(idfield=>"myid"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
assert_eq!(reader.searcher().num_docs(), 2u64);
|
||||
assert_eq!(searcher.num_docs(), 1u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_writer_reader_soft_commit() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let idfield = schema_builder.add_text_field("id", STRING);
|
||||
schema_builder.add_text_field("optfield", STRING);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(idfield=>"myid"));
|
||||
assert!(index_writer.soft_commit().is_ok());
|
||||
let nrt_reader = index_writer.reader(2).unwrap();
|
||||
let normal_reader = index.reader_builder().try_into().unwrap();
|
||||
assert_eq!(nrt_reader.searcher().num_docs(), 1u64);
|
||||
assert_eq!(normal_reader.searcher().num_docs(), 0u64);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
assert!(normal_reader.reload().is_ok());
|
||||
assert_eq!(nrt_reader.searcher().num_docs(), 1u64);
|
||||
assert_eq!(normal_reader.searcher().num_docs(), 1u64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::SegmentEntry;
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub(crate) use self::segment_manager::SegmentRegisters;
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
|
||||
|
||||
@@ -19,8 +19,6 @@ pub struct AddOperation {
|
||||
/// UserOperation is an enum type that encapsulates other operation types.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub enum UserOperation {
|
||||
/// Add operation
|
||||
Add(Document),
|
||||
/// Delete operation
|
||||
Delete(Term),
|
||||
}
|
||||
|
||||
@@ -8,14 +8,20 @@ pub struct PreparedCommit<'a> {
|
||||
index_writer: &'a mut IndexWriter,
|
||||
payload: Option<String>,
|
||||
opstamp: Opstamp,
|
||||
soft_commit: bool,
|
||||
}
|
||||
|
||||
impl<'a> PreparedCommit<'a> {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
|
||||
pub(crate) fn new(
|
||||
index_writer: &'a mut IndexWriter,
|
||||
opstamp: Opstamp,
|
||||
soft_commit: bool,
|
||||
) -> PreparedCommit {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
opstamp,
|
||||
soft_commit,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,11 +39,12 @@ impl<'a> PreparedCommit<'a> {
|
||||
|
||||
pub fn commit(self) -> Result<Opstamp> {
|
||||
info!("committing {}", self.opstamp);
|
||||
let _ = block_on(
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
.schedule_commit(self.opstamp, self.payload),
|
||||
);
|
||||
block_on(self.index_writer.segment_updater().schedule_commit(
|
||||
self.opstamp,
|
||||
self.payload,
|
||||
self.soft_commit,
|
||||
))?;
|
||||
block_on(self.index_writer.trigger_commit());
|
||||
Ok(self.opstamp)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::directory::ManagedDirectory;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::{Opstamp, Segment};
|
||||
use std::fmt;
|
||||
|
||||
/// A segment entry describes the state of
|
||||
@@ -19,55 +21,81 @@ use std::fmt;
|
||||
/// in the .del file or in the `delete_bitset`.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentEntry {
|
||||
meta: SegmentMeta,
|
||||
segment: Segment,
|
||||
delete_bitset: Option<BitSet>,
|
||||
delete_cursor: DeleteCursor,
|
||||
}
|
||||
|
||||
impl SegmentEntry {
|
||||
/// Create a new `SegmentEntry`
|
||||
pub fn new(
|
||||
segment_meta: SegmentMeta,
|
||||
pub(crate) fn new(
|
||||
segment: Segment,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>,
|
||||
) -> SegmentEntry {
|
||||
SegmentEntry {
|
||||
meta: segment_meta,
|
||||
segment,
|
||||
delete_bitset,
|
||||
delete_cursor,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a reference to the segment entry deleted bitset.
|
||||
pub fn persist(&mut self, dest_directory: ManagedDirectory) -> crate::Result<()> {
|
||||
// TODO take in account delete bitset?
|
||||
self.segment.persist(dest_directory)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn segment(&self) -> &Segment {
|
||||
&self.segment
|
||||
}
|
||||
|
||||
/// `Takes` (as in Option::take) the delete bitset of
|
||||
/// a segment entry.
|
||||
///
|
||||
/// `DocId` in this bitset are flagged as deleted.
|
||||
pub fn delete_bitset(&self) -> Option<&BitSet> {
|
||||
self.delete_bitset.as_ref()
|
||||
pub fn take_delete_bitset(&mut self) -> Option<BitSet> {
|
||||
self.delete_bitset.take()
|
||||
}
|
||||
|
||||
/// Set the `SegmentMeta` for this segment.
|
||||
pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
|
||||
self.meta = segment_meta;
|
||||
/// Reset the delete informmationo in this segment.
|
||||
///
|
||||
/// The `SegmentEntry` segment's `SegmentMeta` gets updated, and
|
||||
/// any delete bitset is drop and set to None.
|
||||
pub fn reset_delete_meta(&mut self, num_deleted_docs: u32, target_opstamp: Opstamp) {
|
||||
self.segment = self
|
||||
.segment
|
||||
.clone()
|
||||
.with_delete_meta(num_deleted_docs, target_opstamp);
|
||||
self.delete_bitset = None;
|
||||
}
|
||||
|
||||
pub fn set_delete_cursor(&mut self, delete_cursor: DeleteCursor) {
|
||||
self.delete_cursor = delete_cursor;
|
||||
}
|
||||
/// Return a reference to the segment_entry's delete cursor
|
||||
pub fn delete_cursor(&mut self) -> &mut DeleteCursor {
|
||||
&mut self.delete_cursor
|
||||
pub fn delete_cursor(&mut self) -> DeleteCursor {
|
||||
self.delete_cursor.clone()
|
||||
}
|
||||
|
||||
/// Returns the segment id.
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.meta.id()
|
||||
self.segment.id()
|
||||
}
|
||||
|
||||
/// Accessor to the `SegmentMeta`
|
||||
pub fn meta(&self) -> &SegmentMeta {
|
||||
&self.meta
|
||||
self.segment.meta()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentEntry {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(formatter, "SegmentEntry({:?})", self.meta)
|
||||
let num_deletes = self.delete_bitset.as_ref().map(|bitset| bitset.len());
|
||||
write!(
|
||||
formatter,
|
||||
"SegmentEntry(seg={:?}, ndel={:?})",
|
||||
self.segment, num_deletes
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,14 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::Result as TantivyResult;
|
||||
use crate::Segment;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
#[derive(Default)]
|
||||
struct SegmentRegisters {
|
||||
pub(crate) struct SegmentRegisters {
|
||||
uncommitted: SegmentRegister,
|
||||
committed: SegmentRegister,
|
||||
}
|
||||
@@ -23,6 +20,17 @@ pub(crate) enum SegmentsStatus {
|
||||
}
|
||||
|
||||
impl SegmentRegisters {
|
||||
pub fn new(committed: SegmentRegister) -> SegmentRegisters {
|
||||
SegmentRegisters {
|
||||
uncommitted: Default::default(),
|
||||
committed,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn committed_segment(&self) -> Vec<Segment> {
|
||||
self.committed.segments()
|
||||
}
|
||||
|
||||
/// Check if all the segments are committed or uncommited.
|
||||
///
|
||||
/// If some segment is missing or segments are in a different state (this should not happen
|
||||
@@ -45,18 +53,7 @@ impl SegmentRegisters {
|
||||
/// changes (merges especially)
|
||||
#[derive(Default)]
|
||||
pub struct SegmentManager {
|
||||
registers: RwLock<SegmentRegisters>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentManager {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
let lock = self.read();
|
||||
write!(
|
||||
f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted, lock.committed
|
||||
)
|
||||
}
|
||||
registers: Arc<RwLock<SegmentRegisters>>,
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(
|
||||
@@ -75,16 +72,8 @@ pub fn get_mergeable_segments(
|
||||
}
|
||||
|
||||
impl SegmentManager {
|
||||
pub fn from_segments(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: &DeleteCursor,
|
||||
) -> SegmentManager {
|
||||
SegmentManager {
|
||||
registers: RwLock::new(SegmentRegisters {
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
||||
}),
|
||||
}
|
||||
pub(crate) fn new(registers: Arc<RwLock<SegmentRegisters>>) -> SegmentManager {
|
||||
SegmentManager { registers }
|
||||
}
|
||||
|
||||
/// Returns all of the segment entries (committed or uncommitted)
|
||||
@@ -145,7 +134,7 @@ impl SegmentManager {
|
||||
/// Returns an error if some segments are missing, or if
|
||||
/// the `segment_ids` are not either all committed or all
|
||||
/// uncommitted.
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> crate::Result<Vec<SegmentEntry>> {
|
||||
let registers_lock = self.read();
|
||||
let mut segment_entries = vec![];
|
||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||
@@ -166,7 +155,7 @@ impl SegmentManager {
|
||||
let error_msg = "Merge operation sent for segments that are not \
|
||||
all uncommited or commited."
|
||||
.to_string();
|
||||
return Err(TantivyError::InvalidArgument(error_msg));
|
||||
return Err(crate::Error::InvalidArgument(error_msg));
|
||||
}
|
||||
Ok(segment_entries)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::directory::ManagedDirectory;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::segment_entry::SegmentEntry;
|
||||
use crate::schema::Schema;
|
||||
use crate::Segment;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
@@ -46,6 +49,13 @@ impl SegmentRegister {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn segments(&self) -> Vec<Segment> {
|
||||
self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.segment().clone())
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||
self.segment_states.values().cloned().collect()
|
||||
}
|
||||
@@ -79,11 +89,17 @@ impl SegmentRegister {
|
||||
self.segment_states.get(segment_id).cloned()
|
||||
}
|
||||
|
||||
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
|
||||
pub fn new(
|
||||
directory: &ManagedDirectory,
|
||||
schema: &Schema,
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: &DeleteCursor,
|
||||
) -> SegmentRegister {
|
||||
let mut segment_states = HashMap::new();
|
||||
for segment_meta in segment_metas {
|
||||
let segment_id = segment_meta.id();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
|
||||
for meta in segment_metas {
|
||||
let segment_id = meta.id();
|
||||
let segment = Segment::new_persisted(meta, directory.clone(), schema.clone());
|
||||
let segment_entry = SegmentEntry::new(segment, delete_cursor.clone(), None);
|
||||
segment_states.insert(segment_id, segment_entry);
|
||||
}
|
||||
SegmentRegister { segment_states }
|
||||
@@ -95,6 +111,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::core::{SegmentId, SegmentMetaInventory};
|
||||
use crate::indexer::delete_queue::*;
|
||||
use crate::schema::Schema;
|
||||
|
||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||
segment_register
|
||||
@@ -108,6 +125,7 @@ mod tests {
|
||||
fn test_segment_register() {
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let delete_queue = DeleteQueue::new();
|
||||
let schema = Schema::builder().build();
|
||||
|
||||
let mut segment_register = SegmentRegister::default();
|
||||
let segment_id_a = SegmentId::generate_random();
|
||||
@@ -115,21 +133,24 @@ mod tests {
|
||||
let segment_id_merged = SegmentId::generate_random();
|
||||
|
||||
{
|
||||
let segment_meta = inventory.new_segment_meta(segment_id_a, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
let meta = inventory.new_segment_meta(segment_id_a, 0u32);
|
||||
let segment = Segment::new_volatile(meta, schema.clone());
|
||||
let segment_entry = SegmentEntry::new(segment, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = inventory.new_segment_meta(segment_id_b, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
let segment = Segment::new_volatile(segment_meta, schema.clone());
|
||||
let segment_entry = SegmentEntry::new(segment, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
let segment_meta_merged = inventory.new_segment_meta(segment_id_merged, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
|
||||
let segment = Segment::new_volatile(segment_meta_merged, schema);
|
||||
let segment_entry = SegmentEntry::new(segment, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
|
||||
|
||||
@@ -7,11 +7,10 @@ use crate::core::SegmentMeta;
|
||||
use crate::core::SerializableSegment;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::index_writer::advance_deletes;
|
||||
use crate::indexer::merge_operation::MergeOperationInventory;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::indexer::segment_manager::SegmentsStatus;
|
||||
use crate::indexer::segment_manager::{SegmentRegisters, SegmentsStatus};
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentSerializer;
|
||||
@@ -117,8 +116,7 @@ fn merge(
|
||||
|
||||
// First we apply all of the delet to the merged segment, up to the target opstamp.
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = index.segment(segment_entry.meta().clone());
|
||||
advance_deletes(segment, segment_entry, target_opstamp)?;
|
||||
advance_deletes(segment_entry, target_opstamp)?;
|
||||
}
|
||||
|
||||
let delete_cursor = segment_entries[0].delete_cursor().clone();
|
||||
@@ -134,11 +132,13 @@ fn merge(
|
||||
// ... we just serialize this index merger in our new segment to merge the two segments.
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
|
||||
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
let max_doc = merger.write(segment_serializer)?;
|
||||
|
||||
let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
|
||||
|
||||
Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
|
||||
Ok(SegmentEntry::new(
|
||||
merged_segment.with_max_doc(max_doc),
|
||||
delete_cursor,
|
||||
None,
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) struct InnerSegmentUpdater {
|
||||
@@ -162,12 +162,11 @@ pub(crate) struct InnerSegmentUpdater {
|
||||
|
||||
impl SegmentUpdater {
|
||||
pub fn create(
|
||||
segment_registers: Arc<RwLock<SegmentRegisters>>,
|
||||
index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: &DeleteCursor,
|
||||
) -> crate::Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
let segment_manager = SegmentManager::new(segment_registers);
|
||||
let pool = ThreadPoolBuilder::new()
|
||||
.name_prefix("segment_updater")
|
||||
.pool_size(1)
|
||||
@@ -230,6 +229,7 @@ impl SegmentUpdater {
|
||||
&self,
|
||||
segment_entry: SegmentEntry,
|
||||
) -> impl Future<Output = crate::Result<()>> {
|
||||
// TODO temporary: serializing the segment at this point.
|
||||
let segment_updater = self.clone();
|
||||
self.schedule_future(async move {
|
||||
segment_updater.segment_manager.add_segment(segment_entry);
|
||||
@@ -258,8 +258,7 @@ impl SegmentUpdater {
|
||||
fn purge_deletes(&self, target_opstamp: Opstamp) -> crate::Result<Vec<SegmentEntry>> {
|
||||
let mut segment_entries = self.segment_manager.segment_entries();
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = self.index.segment(segment_entry.meta().clone());
|
||||
advance_deletes(segment, segment_entry, target_opstamp)?;
|
||||
advance_deletes(segment_entry, target_opstamp)?;
|
||||
}
|
||||
Ok(segment_entries)
|
||||
}
|
||||
@@ -327,12 +326,21 @@ impl SegmentUpdater {
|
||||
&self,
|
||||
opstamp: Opstamp,
|
||||
payload: Option<String>,
|
||||
soft_commit: bool,
|
||||
) -> impl Future<Output = crate::Result<()>> {
|
||||
let segment_updater: SegmentUpdater = self.clone();
|
||||
let directory = self.index.directory().clone();
|
||||
self.schedule_future(async move {
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp)?;
|
||||
let mut segment_entries = segment_updater.purge_deletes(opstamp)?;
|
||||
if !soft_commit {
|
||||
for segment_entry in &mut segment_entries {
|
||||
segment_entry.persist(directory.clone())?;
|
||||
}
|
||||
}
|
||||
segment_updater.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp, payload)?;
|
||||
if !soft_commit {
|
||||
segment_updater.save_metas(opstamp, payload)?;
|
||||
}
|
||||
let _ = garbage_collect_files(segment_updater.clone()).await;
|
||||
segment_updater.consider_merge_options().await;
|
||||
Ok(())
|
||||
@@ -470,17 +478,14 @@ impl SegmentUpdater {
|
||||
let end_merge_future = self.schedule_future(async move {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
{
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) = advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
let _index = &segment_updater.index;
|
||||
if let Err(e) =
|
||||
advance_deletes(&mut after_merge_segment_entry, committed_opstamp)
|
||||
{
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
|
||||
@@ -11,9 +11,10 @@ use crate::schema::Schema;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::Value;
|
||||
use crate::schema::{Field, FieldEntry};
|
||||
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
|
||||
use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
|
||||
use crate::tokenizer::{TokenStreamChain, Tokenizer};
|
||||
use crate::tokenizer::FacetTokenizer;
|
||||
use crate::tokenizer::PreTokenizedStream;
|
||||
use crate::tokenizer::{BoxedTokenizer, TokenizerManager};
|
||||
use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
@@ -49,7 +50,7 @@ pub struct SegmentWriter {
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<TextAnalyzer>>,
|
||||
tokenizers: Vec<Option<BoxedTokenizer>>,
|
||||
}
|
||||
|
||||
impl SegmentWriter {
|
||||
@@ -65,11 +66,12 @@ impl SegmentWriter {
|
||||
pub fn for_segment(
|
||||
memory_budget: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
tokenizers: &TokenizerManager,
|
||||
) -> Result<SegmentWriter> {
|
||||
let schema = segment.schema();
|
||||
let table_num_bits = initial_table_size(memory_budget)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(&schema, table_num_bits);
|
||||
let tokenizers = schema
|
||||
.fields()
|
||||
.map(
|
||||
@@ -78,7 +80,7 @@ impl SegmentWriter {
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
segment.index().tokenizers().get(tokenizer_name)
|
||||
tokenizers.get(tokenizer_name)
|
||||
}),
|
||||
_ => None,
|
||||
},
|
||||
@@ -87,9 +89,9 @@ impl SegmentWriter {
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
multifield_postings,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(schema),
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
|
||||
segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
fast_field_writers: FastFieldsWriter::from_schema(&schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers,
|
||||
})
|
||||
@@ -158,7 +160,7 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) => {
|
||||
let mut token_streams: Vec<BoxTokenStream> = vec![];
|
||||
let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
|
||||
let mut offsets = vec![];
|
||||
let mut total_offset = 0;
|
||||
|
||||
@@ -171,7 +173,7 @@ impl SegmentWriter {
|
||||
}
|
||||
|
||||
token_streams
|
||||
.push(PreTokenizedStream::from(tok_str.clone()).into());
|
||||
.push(Box::new(PreTokenizedStream::from(tok_str.clone())));
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
if let Some(ref mut tokenizer) =
|
||||
@@ -190,7 +192,8 @@ impl SegmentWriter {
|
||||
let num_tokens = if token_streams.is_empty() {
|
||||
0
|
||||
} else {
|
||||
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
|
||||
let mut token_stream: Box<dyn TokenStream> =
|
||||
Box::new(TokenStreamChain::new(offsets, token_streams));
|
||||
self.multifield_postings
|
||||
.index_text(doc_id, field, &mut token_stream)
|
||||
};
|
||||
|
||||
@@ -1,76 +1,18 @@
|
||||
use crate::Opstamp;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(not(target_arch = "arm"))]
|
||||
mod atomic_impl {
|
||||
|
||||
use crate::Opstamp;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Wrapper(AtomicU64);
|
||||
|
||||
impl AtomicU64Wrapper {
|
||||
pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
|
||||
AtomicU64Wrapper(AtomicU64::new(first_opstamp as u64))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
||||
self.0.fetch_add(val as u64, order) as u64
|
||||
}
|
||||
|
||||
pub fn revert(&self, val: u64, order: Ordering) -> u64 {
|
||||
self.0.store(val, order);
|
||||
val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "arm")]
|
||||
mod atomic_impl {
|
||||
|
||||
use crate::Opstamp;
|
||||
/// Under other architecture, we rely on a mutex.
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::RwLock;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Wrapper(RwLock<u64>);
|
||||
|
||||
impl AtomicU64Wrapper {
|
||||
pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
|
||||
AtomicU64Wrapper(RwLock::new(first_opstamp))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
|
||||
let mut lock = self.0.write().unwrap();
|
||||
let previous_val = *lock;
|
||||
*lock = previous_val + incr;
|
||||
previous_val
|
||||
}
|
||||
|
||||
pub fn revert(&self, val: u64, _order: Ordering) -> u64 {
|
||||
let mut lock = self.0.write().unwrap();
|
||||
*lock = val;
|
||||
val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
use self::atomic_impl::AtomicU64Wrapper;
|
||||
|
||||
/// Stamper provides Opstamps, which is just an auto-increment id to label
|
||||
/// an operation.
|
||||
///
|
||||
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64Wrapper>);
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: Opstamp) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64Wrapper::new(first_opstamp)))
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> Opstamp {
|
||||
@@ -89,7 +31,8 @@ impl Stamper {
|
||||
|
||||
/// Reverts the stamper to a given `Opstamp` value and returns it
|
||||
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
|
||||
self.0.revert(to_opstamp, Ordering::SeqCst)
|
||||
self.0.store(to_opstamp, Ordering::SeqCst);
|
||||
to_opstamp
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -165,7 +165,6 @@ pub use crate::core::SegmentComponent;
|
||||
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use crate::core::{InvertedIndexReader, SegmentReader};
|
||||
pub use crate::directory::Directory;
|
||||
pub use crate::indexer::operation::UserOperation;
|
||||
pub use crate::indexer::IndexWriter;
|
||||
pub use crate::postings::Postings;
|
||||
pub use crate::reader::LeasedItem;
|
||||
|
||||
@@ -220,7 +220,7 @@ pub mod tests {
|
||||
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(3_000_000, segment.clone(), index.tokenizers()).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
|
||||
@@ -533,7 +533,7 @@ mod test {
|
||||
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
|
||||
use crate::tokenizer::{
|
||||
LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
|
||||
LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager,
|
||||
};
|
||||
use crate::Index;
|
||||
use matches::assert_matches;
|
||||
@@ -563,7 +563,7 @@ mod test {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"en_with_stop_words",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec!["the".to_string()])),
|
||||
);
|
||||
|
||||
@@ -112,6 +112,7 @@ mod tests {
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
|
||||
let reader = index.reader().unwrap();
|
||||
assert_eq!(reader.searcher().segment_readers().len(), 1);
|
||||
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
|
||||
}
|
||||
|
||||
|
||||
83
src/reader/index_writer_reader.rs
Normal file
83
src/reader/index_writer_reader.rs
Normal file
@@ -0,0 +1,83 @@
|
||||
use crate::directory::{WatchCallbackList, WatchHandle};
|
||||
use crate::indexer::SegmentRegisters;
|
||||
use crate::reader::pool::Pool;
|
||||
use crate::{Index, LeasedItem, Searcher, Segment, SegmentReader};
|
||||
use std::iter::repeat_with;
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
struct InnerNRTReader {
|
||||
num_searchers: usize,
|
||||
index: Index,
|
||||
searcher_pool: Pool<Searcher>,
|
||||
segment_registers: Arc<RwLock<SegmentRegisters>>,
|
||||
}
|
||||
|
||||
impl InnerNRTReader {
|
||||
fn load_segment_readers(&self) -> crate::Result<Vec<SegmentReader>> {
|
||||
let segments: Vec<Segment> = {
|
||||
let rlock = self.segment_registers.read().unwrap();
|
||||
rlock.committed_segment()
|
||||
};
|
||||
segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<crate::Result<Vec<SegmentReader>>>()
|
||||
}
|
||||
|
||||
pub fn reload(&self) -> crate::Result<()> {
|
||||
let segment_readers: Vec<SegmentReader> = self.load_segment_readers()?;
|
||||
let schema = self.index.schema();
|
||||
let searchers = repeat_with(|| {
|
||||
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
|
||||
})
|
||||
.take(self.num_searchers)
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.searcher_pool.acquire()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct NRTReader {
|
||||
inner: Arc<InnerNRTReader>,
|
||||
watch_handle: WatchHandle,
|
||||
}
|
||||
|
||||
impl NRTReader {
|
||||
pub fn reload(&self) -> crate::Result<()> {
|
||||
self.inner.reload()
|
||||
}
|
||||
|
||||
pub fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.inner.searcher()
|
||||
}
|
||||
|
||||
pub(crate) fn create(
|
||||
num_searchers: usize,
|
||||
index: Index,
|
||||
segment_registers: Arc<RwLock<SegmentRegisters>>,
|
||||
watch_callback_list: &WatchCallbackList,
|
||||
) -> crate::Result<Self> {
|
||||
let inner_reader: Arc<InnerNRTReader> = Arc::new(InnerNRTReader {
|
||||
num_searchers,
|
||||
index,
|
||||
searcher_pool: Pool::new(),
|
||||
segment_registers,
|
||||
});
|
||||
let inner_reader_weak: Weak<InnerNRTReader> = Arc::downgrade(&inner_reader);
|
||||
let watch_handle = watch_callback_list.subscribe(Box::new(move || {
|
||||
if let Some(nrt_reader_arc) = inner_reader_weak.upgrade() {
|
||||
let _ = nrt_reader_arc.reload();
|
||||
}
|
||||
}));
|
||||
inner_reader.reload()?;
|
||||
Ok(NRTReader {
|
||||
inner: inner_reader,
|
||||
watch_handle,
|
||||
})
|
||||
}
|
||||
}
|
||||
180
src/reader/meta_file_reader.rs
Normal file
180
src/reader/meta_file_reader.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
use super::pool::Pool;
|
||||
use crate::core::Segment;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::META_LOCK;
|
||||
use crate::Searcher;
|
||||
use crate::SegmentReader;
|
||||
use crate::{Index, LeasedItem};
|
||||
use crate::{IndexReader, Result};
|
||||
use std::iter::repeat_with;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Defines when a new version of the index should be reloaded.
|
||||
///
|
||||
/// Regardless of whether you search and index in the same process, tantivy does not necessarily
|
||||
/// reflects the change that are commited to your index. `ReloadPolicy` precisely helps you define
|
||||
/// when you want your index to be reloaded.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ReloadPolicy {
|
||||
/// The index is entirely reloaded manually.
|
||||
/// All updates of the index should be manual.
|
||||
///
|
||||
/// No change is reflected automatically. You are required to call `.load_seacher()` manually.
|
||||
Manual,
|
||||
/// The index is reloaded within milliseconds after a new commit is available.
|
||||
/// This is made possible by watching changes in the `meta.json` file.
|
||||
OnCommit, // TODO add NEAR_REAL_TIME(target_ms)
|
||||
}
|
||||
|
||||
/// `IndexReader` builder
|
||||
///
|
||||
/// It makes it possible to set the following values.
|
||||
///
|
||||
/// - `num_searchers` (by default, the number of detected CPU threads):
|
||||
///
|
||||
/// When `num_searchers` queries are requested at the same time, the `num_searchers` will block
|
||||
/// until the one of the searcher in-use gets released.
|
||||
/// - `reload_policy` (by default `ReloadPolicy::OnCommit`):
|
||||
///
|
||||
/// See [`ReloadPolicy`](./enum.ReloadPolicy.html) for more details.
|
||||
#[derive(Clone)]
|
||||
pub struct IndexReaderBuilder {
|
||||
num_searchers: usize,
|
||||
reload_policy: ReloadPolicy,
|
||||
index: Index,
|
||||
}
|
||||
|
||||
impl IndexReaderBuilder {
|
||||
pub(crate) fn new(index: Index) -> IndexReaderBuilder {
|
||||
IndexReaderBuilder {
|
||||
num_searchers: num_cpus::get(),
|
||||
reload_policy: ReloadPolicy::Manual,
|
||||
index,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the reader.
|
||||
///
|
||||
/// Building the reader is a non-trivial operation that requires
|
||||
/// to open different segment readers. It may take hundreds of milliseconds
|
||||
/// of time and it may return an error.
|
||||
/// TODO(pmasurel) Use the `TryInto` trait once it is available in stable.
|
||||
pub fn try_into(self) -> Result<IndexReader> {
|
||||
let inner_reader = MetaFileIndexReaderInner {
|
||||
index: self.index,
|
||||
num_searchers: self.num_searchers,
|
||||
searcher_pool: Pool::new(),
|
||||
};
|
||||
inner_reader.reload()?;
|
||||
let inner_reader_arc = Arc::new(inner_reader);
|
||||
let watch_handle_opt: Option<WatchHandle>;
|
||||
match self.reload_policy {
|
||||
ReloadPolicy::Manual => {
|
||||
// No need to set anything...
|
||||
watch_handle_opt = None;
|
||||
}
|
||||
ReloadPolicy::OnCommit => {
|
||||
let inner_reader_arc_clone = inner_reader_arc.clone();
|
||||
let callback = move || {
|
||||
if let Err(err) = inner_reader_arc_clone.reload() {
|
||||
error!(
|
||||
"Error while loading searcher after commit was detected. {:?}",
|
||||
err
|
||||
);
|
||||
}
|
||||
};
|
||||
let watch_handle = inner_reader_arc
|
||||
.index
|
||||
.directory()
|
||||
.watch(Box::new(callback))?;
|
||||
watch_handle_opt = Some(watch_handle);
|
||||
}
|
||||
}
|
||||
Ok(IndexReader::from(MetaFileIndexReader {
|
||||
inner: inner_reader_arc,
|
||||
watch_handle_opt,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Sets the reload_policy.
|
||||
///
|
||||
/// See [`ReloadPolicy`](./enum.ReloadPolicy.html) for more details.
|
||||
pub fn reload_policy(mut self, reload_policy: ReloadPolicy) -> IndexReaderBuilder {
|
||||
self.reload_policy = reload_policy;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the number of `Searcher` in the searcher pool.
|
||||
pub fn num_searchers(mut self, num_searchers: usize) -> IndexReaderBuilder {
|
||||
self.num_searchers = num_searchers;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
struct MetaFileIndexReaderInner {
|
||||
num_searchers: usize,
|
||||
searcher_pool: Pool<Searcher>,
|
||||
index: Index,
|
||||
}
|
||||
|
||||
impl MetaFileIndexReaderInner {
|
||||
fn load_segment_readers(&self) -> crate::Result<Vec<SegmentReader>> {
|
||||
// We keep the lock until we have effectively finished opening the
|
||||
// the `SegmentReader` because it prevents a diffferent process
|
||||
// to garbage collect these file while we open them.
|
||||
//
|
||||
// Once opened, on linux & mac, the mmap will remain valid after
|
||||
// the file has been deleted
|
||||
// On windows, the file deletion will fail.
|
||||
let _meta_lock = self.index.directory().acquire_lock(&META_LOCK)?;
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()
|
||||
}
|
||||
|
||||
fn reload(&self) -> crate::Result<()> {
|
||||
let segment_readers: Vec<SegmentReader> = self.load_segment_readers()?;
|
||||
let schema = self.index.schema();
|
||||
let searchers = repeat_with(|| {
|
||||
Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone())
|
||||
})
|
||||
.take(self.num_searchers)
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
fn searchable_segments(&self) -> crate::Result<Vec<Segment>> {
|
||||
self.index.searchable_segments()
|
||||
}
|
||||
|
||||
fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.searcher_pool.acquire()
|
||||
}
|
||||
}
|
||||
|
||||
/// `IndexReader` is your entry point to read and search the index.
|
||||
///
|
||||
/// It controls when a new version of the index should be loaded and lends
|
||||
/// you instances of `Searcher` for the last loaded version.
|
||||
///
|
||||
/// `Clone` does not clone the different pool of searcher. `IndexReader`
|
||||
/// just wraps and `Arc`.
|
||||
#[derive(Clone)]
|
||||
pub struct MetaFileIndexReader {
|
||||
inner: Arc<MetaFileIndexReaderInner>,
|
||||
watch_handle_opt: Option<WatchHandle>,
|
||||
}
|
||||
|
||||
impl MetaFileIndexReader {
|
||||
pub fn reload(&self) -> crate::Result<()> {
|
||||
self.inner.reload()
|
||||
}
|
||||
pub fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.inner.searcher()
|
||||
}
|
||||
}
|
||||
@@ -1,173 +1,20 @@
|
||||
mod index_writer_reader;
|
||||
mod meta_file_reader;
|
||||
mod pool;
|
||||
|
||||
use self::meta_file_reader::MetaFileIndexReader;
|
||||
pub use self::meta_file_reader::{IndexReaderBuilder, ReloadPolicy};
|
||||
pub use self::pool::LeasedItem;
|
||||
use self::pool::Pool;
|
||||
use crate::core::Segment;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::META_LOCK;
|
||||
use crate::Index;
|
||||
use crate::Result;
|
||||
pub(crate) use crate::reader::index_writer_reader::{NRTReader};
|
||||
use crate::Searcher;
|
||||
use crate::SegmentReader;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
/// Defines when a new version of the index should be reloaded.
|
||||
///
|
||||
/// Regardless of whether you search and index in the same process, tantivy does not necessarily
|
||||
/// reflects the change that are commited to your index. `ReloadPolicy` precisely helps you define
|
||||
/// when you want your index to be reloaded.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ReloadPolicy {
|
||||
/// The index is entirely reloaded manually.
|
||||
/// All updates of the index should be manual.
|
||||
///
|
||||
/// No change is reflected automatically. You are required to call `.load_seacher()` manually.
|
||||
Manual,
|
||||
/// The index is reloaded within milliseconds after a new commit is available.
|
||||
/// This is made possible by watching changes in the `meta.json` file.
|
||||
OnCommit, // TODO add NEAR_REAL_TIME(target_ms)
|
||||
}
|
||||
|
||||
/// `IndexReader` builder
|
||||
///
|
||||
/// It makes it possible to set the following values.
|
||||
///
|
||||
/// - `num_searchers` (by default, the number of detected CPU threads):
|
||||
///
|
||||
/// When `num_searchers` queries are requested at the same time, the `num_searchers` will block
|
||||
/// until the one of the searcher in-use gets released.
|
||||
/// - `reload_policy` (by default `ReloadPolicy::OnCommit`):
|
||||
///
|
||||
/// See [`ReloadPolicy`](./enum.ReloadPolicy.html) for more details.
|
||||
#[derive(Clone)]
|
||||
pub struct IndexReaderBuilder {
|
||||
num_searchers: usize,
|
||||
reload_policy: ReloadPolicy,
|
||||
index: Index,
|
||||
}
|
||||
|
||||
impl IndexReaderBuilder {
|
||||
pub(crate) fn new(index: Index) -> IndexReaderBuilder {
|
||||
IndexReaderBuilder {
|
||||
num_searchers: num_cpus::get(),
|
||||
reload_policy: ReloadPolicy::OnCommit,
|
||||
index,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the reader.
|
||||
///
|
||||
/// Building the reader is a non-trivial operation that requires
|
||||
/// to open different segment readers. It may take hundreds of milliseconds
|
||||
/// of time and it may return an error.
|
||||
/// TODO(pmasurel) Use the `TryInto` trait once it is available in stable.
|
||||
pub fn try_into(self) -> Result<IndexReader> {
|
||||
let inner_reader = InnerIndexReader {
|
||||
index: self.index,
|
||||
num_searchers: self.num_searchers,
|
||||
searcher_pool: Pool::new(),
|
||||
};
|
||||
inner_reader.reload()?;
|
||||
let inner_reader_arc = Arc::new(inner_reader);
|
||||
let watch_handle_opt: Option<WatchHandle>;
|
||||
match self.reload_policy {
|
||||
ReloadPolicy::Manual => {
|
||||
// No need to set anything...
|
||||
watch_handle_opt = None;
|
||||
}
|
||||
ReloadPolicy::OnCommit => {
|
||||
let inner_reader_arc_clone = inner_reader_arc.clone();
|
||||
let callback = move || {
|
||||
if let Err(err) = inner_reader_arc_clone.reload() {
|
||||
error!(
|
||||
"Error while loading searcher after commit was detected. {:?}",
|
||||
err
|
||||
);
|
||||
}
|
||||
};
|
||||
let watch_handle = inner_reader_arc
|
||||
.index
|
||||
.directory()
|
||||
.watch(Box::new(callback))?;
|
||||
watch_handle_opt = Some(watch_handle);
|
||||
}
|
||||
}
|
||||
Ok(IndexReader {
|
||||
inner: inner_reader_arc,
|
||||
watch_handle_opt,
|
||||
})
|
||||
}
|
||||
|
||||
/// Sets the reload_policy.
|
||||
///
|
||||
/// See [`ReloadPolicy`](./enum.ReloadPolicy.html) for more details.
|
||||
pub fn reload_policy(mut self, reload_policy: ReloadPolicy) -> IndexReaderBuilder {
|
||||
self.reload_policy = reload_policy;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the number of `Searcher` in the searcher pool.
|
||||
pub fn num_searchers(mut self, num_searchers: usize) -> IndexReaderBuilder {
|
||||
self.num_searchers = num_searchers;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
struct InnerIndexReader {
|
||||
num_searchers: usize,
|
||||
searcher_pool: Pool<Searcher>,
|
||||
index: Index,
|
||||
}
|
||||
|
||||
impl InnerIndexReader {
|
||||
fn reload(&self) -> Result<()> {
|
||||
let segment_readers: Vec<SegmentReader> = {
|
||||
let _meta_lock = self.index.directory().acquire_lock(&META_LOCK)?;
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?
|
||||
};
|
||||
let schema = self.index.schema();
|
||||
let searchers = (0..self.num_searchers)
|
||||
.map(|_| Searcher::new(schema.clone(), self.index.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
self.index.searchable_segments()
|
||||
}
|
||||
|
||||
fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.searcher_pool.acquire()
|
||||
}
|
||||
}
|
||||
|
||||
/// `IndexReader` is your entry point to read and search the index.
|
||||
///
|
||||
/// It controls when a new version of the index should be loaded and lends
|
||||
/// you instances of `Searcher` for the last loaded version.
|
||||
///
|
||||
/// `Clone` does not clone the different pool of searcher. `IndexReader`
|
||||
/// just wraps and `Arc`.
|
||||
#[derive(Clone)]
|
||||
pub struct IndexReader {
|
||||
inner: Arc<InnerIndexReader>,
|
||||
watch_handle_opt: Option<WatchHandle>,
|
||||
pub enum IndexReader {
|
||||
FromMetaFile(MetaFileIndexReader),
|
||||
NRT(NRTReader),
|
||||
}
|
||||
|
||||
impl IndexReader {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn index(&self) -> Index {
|
||||
self.inner.index.clone()
|
||||
}
|
||||
|
||||
/// Update searchers so that they reflect the state of the last
|
||||
/// `.commit()`.
|
||||
///
|
||||
@@ -177,8 +24,11 @@ impl IndexReader {
|
||||
///
|
||||
/// This automatic reload can take 10s of milliseconds to kick in however, and in unit tests
|
||||
/// it can be nice to deterministically force the reload of searchers.
|
||||
pub fn reload(&self) -> Result<()> {
|
||||
self.inner.reload()
|
||||
pub fn reload(&self) -> crate::Result<()> {
|
||||
match self {
|
||||
IndexReader::FromMetaFile(meta_file_reader) => meta_file_reader.reload(),
|
||||
IndexReader::NRT(nrt_reader) => nrt_reader.reload(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a searcher
|
||||
@@ -192,6 +42,21 @@ impl IndexReader {
|
||||
/// The same searcher must be used for a given query, as it ensures
|
||||
/// the use of a consistent segment set.
|
||||
pub fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.inner.searcher()
|
||||
match self {
|
||||
IndexReader::FromMetaFile(meta_file_reader) => meta_file_reader.searcher(),
|
||||
IndexReader::NRT(nrt_reader) => nrt_reader.searcher(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<MetaFileIndexReader> for IndexReader {
|
||||
fn from(meta_file_reader: MetaFileIndexReader) -> Self {
|
||||
IndexReader::FromMetaFile(meta_file_reader)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NRTReader> for IndexReader {
|
||||
fn from(nrt_reader: NRTReader) -> Self {
|
||||
IndexReader::NRT(nrt_reader)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use crate::query::Query;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Value;
|
||||
use crate::tokenizer::{TextAnalyzer, Token};
|
||||
use crate::tokenizer::BoxedTokenizer;
|
||||
use crate::tokenizer::{Token, TokenStream};
|
||||
use crate::Document;
|
||||
use crate::Result;
|
||||
use crate::Searcher;
|
||||
@@ -141,7 +142,7 @@ impl Snippet {
|
||||
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
|
||||
/// has to be a valid string.
|
||||
fn search_fragments<'a>(
|
||||
tokenizer: &TextAnalyzer,
|
||||
tokenizer: &BoxedTokenizer,
|
||||
text: &'a str,
|
||||
terms: &BTreeMap<String, f32>,
|
||||
max_num_chars: usize,
|
||||
@@ -250,7 +251,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
/// ```
|
||||
pub struct SnippetGenerator {
|
||||
terms_text: BTreeMap<String, f32>,
|
||||
tokenizer: TextAnalyzer,
|
||||
tokenizer: BoxedTokenizer,
|
||||
field: Field,
|
||||
max_num_chars: usize,
|
||||
}
|
||||
@@ -346,11 +347,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
let terms = btreemap! {
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -372,12 +374,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_scored_fragment() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
{
|
||||
let terms = btreemap! {
|
||||
String::from("rust") =>1.0f32,
|
||||
String::from("language") => 0.9f32
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
@@ -386,12 +389,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
|
||||
}
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
{
|
||||
let terms = btreemap! {
|
||||
String::from("rust") =>0.9f32,
|
||||
String::from("language") => 1.0f32
|
||||
};
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
//assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -405,12 +409,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_in_second_fragment() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f g";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
@@ -427,12 +433,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_term_at_the_end_of_fragment() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f f g";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -449,13 +457,15 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_second_fragment_has_the_highest_score() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f g";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
terms.insert(String::from("a"), 0.9);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -472,12 +482,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_term_not_in_text() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("z"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -488,10 +500,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_no_terms() {
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], &text);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::from(RawTokenizer)
|
||||
//! let tokenizer = RawTokenizer
|
||||
//! .filter(AlphaNumOnlyFilter);
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("hello there");
|
||||
@@ -10,7 +10,7 @@
|
||||
//! // contains a space
|
||||
//! assert!(stream.next().is_none());
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! .filter(AlphaNumOnlyFilter);
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("hello there 💣");
|
||||
@@ -19,30 +19,56 @@
|
||||
//! // the "emoji" is dropped because its not an alphanum
|
||||
//! assert!(stream.next().is_none());
|
||||
//! ```
|
||||
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
|
||||
/// `TokenFilter` that removes all tokens that contain non
|
||||
/// ascii alphanumeric characters.
|
||||
#[derive(Clone)]
|
||||
pub struct AlphaNumOnlyFilter;
|
||||
|
||||
pub struct AlphaNumOnlyFilterStream<'a> {
|
||||
tail: BoxTokenStream<'a>,
|
||||
pub struct AlphaNumOnlyFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<'a> AlphaNumOnlyFilterStream<'a> {
|
||||
impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
token.text.chars().all(|c| c.is_ascii_alphanumeric())
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenFilter for AlphaNumOnlyFilter {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
|
||||
fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
|
||||
AlphaNumOnlyFilterStream { tail }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
AlphaNumOnlyFilterStream::wrap(token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
@@ -52,12 +78,4 @@ impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use std::mem;
|
||||
|
||||
/// This class converts alphabetic, numeric, and symbolic Unicode characters
|
||||
@@ -7,21 +7,26 @@ use std::mem;
|
||||
#[derive(Clone)]
|
||||
pub struct AsciiFoldingFilter;
|
||||
|
||||
impl TokenFilter for AsciiFoldingFilter {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
From::from(AsciiFoldingFilterTokenStream {
|
||||
tail: token_stream,
|
||||
buffer: String::with_capacity(100),
|
||||
})
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for AsciiFoldingFilter
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = AsciiFoldingFilterTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
AsciiFoldingFilterTokenStream::wrap(token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AsciiFoldingFilterTokenStream<'a> {
|
||||
pub struct AsciiFoldingFilterTokenStream<TailTokenStream> {
|
||||
buffer: String,
|
||||
tail: BoxTokenStream<'a>,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
|
||||
impl<TailTokenStream> TokenStream for AsciiFoldingFilterTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
@@ -43,6 +48,18 @@ impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> AsciiFoldingFilterTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn wrap(tail: TailTokenStream) -> AsciiFoldingFilterTokenStream<TailTokenStream> {
|
||||
AsciiFoldingFilterTokenStream {
|
||||
tail,
|
||||
buffer: String::with_capacity(100),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a string that represents the ascii folded version of
|
||||
// the character. If the `char` does not require ascii folding
|
||||
// (e.g. simple ASCII chars like `A`) or if the `char`
|
||||
@@ -1544,7 +1561,8 @@ mod tests {
|
||||
use crate::tokenizer::AsciiFoldingFilter;
|
||||
use crate::tokenizer::RawTokenizer;
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::tokenizer::Tokenizer;
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
@@ -1561,7 +1579,7 @@ mod tests {
|
||||
|
||||
fn folding_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = Vec::new();
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
SimpleTokenizer
|
||||
.filter(AsciiFoldingFilter)
|
||||
.token_stream(text)
|
||||
.process(&mut |token| {
|
||||
@@ -1571,9 +1589,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn folding_using_raw_tokenizer_helper(text: &str) -> String {
|
||||
let mut token_stream = TextAnalyzer::from(RawTokenizer)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.token_stream(text);
|
||||
let mut token_stream = RawTokenizer.filter(AsciiFoldingFilter).token_stream(text);
|
||||
token_stream.advance();
|
||||
token_stream.token().text.clone()
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use crate::schema::FACET_SEP_BYTE;
|
||||
|
||||
/// The `FacetTokenizer` process a `Facet` binary representation
|
||||
@@ -25,14 +25,15 @@ pub struct FacetTokenStream<'a> {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for FacetTokenizer {
|
||||
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
impl<'a> Tokenizer<'a> for FacetTokenizer {
|
||||
type TokenStreamImpl = FacetTokenStream<'a>;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
FacetTokenStream {
|
||||
text,
|
||||
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
|
||||
token: Token::default(),
|
||||
}
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,7 +84,7 @@ mod tests {
|
||||
|
||||
use super::FacetTokenizer;
|
||||
use crate::schema::Facet;
|
||||
use crate::tokenizer::{Token, Tokenizer};
|
||||
use crate::tokenizer::{Token, TokenStream, Tokenizer};
|
||||
|
||||
#[test]
|
||||
fn test_facet_tokenizer() {
|
||||
|
||||
@@ -1,23 +1,24 @@
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
use std::mem;
|
||||
|
||||
impl TokenFilter for LowerCaser {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream::from(LowerCaserTokenStream {
|
||||
tail: token_stream,
|
||||
buffer: String::with_capacity(100),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Token filter that lowercase terms.
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaser;
|
||||
|
||||
pub struct LowerCaserTokenStream<'a> {
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
LowerCaserTokenStream::wrap(token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<TailTokenStream> {
|
||||
buffer: String,
|
||||
tail: BoxTokenStream<'a>,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
// writes a lowercased version of text into output.
|
||||
@@ -30,7 +31,18 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for LowerCaserTokenStream<'a> {
|
||||
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
@@ -44,19 +56,26 @@ impl<'a> TokenStream for LowerCaserTokenStream<'a> {
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
|
||||
LowerCaserTokenStream {
|
||||
tail,
|
||||
buffer: String::with_capacity(100),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};
|
||||
use crate::tokenizer::LowerCaser;
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::tokenizer::Tokenizer;
|
||||
|
||||
#[test]
|
||||
fn test_to_lower_case() {
|
||||
@@ -68,9 +87,7 @@ mod tests {
|
||||
|
||||
fn lowercase_helper(text: &str) -> Vec<String> {
|
||||
let mut tokens = vec![];
|
||||
let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
|
||||
.filter(LowerCaser)
|
||||
.token_stream(text);
|
||||
let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text);
|
||||
while token_stream.advance() {
|
||||
let token_text = token_stream.token().text.clone();
|
||||
tokens.push(token_text);
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
|
||||
//! let en_stem = SimpleTokenizer
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser)
|
||||
//! .filter(Stemmer::new(Language::English));
|
||||
@@ -109,7 +109,7 @@
|
||||
//! let index = Index::create_in_ram(schema);
|
||||
//!
|
||||
//! // We need to register our tokenizer :
|
||||
//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
//! let custom_en_tokenizer = SimpleTokenizer
|
||||
//! .filter(RemoveLongFilter::limit(40))
|
||||
//! .filter(LowerCaser);
|
||||
//! index
|
||||
@@ -143,11 +143,10 @@ pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::{Language, Stemmer};
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
|
||||
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
|
||||
pub use self::tokenizer::{
|
||||
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
|
||||
};
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
|
||||
@@ -161,9 +160,9 @@ pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::{
|
||||
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
|
||||
Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, Tokenizer,
|
||||
TokenizerManager,
|
||||
};
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
|
||||
/// This is a function that can be used in tests and doc tests
|
||||
/// to assert a token's correctness.
|
||||
@@ -230,7 +229,7 @@ pub mod tests {
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register(
|
||||
"el_stem",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::Greek)),
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
|
||||
/// Tokenize the text by splitting words into n-grams of the given size(s)
|
||||
///
|
||||
@@ -130,9 +129,11 @@ pub struct NgramTokenStream<'a> {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for NgramTokenizer {
|
||||
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
From::from(NgramTokenStream {
|
||||
impl<'a> Tokenizer<'a> for NgramTokenizer {
|
||||
type TokenStreamImpl = NgramTokenStream<'a>;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
NgramTokenStream {
|
||||
ngram_charidx_iterator: StutteringIterator::new(
|
||||
CodepointFrontiers::for_str(text),
|
||||
self.min_gram,
|
||||
@@ -141,7 +142,7 @@ impl Tokenizer for NgramTokenizer {
|
||||
prefix_only: self.prefix_only,
|
||||
text,
|
||||
token: Token::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -307,10 +308,10 @@ mod tests {
|
||||
use super::NgramTokenizer;
|
||||
use super::StutteringIterator;
|
||||
use crate::tokenizer::tests::assert_token;
|
||||
use crate::tokenizer::tokenizer::Tokenizer;
|
||||
use crate::tokenizer::{BoxTokenStream, Token};
|
||||
use crate::tokenizer::tokenizer::{TokenStream, Tokenizer};
|
||||
use crate::tokenizer::Token;
|
||||
|
||||
fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
|
||||
fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
|
||||
let mut tokens: Vec<Token> = vec![];
|
||||
tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
|
||||
tokens
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
|
||||
/// For each value of the field, emit a single unprocessed token.
|
||||
#[derive(Clone)]
|
||||
@@ -10,8 +9,10 @@ pub struct RawTokenStream {
|
||||
has_token: bool,
|
||||
}
|
||||
|
||||
impl Tokenizer for RawTokenizer {
|
||||
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
impl<'a> Tokenizer<'a> for RawTokenizer {
|
||||
type TokenStreamImpl = RawTokenStream;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let token = Token {
|
||||
offset_from: 0,
|
||||
offset_to: text.len(),
|
||||
@@ -23,7 +24,6 @@ impl Tokenizer for RawTokenizer {
|
||||
token,
|
||||
has_token: true,
|
||||
}
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! .filter(RemoveLongFilter::limit(5));
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("toolong nice");
|
||||
@@ -13,7 +13,6 @@
|
||||
//! ```
|
||||
//!
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
/// than a given number of bytes (in UTF-8 representation).
|
||||
@@ -32,27 +31,56 @@ impl RemoveLongFilter {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> RemoveLongFilterStream<'a> {
|
||||
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
token.text.len() < self.token_length_limit
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenFilter for RemoveLongFilter {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream::from(RemoveLongFilterStream {
|
||||
token_length_limit: self.length_limit,
|
||||
tail: token_stream,
|
||||
})
|
||||
fn wrap(
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
) -> RemoveLongFilterStream<TailTokenStream> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit,
|
||||
tail,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RemoveLongFilterStream<'a> {
|
||||
token_length_limit: usize,
|
||||
tail: BoxTokenStream<'a>,
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
RemoveLongFilterStream::wrap(self.length_limit, token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for RemoveLongFilterStream<'a> {
|
||||
pub struct RemoveLongFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
@@ -61,12 +89,4 @@ impl<'a> TokenStream for RemoveLongFilterStream<'a> {
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use super::BoxTokenStream;
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use std::str::CharIndices;
|
||||
|
||||
@@ -12,13 +11,15 @@ pub struct SimpleTokenStream<'a> {
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl Tokenizer for SimpleTokenizer {
|
||||
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream::from(SimpleTokenStream {
|
||||
impl<'a> Tokenizer<'a> for SimpleTokenizer {
|
||||
type TokenStreamImpl = SimpleTokenStream<'a>;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
SimpleTokenStream {
|
||||
text,
|
||||
chars: text.char_indices(),
|
||||
token: Token::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
|
||||
/// Available stemmer languages.
|
||||
@@ -76,22 +75,38 @@ impl Default for Stemmer {
|
||||
}
|
||||
}
|
||||
|
||||
impl TokenFilter for Stemmer {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||
BoxTokenStream::from(StemmerTokenStream {
|
||||
tail: token_stream,
|
||||
stemmer: inner_stemmer,
|
||||
})
|
||||
StemmerTokenStream::wrap(inner_stemmer, token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StemmerTokenStream<'a> {
|
||||
tail: BoxTokenStream<'a>,
|
||||
pub struct StemmerTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
stemmer: rust_stemmers::Stemmer,
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for StemmerTokenStream<'a> {
|
||||
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if !self.tail.advance() {
|
||||
return false;
|
||||
@@ -102,12 +117,16 @@ impl<'a> TokenStream for StemmerTokenStream<'a> {
|
||||
self.token_mut().text.push_str(&stemmed_str);
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn wrap(
|
||||
stemmer: rust_stemmers::Stemmer,
|
||||
tail: TailTokenStream,
|
||||
) -> StemmerTokenStream<TailTokenStream> {
|
||||
StemmerTokenStream { tail, stemmer }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
//! let tokenizer = SimpleTokenizer
|
||||
//! .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
|
||||
//!
|
||||
//! let mut stream = tokenizer.token_stream("the fox is crafty");
|
||||
@@ -11,7 +11,6 @@
|
||||
//! assert!(stream.next().is_none());
|
||||
//! ```
|
||||
use super::{Token, TokenFilter, TokenStream};
|
||||
use crate::tokenizer::BoxTokenStream;
|
||||
use fnv::FnvHasher;
|
||||
use std::collections::HashSet;
|
||||
use std::hash::BuildHasherDefault;
|
||||
@@ -49,27 +48,53 @@ impl StopWordFilter {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StopWordFilterStream<'a> {
|
||||
pub struct StopWordFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
words: StopWordHashSet,
|
||||
tail: BoxTokenStream<'a>,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl TokenFilter for StopWordFilter {
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream::from(StopWordFilterStream {
|
||||
words: self.words.clone(),
|
||||
tail: token_stream,
|
||||
})
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for StopWordFilter
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
type ResultTokenStream = StopWordFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
StopWordFilterStream::wrap(self.words.clone(), token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> StopWordFilterStream<'a> {
|
||||
impl<TailTokenStream> StopWordFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
!self.words.contains(&token.text)
|
||||
}
|
||||
|
||||
fn wrap(
|
||||
words: StopWordHashSet,
|
||||
tail: TailTokenStream,
|
||||
) -> StopWordFilterStream<TailTokenStream> {
|
||||
StopWordFilterStream { words, tail }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for StopWordFilterStream<'a> {
|
||||
impl<TailTokenStream> TokenStream for StopWordFilterStream<TailTokenStream>
|
||||
where
|
||||
TailTokenStream: TokenStream,
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
@@ -78,14 +103,6 @@ impl<'a> TokenStream for StopWordFilterStream<'a> {
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for StopWordFilter {
|
||||
|
||||
@@ -1,21 +1,23 @@
|
||||
use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
|
||||
use std::ops::DerefMut;
|
||||
use crate::tokenizer::{Token, TokenStream};
|
||||
|
||||
const POSITION_GAP: usize = 2;
|
||||
|
||||
pub(crate) struct TokenStreamChain<'a> {
|
||||
pub(crate) struct TokenStreamChain<TTokenStream: TokenStream> {
|
||||
offsets: Vec<usize>,
|
||||
token_streams: Vec<BoxTokenStream<'a>>,
|
||||
token_streams: Vec<TTokenStream>,
|
||||
position_shift: usize,
|
||||
stream_idx: usize,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl<'a> TokenStreamChain<'a> {
|
||||
impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
|
||||
where
|
||||
TTokenStream: TokenStream,
|
||||
{
|
||||
pub fn new(
|
||||
offsets: Vec<usize>,
|
||||
token_streams: Vec<BoxTokenStream<'a>>,
|
||||
) -> TokenStreamChain<'a> {
|
||||
token_streams: Vec<TTokenStream>,
|
||||
) -> TokenStreamChain<TTokenStream> {
|
||||
TokenStreamChain {
|
||||
offsets,
|
||||
stream_idx: 0,
|
||||
@@ -26,10 +28,13 @@ impl<'a> TokenStreamChain<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for TokenStreamChain<'a> {
|
||||
impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
|
||||
where
|
||||
TTokenStream: TokenStream,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.stream_idx < self.token_streams.len() {
|
||||
let token_stream = self.token_streams[self.stream_idx].deref_mut();
|
||||
let token_stream = &mut self.token_streams[self.stream_idx];
|
||||
if token_stream.advance() {
|
||||
let token = token_stream.token();
|
||||
let offset_offset = self.offsets[self.stream_idx];
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
|
||||
use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
|
||||
use std::cmp::Ordering;
|
||||
|
||||
/// Struct representing pre-tokenized text
|
||||
@@ -41,9 +41,9 @@ impl PreTokenizedStream {
|
||||
/// Creates a TokenStream from PreTokenizedString array
|
||||
pub fn chain_tokenized_strings<'a>(
|
||||
tok_strings: &'a [&'a PreTokenizedString],
|
||||
) -> BoxTokenStream {
|
||||
) -> Box<dyn TokenStream + 'a> {
|
||||
if tok_strings.len() == 1 {
|
||||
PreTokenizedStream::from((*tok_strings[0]).clone()).into()
|
||||
Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()))
|
||||
} else {
|
||||
let mut offsets = vec![];
|
||||
let mut total_offset = 0;
|
||||
@@ -53,12 +53,11 @@ impl PreTokenizedStream {
|
||||
total_offset += last_token.offset_to;
|
||||
}
|
||||
}
|
||||
// TODO remove the string cloning.
|
||||
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
|
||||
let token_streams: Vec<_> = tok_strings
|
||||
.iter()
|
||||
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
|
||||
.map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
|
||||
.collect();
|
||||
TokenStreamChain::new(offsets, token_streams).into()
|
||||
Box::new(TokenStreamChain::new(offsets, token_streams))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ use crate::tokenizer::TokenStreamChain;
|
||||
/// The tokenizer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
/// Token
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
@@ -34,31 +33,20 @@ impl Default for Token {
|
||||
}
|
||||
}
|
||||
|
||||
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
||||
/// `Tokenizer` are in charge of splitting text into a stream of token
|
||||
/// before indexing.
|
||||
///
|
||||
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
|
||||
pub struct TextAnalyzer {
|
||||
tokenizer: Box<dyn Tokenizer>,
|
||||
token_filters: Vec<BoxTokenFilter>,
|
||||
}
|
||||
/// See the [module documentation](./index.html) for more detail.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This API may change to use associated types.
|
||||
pub trait Tokenizer<'a>: Sized + Clone {
|
||||
/// Type associated to the resulting tokenstream tokenstream.
|
||||
type TokenStreamImpl: TokenStream;
|
||||
|
||||
impl<T: Tokenizer> From<T> for TextAnalyzer {
|
||||
fn from(tokenizer: T) -> Self {
|
||||
TextAnalyzer::new(tokenizer, Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl TextAnalyzer {
|
||||
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
|
||||
///
|
||||
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
|
||||
/// `TextAnalyzer::from(tokenizer)`.
|
||||
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
|
||||
TextAnalyzer {
|
||||
tokenizer: Box::new(tokenizer),
|
||||
token_filters,
|
||||
}
|
||||
}
|
||||
/// Creates a token stream for a given `str`.
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl;
|
||||
|
||||
/// Appends a token filter to the current tokenizer.
|
||||
///
|
||||
@@ -70,26 +58,90 @@ impl TextAnalyzer {
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let en_stem = TextAnalyzer::from(SimpleTokenizer)
|
||||
/// let en_stem = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser)
|
||||
/// .filter(Stemmer::default());
|
||||
/// ```
|
||||
///
|
||||
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
|
||||
self.token_filters.push(token_filter.into());
|
||||
self
|
||||
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
|
||||
where
|
||||
NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>,
|
||||
{
|
||||
ChainTokenizer {
|
||||
head: new_filter,
|
||||
tail: self,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A boxed tokenizer
|
||||
trait BoxedTokenizerTrait: Send + Sync {
|
||||
/// Tokenize a `&str`
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
|
||||
|
||||
/// Tokenize an array`&str`
|
||||
///
|
||||
/// The resulting `TokenStream` is equivalent to what would be obtained if the &str were
|
||||
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
|
||||
/// to prevent accidental `PhraseQuery` to match accross two terms.
|
||||
fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>;
|
||||
|
||||
/// Return a boxed clone of the tokenizer
|
||||
fn boxed_clone(&self) -> BoxedTokenizer;
|
||||
}
|
||||
|
||||
/// A boxed tokenizer
|
||||
pub struct BoxedTokenizer(Box<dyn BoxedTokenizerTrait>);
|
||||
|
||||
impl<T> From<T> for BoxedTokenizer
|
||||
where
|
||||
T: 'static + Send + Sync + for<'a> Tokenizer<'a>,
|
||||
{
|
||||
fn from(tokenizer: T) -> BoxedTokenizer {
|
||||
BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer)))
|
||||
}
|
||||
}
|
||||
|
||||
impl BoxedTokenizer {
|
||||
/// Tokenize a `&str`
|
||||
pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
|
||||
self.0.token_stream(text)
|
||||
}
|
||||
|
||||
/// Tokenize an array`&str`
|
||||
///
|
||||
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
|
||||
/// The resulting `TokenStream` is equivalent to what would be obtained if the &str were
|
||||
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
|
||||
/// to prevent accidental `PhraseQuery` to match accross two terms.
|
||||
pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> {
|
||||
pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
|
||||
self.0.token_stream_texts(texts)
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for BoxedTokenizer {
|
||||
fn clone(&self) -> BoxedTokenizer {
|
||||
self.0.boxed_clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct BoxableTokenizer<A>(A)
|
||||
where
|
||||
A: for<'a> Tokenizer<'a> + Send + Sync;
|
||||
|
||||
impl<A> BoxedTokenizerTrait for BoxableTokenizer<A>
|
||||
where
|
||||
A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
|
||||
{
|
||||
fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
|
||||
Box::new(self.0.token_stream(text))
|
||||
}
|
||||
|
||||
fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
|
||||
assert!(!texts.is_empty());
|
||||
if texts.len() == 1 {
|
||||
self.token_stream(texts[0])
|
||||
Box::new(self.0.token_stream(texts[0]))
|
||||
} else {
|
||||
let mut offsets = vec![];
|
||||
let mut total_offset = 0;
|
||||
@@ -97,124 +149,34 @@ impl TextAnalyzer {
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
}
|
||||
let token_streams: Vec<BoxTokenStream<'a>> = texts
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|text| self.token_stream(text))
|
||||
.collect();
|
||||
From::from(TokenStreamChain::new(offsets, token_streams))
|
||||
let token_streams: Vec<_> =
|
||||
texts.iter().map(|text| self.0.token_stream(text)).collect();
|
||||
Box::new(TokenStreamChain::new(offsets, token_streams))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
let mut token_stream = self.tokenizer.token_stream(text);
|
||||
for token_filter in &self.token_filters {
|
||||
token_stream = token_filter.transform(token_stream);
|
||||
}
|
||||
token_stream
|
||||
fn boxed_clone(&self) -> BoxedTokenizer {
|
||||
self.0.clone().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TextAnalyzer {
|
||||
fn clone(&self) -> Self {
|
||||
TextAnalyzer {
|
||||
tokenizer: self.tokenizer.box_clone(),
|
||||
token_filters: self
|
||||
.token_filters
|
||||
.iter()
|
||||
.map(|token_filter| token_filter.box_clone())
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `Tokenizer` are in charge of splitting text into a stream of token
|
||||
/// before indexing.
|
||||
///
|
||||
/// See the [module documentation](./index.html) for more detail.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This API may change to use associated types.
|
||||
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
|
||||
/// Creates a token stream for a given `str`.
|
||||
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
|
||||
}
|
||||
|
||||
pub trait TokenizerClone {
|
||||
fn box_clone(&self) -> Box<dyn Tokenizer>;
|
||||
}
|
||||
|
||||
impl<T: Tokenizer + Clone> TokenizerClone for T {
|
||||
fn box_clone(&self) -> Box<dyn Tokenizer> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
|
||||
impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
|
||||
fn advance(&mut self) -> bool {
|
||||
let token_stream: &mut dyn TokenStream = self.borrow_mut();
|
||||
token_stream.advance()
|
||||
}
|
||||
|
||||
fn token<'b>(&'b self) -> &'b Token {
|
||||
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
|
||||
fn token(&self) -> &Token {
|
||||
let token_stream: &dyn TokenStream = self.borrow();
|
||||
token_stream.token()
|
||||
}
|
||||
|
||||
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
|
||||
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
let token_stream: &mut dyn TokenStream = self.borrow_mut();
|
||||
token_stream.token_mut()
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
||||
///
|
||||
/// See `TokenStream` for more information.
|
||||
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
||||
|
||||
impl<'a, T> From<T> for BoxTokenStream<'a>
|
||||
where
|
||||
T: TokenStream + 'a,
|
||||
{
|
||||
fn from(token_stream: T) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream(Box::new(token_stream))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for BoxTokenStream<'a> {
|
||||
type Target = dyn TokenStream + 'a;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&*self.0
|
||||
}
|
||||
}
|
||||
impl<'a> DerefMut for BoxTokenStream<'a> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut *self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
|
||||
///
|
||||
/// See `TokenStream` for more information.
|
||||
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
|
||||
|
||||
impl Deref for BoxTokenFilter {
|
||||
type Target = dyn TokenFilter;
|
||||
|
||||
fn deref(&self) -> &dyn TokenFilter {
|
||||
&*self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TokenFilter> From<T> for BoxTokenFilter {
|
||||
fn from(tokenizer: T) -> BoxTokenFilter {
|
||||
BoxTokenFilter(Box::new(tokenizer))
|
||||
}
|
||||
}
|
||||
|
||||
/// `TokenStream` is the result of the tokenization.
|
||||
///
|
||||
/// It consists consumable stream of `Token`s.
|
||||
@@ -224,7 +186,7 @@ impl<T: TokenFilter> From<T> for BoxTokenFilter {
|
||||
/// ```
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
/// let tokenizer = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser);
|
||||
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
|
||||
@@ -263,7 +225,7 @@ pub trait TokenStream {
|
||||
/// ```
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
|
||||
/// let tokenizer = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser);
|
||||
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
|
||||
@@ -281,8 +243,6 @@ pub trait TokenStream {
|
||||
|
||||
/// Helper function to consume the entire `TokenStream`
|
||||
/// and push the tokens to a sink function.
|
||||
///
|
||||
/// Remove this.
|
||||
fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
|
||||
let mut num_tokens_pushed = 0u32;
|
||||
while self.advance() {
|
||||
@@ -293,20 +253,33 @@ pub trait TokenStream {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait TokenFilterClone {
|
||||
fn box_clone(&self) -> BoxTokenFilter;
|
||||
#[derive(Clone)]
|
||||
pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
|
||||
head: HeadTokenFilterFactory,
|
||||
tail: TailTokenizer,
|
||||
}
|
||||
|
||||
impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
|
||||
for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
|
||||
where
|
||||
HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
|
||||
TailTokenizer: Tokenizer<'a>,
|
||||
{
|
||||
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let tail_token_stream = self.tail.token_stream(text);
|
||||
self.head.transform(tail_token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for the pluggable components of `Tokenizer`s.
|
||||
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
|
||||
/// Wraps a token stream and returns the modified one.
|
||||
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
|
||||
}
|
||||
pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
|
||||
/// The resulting `TokenStream` type.
|
||||
type ResultTokenStream: TokenStream;
|
||||
|
||||
impl<T: TokenFilter + Clone> TokenFilterClone for T {
|
||||
fn box_clone(&self) -> BoxTokenFilter {
|
||||
BoxTokenFilter::from(self.clone())
|
||||
}
|
||||
/// Wraps a token stream and returns the modified one.
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use crate::tokenizer::stemmer::Language;
|
||||
use crate::tokenizer::tokenizer::TextAnalyzer;
|
||||
use crate::tokenizer::BoxedTokenizer;
|
||||
use crate::tokenizer::LowerCaser;
|
||||
use crate::tokenizer::RawTokenizer;
|
||||
use crate::tokenizer::RemoveLongFilter;
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
use crate::tokenizer::Stemmer;
|
||||
use crate::tokenizer::Tokenizer;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
@@ -22,16 +23,16 @@ use std::sync::{Arc, RwLock};
|
||||
/// search engine.
|
||||
#[derive(Clone)]
|
||||
pub struct TokenizerManager {
|
||||
tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
|
||||
tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
impl TokenizerManager {
|
||||
/// Registers a new tokenizer associated with a given name.
|
||||
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
|
||||
pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
|
||||
where
|
||||
TextAnalyzer: From<T>,
|
||||
A: Into<BoxedTokenizer>,
|
||||
{
|
||||
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
|
||||
let boxed_tokenizer = tokenizer.into();
|
||||
self.tokenizers
|
||||
.write()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
@@ -39,7 +40,7 @@ impl TokenizerManager {
|
||||
}
|
||||
|
||||
/// Accessing a tokenizer given its name.
|
||||
pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
|
||||
pub fn get(&self, tokenizer_name: &str) -> Option<BoxedTokenizer> {
|
||||
self.tokenizers
|
||||
.read()
|
||||
.expect("Acquiring the lock should never fail")
|
||||
@@ -61,13 +62,13 @@ impl Default for TokenizerManager {
|
||||
manager.register("raw", RawTokenizer);
|
||||
manager.register(
|
||||
"default",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser),
|
||||
);
|
||||
manager.register(
|
||||
"en_stem",
|
||||
TextAnalyzer::from(SimpleTokenizer)
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new(Language::English)),
|
||||
|
||||
Reference in New Issue
Block a user