mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 18:12:55 +00:00

Compare commits

6 commits:

- 43aa53f150
- 811fd0cb9e
- f6847c46d7
- 92dac7af5c
- 801905d77f
- 8f5ac86f30

CHANGELOG.md (10 changed lines)
```diff
@@ -1,3 +1,13 @@
+Tantivy 0.12.0
+======================
+- Removing static dispatch in tokenizers for simplicity. (#762)
+
+## How to update?
+
+Crates relying on a custom tokenizer, or registering a tokenizer in the manager, will require some
+minor changes. See https://github.com/tantivy-search/tantivy/blob/master/examples/custom_tokenizer.rs
+for a code sample.
+
 Tantivy 0.11.3
 =======================
 - Fixed DateTime as a fast field (#735)
```
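The update in practice: the statically dispatched filter chain becomes a boxed `TextAnalyzer` pipeline. A minimal before/after sketch, using only the API visible elsewhere in this diff:

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

fn main() {
    // 0.11: filters were chained directly on the tokenizer (static dispatch):
    // let analyzer = SimpleTokenizer.filter(RemoveLongFilter::limit(40)).filter(LowerCaser);

    // 0.12: wrap the base tokenizer in a TextAnalyzer first; each filter is boxed.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser);

    let mut stream = analyzer.token_stream("Hello, HAPPY tax PAYER!");
    while stream.advance() {
        println!("{:?}", stream.token());
    }
}
```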
```diff
@@ -18,7 +18,7 @@ byteorder = "1.0"
 crc32fast = "1.2.0"
 once_cell = "1.0"
 regex = {version = "1.3.0", default-features = false, features = ["std"]}
-tantivy-fst = "0.1"
+tantivy-fst = "0.2"
 memmap = {version = "0.7", optional=true}
 lz4 = {version="1.20", optional=true}
 snap = {version="0.2"}
```
```diff
@@ -9,7 +9,7 @@
 // - import tokenized text straight from json,
 // - perform a search on documents with pre-tokenized text

-use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
+use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};

 use tantivy::collector::{Count, TopDocs};
 use tantivy::query::TermQuery;
```
```diff
@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {

     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
-    let tokenizer = SimpleTokenizer
+    let tokenizer = TextAnalyzer::from(SimpleTokenizer)
         .filter(LowerCaser)
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
```
```diff
@@ -6,6 +6,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
 use crate::collector::{
     CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
 };
+use crate::fastfield::FastFieldReader;
 use crate::schema::Field;
 use crate::DocAddress;
 use crate::DocId;
@@ -61,6 +62,34 @@ impl fmt::Debug for TopDocs {
     }
 }

+struct ScorerByFastFieldReader {
+    ff_reader: FastFieldReader<u64>,
+}
+
+impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
+    fn score(&self, doc: DocId) -> u64 {
+        self.ff_reader.get_u64(u64::from(doc))
+    }
+}
+
+struct ScorerByField {
+    field: Field,
+}
+
+impl CustomScorer<u64> for ScorerByField {
+    type Child = ScorerByFastFieldReader;
+
+    fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child> {
+        let ff_reader = segment_reader
+            .fast_fields()
+            .u64(self.field)
+            .ok_or_else(|| {
+                crate::Error::SchemaError(format!("Field requested is not a i64/u64 fast field."))
+            })?;
+        Ok(ScorerByFastFieldReader { ff_reader })
+    }
+}
+
 impl TopDocs {
     /// Creates a top score collector, with a number of documents equal to "limit".
     ///
@@ -143,14 +172,7 @@ impl TopDocs {
         self,
         field: Field,
     ) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
-        self.custom_score(move |segment_reader: &SegmentReader| {
-            let ff_reader = segment_reader
-                .fast_fields()
-                .u64(field)
-                .expect("Field requested is not a i64/u64 fast field.");
-            //TODO error message missmatch actual behavior for i64
-            move |doc: DocId| ff_reader.get(doc)
-        })
+        self.custom_score(ScorerByField { field })
     }

     /// Ranks the documents using a custom score.
@@ -572,7 +594,6 @@ mod tests {
     }

     #[test]
-    #[should_panic(expected = "Field requested is not a i64/u64 fast field")]
     fn test_field_not_fast_field() {
         let mut schema_builder = Schema::builder();
         let title = schema_builder.add_text_field(TITLE, TEXT);
@@ -587,7 +608,12 @@ mod tests {
         let searcher = index.reader().unwrap().searcher();
         let segment = searcher.segment_reader(0);
         let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
-        assert!(top_collector.for_segment(0, segment).is_ok());
+        let err = top_collector.for_segment(0, segment);
+        if let Err(crate::Error::SchemaError(msg)) = err {
+            assert_eq!(msg, "Field requested is not a i64/u64 fast field.");
+        } else {
+            assert!(false);
+        }
     }

     fn index(
```
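For context, this is how the refactored collector is exercised from the public API. The change replaces a closure capturing a fast-field reader (which panicked on a misconfigured field) with the named `ScorerByField`, which surfaces a `SchemaError` instead. The schema and documents below are illustrative, not part of the commit:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let rating = schema_builder.add_u64_field("rating", FAST); // must be declared FAST
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "a", rating => 3u64));
    index_writer.add_document(doc!(title => "b", rating => 7u64));
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Internally this now goes through ScorerByField / ScorerByFastFieldReader,
    // failing cleanly when `rating` is not a u64/i64 fast field.
    let collector = TopDocs::with_limit(2).order_by_u64_field(rating);
    for (rating_value, doc_address) in searcher.search(&AllQuery, &collector)? {
        println!("rating={} doc={:?}", rating_value, doc_address);
    }
    Ok(())
}
```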
```diff
@@ -20,8 +20,7 @@ use crate::reader::IndexReaderBuilder;
 use crate::schema::Field;
 use crate::schema::FieldType;
 use crate::schema::Schema;
-use crate::tokenizer::BoxedTokenizer;
-use crate::tokenizer::TokenizerManager;
+use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::IndexWriter;
 use crate::Result;
 use num_cpus;
@@ -173,11 +172,11 @@ impl Index {
     }

     /// Helper to access the tokenizer associated to a specific field.
-    pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> {
+    pub fn tokenizer_for_field(&self, field: Field) -> Result<TextAnalyzer> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
         let tokenizer_manager: &TokenizerManager = self.tokenizers();
-        let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type {
+        let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
             FieldType::Str(text_options) => text_options
                 .get_indexing_options()
                 .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
```
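A short usage sketch of the changed helper; the schema setup is illustrative:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::tokenizer::TextAnalyzer;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // Returns the boxed TextAnalyzer now, instead of the old BoxedTokenizer.
    let analyzer: TextAnalyzer = index.tokenizer_for_field(body)?;
    let mut stream = analyzer.token_stream("Hello World");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
    Ok(())
}
```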
```diff
@@ -141,11 +141,19 @@ impl MmapCache {
     }
 }

+pub enum WatcherMode {
+    Event,
+    Poll,
+}
+
 struct WatcherWrapper {
     _watcher: Mutex<notify::RecommendedWatcher>,
     watcher_router: Arc<WatchCallbackList>,
+    watcher_mode: WatcherMode,
 }

 impl WatcherWrapper {
     pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
         let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
@@ -163,33 +171,57 @@ impl WatcherWrapper {
         })?;
         let watcher_router: Arc<WatchCallbackList> = Default::default();
         let watcher_router_clone = watcher_router.clone();
+        let path_clone = path.clone();
+        let meta_path = path_clone.join(*META_FILEPATH);
         thread::Builder::new()
             .name("meta-file-watch-thread".to_string())
             .spawn(move || {
+                let mut old_content = String::new();
+                let mode = WatcherMode::Event;
                 loop {
-                    match watcher_recv.recv().map(|evt| evt.path) {
-                        Ok(Some(changed_path)) => {
-                            // ... Actually subject to false positive.
-                            // We might want to be more accurate than this at one point.
-                            if let Some(filename) = changed_path.file_name() {
-                                if filename == *META_FILEPATH {
-                                    let _ = watcher_router_clone.broadcast();
+                    match mode {
+                        WatcherMode::Event => {
+                            match watcher_recv.recv().map(|evt| evt.path) {
+                                Ok(Some(changed_path)) => {
+                                    // ... Actually subject to false positive.
+                                    // We might want to be more accurate than this at one point.
+                                    if let Some(filename) = changed_path.file_name() {
+                                        if filename == *META_FILEPATH {
+                                            let _ = watcher_router_clone.broadcast();
+                                        }
+                                    }
+                                }
+                                Ok(None) => {
+                                    // not an event we are interested in.
+                                }
+                                Err(_e) => {
+                                    // the watch send channel was dropped
+                                    break;
                                 }
                             }
                         }
-                        Ok(None) => {
-                            // not an event we are interested in.
+                        WatcherMode::Poll => {
+                            let mut file = match File::open(&meta_path) {
+                                Err(why) => panic!("open: nope"),
+                                Ok(file) => file,
+                            };
+                            let mut new_content = String::new();
+                            match file.read_to_string(&mut new_content) {
+                                Err(why) => panic!("read: nope"),
+                                Ok(_) => {}
+                            }
+                            if old_content != new_content {
+                                let _ = watcher_router_clone.broadcast();
+                                old_content = new_content;
+                            }
                         }
-                        Err(_e) => {
-                            // the watch send channel was dropped
-                            break;
-                        }
-                    }
+                    };
                 }
             })?;
         Ok(WatcherWrapper {
             _watcher: Mutex::new(watcher),
             watcher_router,
+            watcher_mode: WatcherMode::Event,
         })
     }
```
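The new `WatcherMode::Poll` arm re-reads the meta file and broadcasts only when its content actually changed, as a fallback for platforms where inotify-style events are unreliable. A standalone sketch of that polling idea using only the standard library; the file name and the sleep interval are assumptions added for illustration (the committed branch as shown loops without sleeping and panics on I/O errors, which reads as work in progress):

```rust
use std::fs::File;
use std::io::Read;
use std::thread;
use std::time::Duration;

fn main() {
    let meta_path = "meta.json"; // hypothetical path; tantivy uses META_FILEPATH
    let mut old_content = String::new();
    loop {
        // Re-read the whole file; missing file or read error just means "no change yet".
        let mut new_content = String::new();
        if let Ok(mut file) = File::open(meta_path) {
            let _ = file.read_to_string(&mut new_content);
        }
        if old_content != new_content {
            println!("meta file changed, notifying watchers");
            old_content = new_content;
        }
        // Added here to avoid a busy loop; the diff's Poll branch has no pause.
        thread::sleep(Duration::from_millis(500));
    }
}
```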
```diff
@@ -19,6 +19,8 @@ pub struct AddOperation {
 /// UserOperation is an enum type that encapsulates other operation types.
 #[derive(Eq, PartialEq, Debug)]
 pub enum UserOperation {
+    /// Add operation
     Add(Document),
+    /// Delete operation
     Delete(Term),
 }
```
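These doc comments annotate the variants that get re-exported at the crate root later in this diff (`pub use crate::indexer::operation::UserOperation`). A sketch of batching them through an `IndexWriter`, assuming the `IndexWriter::run` batch entry point this enum feeds; if your version lacks it, the two variants map onto `add_document` and `delete_term`:

```rust
use tantivy::schema::{Schema, STRING};
use tantivy::{doc, Index, Term, UserOperation};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let id = schema_builder.add_text_field("id", STRING);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;

    // A batch: delete the old version of a document, then add the new one.
    let batch = vec![
        UserOperation::Delete(Term::from_field_text(id, "doc-1")),
        UserOperation::Add(doc!(id => "doc-1")),
    ];
    writer.run(batch);
    writer.commit()?;
    Ok(())
}
```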
```diff
@@ -11,10 +11,9 @@ use crate::schema::Schema;
 use crate::schema::Term;
 use crate::schema::Value;
 use crate::schema::{Field, FieldEntry};
-use crate::tokenizer::BoxedTokenizer;
-use crate::tokenizer::FacetTokenizer;
-use crate::tokenizer::PreTokenizedStream;
-use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
+use crate::tokenizer::{BoxTokenStream, PreTokenizedStream};
+use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
+use crate::tokenizer::{TokenStreamChain, Tokenizer};
 use crate::DocId;
 use crate::Opstamp;
 use crate::Result;
@@ -50,7 +49,7 @@ pub struct SegmentWriter {
     fast_field_writers: FastFieldsWriter,
     fieldnorms_writer: FieldNormsWriter,
     doc_opstamps: Vec<Opstamp>,
-    tokenizers: Vec<Option<BoxedTokenizer>>,
+    tokenizers: Vec<Option<TextAnalyzer>>,
 }

 impl SegmentWriter {
@@ -159,7 +158,7 @@ impl SegmentWriter {
                 }
             }
             FieldType::Str(_) => {
-                let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
+                let mut token_streams: Vec<BoxTokenStream> = vec![];
                 let mut offsets = vec![];
                 let mut total_offset = 0;

@@ -172,7 +171,7 @@ impl SegmentWriter {
                         }

                         token_streams
-                            .push(Box::new(PreTokenizedStream::from(tok_str.clone())));
+                            .push(PreTokenizedStream::from(tok_str.clone()).into());
                     }
                     Value::Str(ref text) => {
                         if let Some(ref mut tokenizer) =
@@ -191,8 +190,7 @@ impl SegmentWriter {
                 let num_tokens = if token_streams.is_empty() {
                     0
                 } else {
-                    let mut token_stream: Box<dyn TokenStream> =
-                        Box::new(TokenStreamChain::new(offsets, token_streams));
+                    let mut token_stream = TokenStreamChain::new(offsets, token_streams);
                     self.multifield_postings
                         .index_text(doc_id, field, &mut token_stream)
                 };
```
```diff
@@ -1,18 +1,76 @@
 use crate::Opstamp;
 use std::ops::Range;
-use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::atomic::Ordering;
 use std::sync::Arc;

+#[cfg(not(target_arch = "arm"))]
+mod atomic_impl {
+
+    use crate::Opstamp;
+    use std::sync::atomic::{AtomicU64, Ordering};
+
+    #[derive(Default)]
+    pub struct AtomicU64Wrapper(AtomicU64);
+
+    impl AtomicU64Wrapper {
+        pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
+            AtomicU64Wrapper(AtomicU64::new(first_opstamp as u64))
+        }
+
+        pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
+            self.0.fetch_add(val as u64, order) as u64
+        }
+
+        pub fn revert(&self, val: u64, order: Ordering) -> u64 {
+            self.0.store(val, order);
+            val
+        }
+    }
+}
+
+#[cfg(target_arch = "arm")]
+mod atomic_impl {
+
+    use crate::Opstamp;
+    /// Under other architecture, we rely on a mutex.
+    use std::sync::atomic::Ordering;
+    use std::sync::RwLock;
+
+    #[derive(Default)]
+    pub struct AtomicU64Wrapper(RwLock<u64>);
+
+    impl AtomicU64Wrapper {
+        pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
+            AtomicU64Wrapper(RwLock::new(first_opstamp))
+        }
+
+        pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
+            let mut lock = self.0.write().unwrap();
+            let previous_val = *lock;
+            *lock = previous_val + incr;
+            previous_val
+        }
+
+        pub fn revert(&self, val: u64, _order: Ordering) -> u64 {
+            let mut lock = self.0.write().unwrap();
+            *lock = val;
+            val
+        }
+    }
+}
+
+use self::atomic_impl::AtomicU64Wrapper;
+
 /// Stamper provides Opstamps, which is just an auto-increment id to label
 /// an operation.
 ///
 /// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
 #[derive(Clone, Default)]
-pub struct Stamper(Arc<AtomicU64>);
+pub struct Stamper(Arc<AtomicU64Wrapper>);

 impl Stamper {
     pub fn new(first_opstamp: Opstamp) -> Stamper {
-        Stamper(Arc::new(AtomicU64::new(first_opstamp)))
+        Stamper(Arc::new(AtomicU64Wrapper::new(first_opstamp)))
     }

     pub fn stamp(&self) -> Opstamp {
@@ -31,8 +89,7 @@ impl Stamper {

     /// Reverts the stamper to a given `Opstamp` value and returns it
     pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
-        self.0.store(to_opstamp, Ordering::SeqCst);
-        to_opstamp
+        self.0.revert(to_opstamp, Ordering::SeqCst)
     }
 }
```
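The ARM fallback trades the lock-free counter for an `RwLock` while keeping the same fetch-and-add contract. A self-contained sketch of that equivalence, standard library only:

```rust
use std::sync::RwLock;

// Minimal stand-in for the diff's RwLock-based AtomicU64Wrapper.
struct Counter(RwLock<u64>);

impl Counter {
    fn fetch_add(&self, incr: u64) -> u64 {
        let mut guard = self.0.write().unwrap();
        let previous = *guard;
        *guard = previous + incr;
        previous // like AtomicU64::fetch_add, returns the value *before* the add
    }
}

fn main() {
    let stamper = Counter(RwLock::new(10));
    assert_eq!(stamper.fetch_add(1), 10);
    assert_eq!(stamper.fetch_add(1), 11);
    println!("ok");
}
```

The `Ordering` argument survives in the wrapper's signature only so both implementations expose one interface; the `RwLock` version ignores it.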
```diff
@@ -165,6 +165,7 @@ pub use crate::core::SegmentComponent;
 pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
 pub use crate::core::{InvertedIndexReader, SegmentReader};
 pub use crate::directory::Directory;
+pub use crate::indexer::operation::UserOperation;
 pub use crate::indexer::IndexWriter;
 pub use crate::postings::Postings;
 pub use crate::reader::LeasedItem;
```
```diff
@@ -533,7 +533,7 @@ mod test {
 use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
 use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT};
 use crate::tokenizer::{
-    LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager,
+    LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer, TokenizerManager,
 };
 use crate::Index;
 use matches::assert_matches;
@@ -563,7 +563,7 @@ mod test {
     let tokenizer_manager = TokenizerManager::default();
     tokenizer_manager.register(
         "en_with_stop_words",
-        SimpleTokenizer
+        TextAnalyzer::from(SimpleTokenizer)
             .filter(LowerCaser)
             .filter(StopWordFilter::remove(vec!["the".to_string()])),
     );
```
```diff
@@ -11,6 +11,7 @@ use crate::Result;
 use crate::Searcher;
 use crate::SegmentReader;
 use std::sync::Arc;
+use std::thread;

 /// Defines when a new version of the index should be reloaded.
 ///
```
```diff
@@ -1,8 +1,7 @@
 use crate::query::Query;
 use crate::schema::Field;
 use crate::schema::Value;
-use crate::tokenizer::BoxedTokenizer;
-use crate::tokenizer::{Token, TokenStream};
+use crate::tokenizer::{TextAnalyzer, Token};
 use crate::Document;
 use crate::Result;
 use crate::Searcher;
@@ -142,7 +141,7 @@ impl Snippet {
 /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
 /// has to be a valid string.
 fn search_fragments<'a>(
-    tokenizer: &BoxedTokenizer,
+    tokenizer: &TextAnalyzer,
     text: &'a str,
     terms: &BTreeMap<String, f32>,
     max_num_chars: usize,
@@ -251,7 +250,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
 /// ```
 pub struct SnippetGenerator {
     terms_text: BTreeMap<String, f32>,
-    tokenizer: BoxedTokenizer,
+    tokenizer: TextAnalyzer,
     field: Field,
     max_num_chars: usize,
 }
@@ -347,12 +346,11 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet() {
-        let boxed_tokenizer = SimpleTokenizer.into();
         let terms = btreemap! {
             String::from("rust") => 1.0,
             String::from("language") => 0.9
         };
-        let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100);
+        let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 100);
         assert_eq!(fragments.len(), 7);
         {
             let first = &fragments[0];
@@ -374,13 +372,12 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet_scored_fragment() {
-        let boxed_tokenizer = SimpleTokenizer.into();
         {
             let terms = btreemap! {
                 String::from("rust") =>1.0f32,
                 String::from("language") => 0.9f32
             };
-            let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
+            let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
             {
                 let first = &fragments[0];
                 assert_eq!(first.score, 1.0);
@@ -389,13 +386,12 @@ Survey in 2016, 2017, and 2018."#;
             let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
             assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
         }
-        let boxed_tokenizer = SimpleTokenizer.into();
         {
             let terms = btreemap! {
                 String::from("rust") =>0.9f32,
                 String::from("language") => 1.0f32
             };
-            let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
+            let fragments = search_fragments(&From::from(SimpleTokenizer), TEST_TEXT, &terms, 20);
             //assert_eq!(fragments.len(), 7);
             {
                 let first = &fragments[0];
@@ -409,14 +405,12 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet_in_second_fragment() {
-        let boxed_tokenizer = SimpleTokenizer.into();
-
         let text = "a b c d e f g";

         let mut terms = BTreeMap::new();
         terms.insert(String::from("c"), 1.0);

-        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);

         assert_eq!(fragments.len(), 1);
         {
@@ -433,14 +427,12 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet_with_term_at_the_end_of_fragment() {
-        let boxed_tokenizer = SimpleTokenizer.into();
-
         let text = "a b c d e f f g";

         let mut terms = BTreeMap::new();
         terms.insert(String::from("f"), 1.0);

-        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);

         assert_eq!(fragments.len(), 2);
         {
@@ -457,15 +449,13 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet_with_second_fragment_has_the_highest_score() {
-        let boxed_tokenizer = SimpleTokenizer.into();
-
         let text = "a b c d e f g";

         let mut terms = BTreeMap::new();
         terms.insert(String::from("f"), 1.0);
         terms.insert(String::from("a"), 0.9);

-        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7);
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 7);

         assert_eq!(fragments.len(), 2);
         {
@@ -482,14 +472,12 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet_with_term_not_in_text() {
-        let boxed_tokenizer = SimpleTokenizer.into();
-
         let text = "a b c d";

         let mut terms = BTreeMap::new();
         terms.insert(String::from("z"), 1.0);

-        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);

         assert_eq!(fragments.len(), 0);

@@ -500,12 +488,10 @@ Survey in 2016, 2017, and 2018."#;

     #[test]
     fn test_snippet_with_no_terms() {
-        let boxed_tokenizer = SimpleTokenizer.into();
-
         let text = "a b c d";

         let terms = BTreeMap::new();
-        let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
+        let fragments = search_fragments(&From::from(SimpleTokenizer), &text, &terms, 3);
         assert_eq!(fragments.len(), 0);

         let snippet = select_best_fragment_combination(&fragments[..], &text);
```
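End to end, the retyped `SnippetGenerator` is normally driven through its public constructor rather than `search_fragments` (which is private). A sketch assuming the `SnippetGenerator::create` / `snippet_from_doc` API and the crate-root re-exports as of this release:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, SnippetGenerator};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(body => "Rust is a systems programming language."));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![body]).parse_query("rust language")?;

    // The generator now owns a TextAnalyzer (the field's tokenizer) internally.
    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
    for (_score, doc_address) in searcher.search(&*query, &TopDocs::with_limit(1))? {
        let doc = searcher.doc(doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("{}", snippet.to_html());
    }
    Ok(())
}
```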
```diff
@@ -2,7 +2,7 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = RawTokenizer
+//! let tokenizer = TextAnalyzer::from(RawTokenizer)
 //!     .filter(AlphaNumOnlyFilter);
 //!
 //! let mut stream = tokenizer.token_stream("hello there");
@@ -10,7 +10,7 @@
 //! // contains a space
 //! assert!(stream.next().is_none());
 //!
-//! let tokenizer = SimpleTokenizer
+//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
 //!     .filter(AlphaNumOnlyFilter);
 //!
 //! let mut stream = tokenizer.token_stream("hello there 💣");
@@ -19,56 +19,30 @@
 //! // the "emoji" is dropped because its not an alphanum
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream};
+use super::{BoxTokenStream, Token, TokenFilter, TokenStream};

 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
 #[derive(Clone)]
 pub struct AlphaNumOnlyFilter;

-pub struct AlphaNumOnlyFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    tail: TailTokenStream,
+pub struct AlphaNumOnlyFilterStream<'a> {
+    tail: BoxTokenStream<'a>,
 }

-impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+impl<'a> AlphaNumOnlyFilterStream<'a> {
     fn predicate(&self, token: &Token) -> bool {
         token.text.chars().all(|c| c.is_ascii_alphanumeric())
     }
+}

-    fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
-        AlphaNumOnlyFilterStream { tail }
-    }
-}
-
-impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
-where
-    TailTokenStream: TokenStream,
-{
-    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
-
-    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        AlphaNumOnlyFilterStream::wrap(token_stream)
+impl TokenFilter for AlphaNumOnlyFilter {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream })
     }
 }

-impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-
+impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> {
     fn advance(&mut self) -> bool {
         while self.tail.advance() {
             if self.predicate(self.tail.token()) {
@@ -78,4 +52,12 @@ where

         false
     }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
 }
```
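Under dynamic dispatch, a third-party filter no longer carries the `TailTokenStream` type parameter. A sketch of a hypothetical filter (not part of the diff) built on exactly the pattern this hunk establishes; it drops tokens shorter than three bytes:

```rust
use tantivy::tokenizer::{BoxTokenStream, Token, TokenFilter, TokenStream};

/// Hypothetical filter: keeps only tokens of at least 3 bytes.
#[derive(Clone)]
pub struct ShortWordFilter;

pub struct ShortWordFilterStream<'a> {
    tail: BoxTokenStream<'a>,
}

impl TokenFilter for ShortWordFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(ShortWordFilterStream { tail: token_stream })
    }
}

impl<'a> TokenStream for ShortWordFilterStream<'a> {
    fn advance(&mut self) -> bool {
        // Pull from the wrapped stream until a token passes the predicate.
        while self.tail.advance() {
            if self.tail.token().text.len() >= 3 {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
```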
```diff
@@ -1,4 +1,4 @@
-use super::{Token, TokenFilter, TokenStream};
+use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
 use std::mem;

 /// This class converts alphabetic, numeric, and symbolic Unicode characters
@@ -7,26 +7,21 @@ use std::mem;
 #[derive(Clone)]
 pub struct AsciiFoldingFilter;

-impl<TailTokenStream> TokenFilter<TailTokenStream> for AsciiFoldingFilter
-where
-    TailTokenStream: TokenStream,
-{
-    type ResultTokenStream = AsciiFoldingFilterTokenStream<TailTokenStream>;
-
-    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        AsciiFoldingFilterTokenStream::wrap(token_stream)
+impl TokenFilter for AsciiFoldingFilter {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        From::from(AsciiFoldingFilterTokenStream {
+            tail: token_stream,
+            buffer: String::with_capacity(100),
+        })
     }
 }

-pub struct AsciiFoldingFilterTokenStream<TailTokenStream> {
+pub struct AsciiFoldingFilterTokenStream<'a> {
     buffer: String,
-    tail: TailTokenStream,
+    tail: BoxTokenStream<'a>,
 }

-impl<TailTokenStream> TokenStream for AsciiFoldingFilterTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -48,18 +43,6 @@ where
     }
 }

-impl<TailTokenStream> AsciiFoldingFilterTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    fn wrap(tail: TailTokenStream) -> AsciiFoldingFilterTokenStream<TailTokenStream> {
-        AsciiFoldingFilterTokenStream {
-            tail,
-            buffer: String::with_capacity(100),
-        }
-    }
-}
-
 // Returns a string that represents the ascii folded version of
 // the character. If the `char` does not require ascii folding
 // (e.g. simple ASCII chars like `A`) or if the `char`
@@ -1561,8 +1544,7 @@ mod tests {
 use crate::tokenizer::AsciiFoldingFilter;
 use crate::tokenizer::RawTokenizer;
 use crate::tokenizer::SimpleTokenizer;
-use crate::tokenizer::TokenStream;
-use crate::tokenizer::Tokenizer;
+use crate::tokenizer::TextAnalyzer;
 use std::iter;

 #[test]
@@ -1579,7 +1561,7 @@ mod tests {

 fn folding_helper(text: &str) -> Vec<String> {
     let mut tokens = Vec::new();
-    SimpleTokenizer
+    TextAnalyzer::from(SimpleTokenizer)
         .filter(AsciiFoldingFilter)
         .token_stream(text)
         .process(&mut |token| {
@@ -1589,7 +1571,9 @@ mod tests {
 }

 fn folding_using_raw_tokenizer_helper(text: &str) -> String {
-    let mut token_stream = RawTokenizer.filter(AsciiFoldingFilter).token_stream(text);
+    let mut token_stream = TextAnalyzer::from(RawTokenizer)
+        .filter(AsciiFoldingFilter)
+        .token_stream(text);
     token_stream.advance();
     token_stream.token().text.clone()
 }
```
```diff
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
 use crate::schema::FACET_SEP_BYTE;

 /// The `FacetTokenizer` process a `Facet` binary representation
@@ -25,15 +25,14 @@ pub struct FacetTokenStream<'a> {
     token: Token,
 }

-impl<'a> Tokenizer<'a> for FacetTokenizer {
-    type TokenStreamImpl = FacetTokenStream<'a>;
-
-    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
+impl Tokenizer for FacetTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
         FacetTokenStream {
             text,
             state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
             token: Token::default(),
         }
+        .into()
     }
 }

@@ -84,7 +83,7 @@ mod tests {

 use super::FacetTokenizer;
 use crate::schema::Facet;
-use crate::tokenizer::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::{Token, Tokenizer};

 #[test]
 fn test_facet_tokenizer() {
```
```diff
@@ -1,24 +1,23 @@
 use super::{Token, TokenFilter, TokenStream};
+use crate::tokenizer::BoxTokenStream;
 use std::mem;

+impl TokenFilter for LowerCaser {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(LowerCaserTokenStream {
+            tail: token_stream,
+            buffer: String::with_capacity(100),
+        })
+    }
+}
+
 /// Token filter that lowercase terms.
 #[derive(Clone)]
 pub struct LowerCaser;

-impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
-where
-    TailTokenStream: TokenStream,
-{
-    type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
-
-    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        LowerCaserTokenStream::wrap(token_stream)
-    }
-}
-
-pub struct LowerCaserTokenStream<TailTokenStream> {
+pub struct LowerCaserTokenStream<'a> {
     buffer: String,
-    tail: TailTokenStream,
+    tail: BoxTokenStream<'a>,
 }
@@ -31,18 +30,7 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) {
     }
 }

-impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-
+impl<'a> TokenStream for LowerCaserTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if !self.tail.advance() {
             return false;
@@ -56,26 +44,19 @@ where
         }
         true
     }
-}

-impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
-        LowerCaserTokenStream {
-            tail,
-            buffer: String::with_capacity(100),
-        }
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
     }
 }

 #[cfg(test)]
 mod tests {
-    use crate::tokenizer::LowerCaser;
-    use crate::tokenizer::SimpleTokenizer;
-    use crate::tokenizer::TokenStream;
-    use crate::tokenizer::Tokenizer;
+    use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

     #[test]
     fn test_to_lower_case() {
@@ -87,7 +68,9 @@ mod tests {

 fn lowercase_helper(text: &str) -> Vec<String> {
     let mut tokens = vec![];
-    let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text);
+    let mut token_stream = TextAnalyzer::from(SimpleTokenizer)
+        .filter(LowerCaser)
+        .token_stream(text);
     while token_stream.advance() {
         let token_text = token_stream.token().text.clone();
         tokens.push(token_text);
```
```diff
@@ -64,7 +64,7 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let en_stem = SimpleTokenizer
+//! let en_stem = TextAnalyzer::from(SimpleTokenizer)
 //!     .filter(RemoveLongFilter::limit(40))
 //!     .filter(LowerCaser)
 //!     .filter(Stemmer::new(Language::English));
@@ -109,7 +109,7 @@
 //! let index = Index::create_in_ram(schema);
 //!
 //! // We need to register our tokenizer :
-//! let custom_en_tokenizer = SimpleTokenizer
+//! let custom_en_tokenizer = TextAnalyzer::from(SimpleTokenizer)
 //!     .filter(RemoveLongFilter::limit(40))
 //!     .filter(LowerCaser);
 //! index
@@ -143,10 +143,11 @@ pub use self::simple_tokenizer::SimpleTokenizer;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
-pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
+pub use self::tokenizer::{
+    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
+};
 pub use self::tokenizer_manager::TokenizerManager;

@@ -160,9 +161,9 @@ pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
 #[cfg(test)]
 pub mod tests {
     use super::{
-        Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, Tokenizer,
-        TokenizerManager,
+        Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer, Token, TokenizerManager,
     };
+    use crate::tokenizer::TextAnalyzer;

     /// This is a function that can be used in tests and doc tests
     /// to assert a token's correctness.
@@ -229,7 +230,7 @@ pub mod tests {
     let tokenizer_manager = TokenizerManager::default();
     tokenizer_manager.register(
         "el_stem",
-        SimpleTokenizer
+        TextAnalyzer::from(SimpleTokenizer)
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
             .filter(Stemmer::new(Language::Greek)),
```
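Registered analyzers are looked up by name at indexing time. A short usage sketch, assuming `TokenizerManager::get` returns a clone of the boxed `TextAnalyzer`:

```rust
use tantivy::tokenizer::{
    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

fn main() {
    let manager = TokenizerManager::default();
    manager.register(
        "custom_en",
        TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser),
    );

    // `get` hands back the boxed analyzer registered under that name.
    let analyzer = manager.get("custom_en").expect("tokenizer was just registered");
    let mut stream = analyzer.token_stream("The Old Man and the Sea");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}
```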
```diff
@@ -1,4 +1,5 @@
 use super::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::BoxTokenStream;

 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
@@ -129,11 +130,9 @@ pub struct NgramTokenStream<'a> {
     token: Token,
 }

-impl<'a> Tokenizer<'a> for NgramTokenizer {
-    type TokenStreamImpl = NgramTokenStream<'a>;
-
-    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
-        NgramTokenStream {
+impl Tokenizer for NgramTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        From::from(NgramTokenStream {
             ngram_charidx_iterator: StutteringIterator::new(
                 CodepointFrontiers::for_str(text),
                 self.min_gram,
@@ -142,7 +141,7 @@ impl<'a> Tokenizer<'a> for NgramTokenizer {
             prefix_only: self.prefix_only,
             text,
             token: Token::default(),
-        }
+        })
     }
 }

@@ -308,10 +307,10 @@ mod tests {
     use super::NgramTokenizer;
     use super::StutteringIterator;
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::tokenizer::{TokenStream, Tokenizer};
-    use crate::tokenizer::Token;
+    use crate::tokenizer::tokenizer::Tokenizer;
+    use crate::tokenizer::{BoxTokenStream, Token};

-    fn test_helper<T: TokenStream>(mut tokenizer: T) -> Vec<Token> {
+    fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
         let mut tokens: Vec<Token> = vec![];
         tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
         tokens
```
```diff
@@ -1,4 +1,5 @@
 use super::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::BoxTokenStream;

 /// For each value of the field, emit a single unprocessed token.
 #[derive(Clone)]
@@ -9,10 +10,8 @@ pub struct RawTokenStream {
     has_token: bool,
 }

-impl<'a> Tokenizer<'a> for RawTokenizer {
-    type TokenStreamImpl = RawTokenStream;
-
-    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
+impl Tokenizer for RawTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
         let token = Token {
             offset_from: 0,
             offset_to: text.len(),
@@ -24,6 +23,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
             token,
             has_token: true,
         }
+        .into()
     }
 }

```
```diff
@@ -2,7 +2,7 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = SimpleTokenizer
+//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
 //!     .filter(RemoveLongFilter::limit(5));
 //!
 //! let mut stream = tokenizer.token_stream("toolong nice");
@@ -13,6 +13,7 @@
 //! ```
 //!
 use super::{Token, TokenFilter, TokenStream};
+use crate::tokenizer::BoxTokenStream;

 /// `RemoveLongFilter` removes tokens that are longer
 /// than a given number of bytes (in UTF-8 representation).
@@ -31,56 +32,27 @@ impl RemoveLongFilter {
     }
 }

-impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+impl<'a> RemoveLongFilterStream<'a> {
     fn predicate(&self, token: &Token) -> bool {
         token.text.len() < self.token_length_limit
     }
+}

-    fn wrap(
-        token_length_limit: usize,
-        tail: TailTokenStream,
-    ) -> RemoveLongFilterStream<TailTokenStream> {
-        RemoveLongFilterStream {
-            token_length_limit,
-            tail,
-        }
+impl TokenFilter for RemoveLongFilter {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(RemoveLongFilterStream {
+            token_length_limit: self.length_limit,
+            tail: token_stream,
+        })
     }
 }

-impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
-where
-    TailTokenStream: TokenStream,
-{
-    type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
-
-    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        RemoveLongFilterStream::wrap(self.length_limit, token_stream)
-    }
-}
-
-pub struct RemoveLongFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+pub struct RemoveLongFilterStream<'a> {
     token_length_limit: usize,
-    tail: TailTokenStream,
+    tail: BoxTokenStream<'a>,
 }

-impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-
+impl<'a> TokenStream for RemoveLongFilterStream<'a> {
     fn advance(&mut self) -> bool {
         while self.tail.advance() {
             if self.predicate(self.tail.token()) {
@@ -89,4 +61,12 @@ where
         }
         false
     }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
 }
```
```diff
@@ -1,3 +1,4 @@
+use super::BoxTokenStream;
 use super::{Token, TokenStream, Tokenizer};
 use std::str::CharIndices;

@@ -11,15 +12,13 @@ pub struct SimpleTokenStream<'a> {
     token: Token,
 }

-impl<'a> Tokenizer<'a> for SimpleTokenizer {
-    type TokenStreamImpl = SimpleTokenStream<'a>;
-
-    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
-        SimpleTokenStream {
+impl Tokenizer for SimpleTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(SimpleTokenStream {
             text,
             chars: text.char_indices(),
             token: Token::default(),
-        }
+        })
     }
 }

```
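A custom tokenizer follows the same shape: the trait is now object-safe and returns a `BoxTokenStream` instead of an associated stream type. A hypothetical whitespace-only tokenizer as a sketch; it relies on `Token::default()` starting `position` at `usize::MAX` (an assumption, mirroring how the in-tree tokenizers use `wrapping_add`):

```rust
use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

/// Hypothetical tokenizer: splits on whitespace only.
#[derive(Clone)]
pub struct WhitespaceTokenizer;

pub struct WhitespaceTokenStream<'a> {
    text: &'a str,
    offset: usize,
    token: Token,
}

impl Tokenizer for WhitespaceTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(WhitespaceTokenStream {
            text,
            offset: 0,
            token: Token::default(),
        })
    }
}

impl<'a> TokenStream for WhitespaceTokenStream<'a> {
    fn advance(&mut self) -> bool {
        // Skip leading whitespace, then take the next run of non-whitespace.
        let rest = &self.text[self.offset..];
        let start = match rest.find(|c: char| !c.is_whitespace()) {
            Some(pos) => self.offset + pos,
            None => return false,
        };
        let end = self.text[start..]
            .find(char::is_whitespace)
            .map(|pos| start + pos)
            .unwrap_or(self.text.len());
        self.token.offset_from = start;
        self.token.offset_to = end;
        // First call wraps usize::MAX around to position 0.
        self.token.position = self.token.position.wrapping_add(1);
        self.token.text.clear();
        self.token.text.push_str(&self.text[start..end]);
        self.offset = end;
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
```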
@@ -1,4 +1,5 @@
|
|||||||
use super::{Token, TokenFilter, TokenStream};
|
use super::{Token, TokenFilter, TokenStream};
|
||||||
|
use crate::tokenizer::BoxTokenStream;
|
||||||
use rust_stemmers::{self, Algorithm};
|
use rust_stemmers::{self, Algorithm};
|
||||||
|
|
||||||
/// Available stemmer languages.
|
/// Available stemmer languages.
|
||||||
@@ -75,38 +76,22 @@ impl Default for Stemmer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
impl TokenFilter for Stemmer {
|
||||||
where
|
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
|
||||||
TailTokenStream: TokenStream,
|
|
||||||
{
|
|
||||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
|
||||||
|
|
||||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
|
||||||
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
|
||||||
StemmerTokenStream::wrap(inner_stemmer, token_stream)
|
BoxTokenStream::from(StemmerTokenStream {
|
||||||
|
tail: token_stream,
|
||||||
|
stemmer: inner_stemmer,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct StemmerTokenStream<TailTokenStream>
|
pub struct StemmerTokenStream<'a> {
|
||||||
where
|
tail: BoxTokenStream<'a>,
|
||||||
TailTokenStream: TokenStream,
|
|
||||||
{
|
|
||||||
tail: TailTokenStream,
|
|
||||||
stemmer: rust_stemmers::Stemmer,
|
stemmer: rust_stemmers::Stemmer,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
impl<'a> TokenStream for StemmerTokenStream<'a> {
|
||||||
where
|
|
||||||
TailTokenStream: TokenStream,
|
|
||||||
{
|
|
||||||
fn token(&self) -> &Token {
|
|
||||||
self.tail.token()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn token_mut(&mut self) -> &mut Token {
|
|
||||||
self.tail.token_mut()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn advance(&mut self) -> bool {
|
fn advance(&mut self) -> bool {
|
||||||
if !self.tail.advance() {
|
if !self.tail.advance() {
|
||||||
return false;
|
return false;
|
||||||
@@ -117,16 +102,12 @@ where
|
|||||||
self.token_mut().text.push_str(&stemmed_str);
|
self.token_mut().text.push_str(&stemmed_str);
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
|
fn token(&self) -> &Token {
|
||||||
where
|
self.tail.token()
|
||||||
TailTokenStream: TokenStream,
|
}
|
||||||
{
|
|
||||||
fn wrap(
|
fn token_mut(&mut self) -> &mut Token {
|
||||||
stemmer: rust_stemmers::Stemmer,
|
self.tail.token_mut()
|
||||||
tail: TailTokenStream,
|
|
||||||
) -> StemmerTokenStream<TailTokenStream> {
|
|
||||||
StemmerTokenStream { tail, stemmer }
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
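`Stemmer` follows the same migration: `transform` now takes and returns a `BoxTokenStream`, and the private `wrap` constructor disappears in favor of a plain struct literal. A sketch of the usual pipeline, assuming `Stemmer::default()` still means English stemming, as the `en_stem` doc example suggests:

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, Stemmer, TextAnalyzer, TokenStream};

fn main() {
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .filter(Stemmer::default());
    let mut stream = analyzer.token_stream("Searching searched searches");
    while stream.advance() {
        // With English stemming, each of these should print "search".
        println!("{}", stream.token().text);
    }
}
```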
@@ -2,7 +2,7 @@
 //! ```rust
 //! use tantivy::tokenizer::*;
 //!
-//! let tokenizer = SimpleTokenizer
+//! let tokenizer = TextAnalyzer::from(SimpleTokenizer)
 //!     .filter(StopWordFilter::remove(vec!["the".to_string(), "is".to_string()]));
 //!
 //! let mut stream = tokenizer.token_stream("the fox is crafty");
@@ -11,6 +11,7 @@
 //! assert!(stream.next().is_none());
 //! ```
 use super::{Token, TokenFilter, TokenStream};
+use crate::tokenizer::BoxTokenStream;
 use fnv::FnvHasher;
 use std::collections::HashSet;
 use std::hash::BuildHasherDefault;
@@ -48,53 +49,27 @@ impl StopWordFilter {
     }
 }
 
-pub struct StopWordFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+pub struct StopWordFilterStream<'a> {
     words: StopWordHashSet,
-    tail: TailTokenStream,
+    tail: BoxTokenStream<'a>,
 }
 
-impl<TailTokenStream> TokenFilter<TailTokenStream> for StopWordFilter
-where
-    TailTokenStream: TokenStream,
-{
-    type ResultTokenStream = StopWordFilterStream<TailTokenStream>;
-
-    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        StopWordFilterStream::wrap(self.words.clone(), token_stream)
+impl TokenFilter for StopWordFilter {
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(StopWordFilterStream {
+            words: self.words.clone(),
+            tail: token_stream,
+        })
     }
 }
 
-impl<TailTokenStream> StopWordFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
+impl<'a> StopWordFilterStream<'a> {
     fn predicate(&self, token: &Token) -> bool {
         !self.words.contains(&token.text)
     }
-
-    fn wrap(
-        words: StopWordHashSet,
-        tail: TailTokenStream,
-    ) -> StopWordFilterStream<TailTokenStream> {
-        StopWordFilterStream { words, tail }
-    }
 }
 
-impl<TailTokenStream> TokenStream for StopWordFilterStream<TailTokenStream>
-where
-    TailTokenStream: TokenStream,
-{
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-
+impl<'a> TokenStream for StopWordFilterStream<'a> {
     fn advance(&mut self) -> bool {
         while self.tail.advance() {
             if self.predicate(self.tail.token()) {
@@ -103,6 +78,14 @@ where
         }
         false
     }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
 }
 
 impl Default for StopWordFilter {
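This file shows the full pattern that custom filters must follow after the change, which is what the changelog's "How to update?" note is about. A hypothetical filter modeled on `StopWordFilter` above (the names `SkipEmptyFilter` and `SkipEmptyStream` are ours, not tantivy's):

```rust
use tantivy::tokenizer::{BoxTokenStream, Token, TokenFilter, TokenStream};

// A filter that drops tokens with empty text.
#[derive(Clone)]
struct SkipEmptyFilter;

impl TokenFilter for SkipEmptyFilter {
    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
        BoxTokenStream::from(SkipEmptyStream { tail: token_stream })
    }
}

struct SkipEmptyStream<'a> {
    tail: BoxTokenStream<'a>,
}

impl<'a> TokenStream for SkipEmptyStream<'a> {
    fn advance(&mut self) -> bool {
        // Skip over tokens that fail the predicate, exactly like
        // StopWordFilterStream::advance above.
        while self.tail.advance() {
            if !self.tail.token().text.is_empty() {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
```

Because `SkipEmptyFilter` is `Clone`, the blanket `TokenFilterClone` impl provides `box_clone` for free, so the filter can be passed to `TextAnalyzer::filter` like any built-in one.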
@@ -1,23 +1,21 @@
-use crate::tokenizer::{Token, TokenStream};
+use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
+use std::ops::DerefMut;
 
 const POSITION_GAP: usize = 2;
 
-pub(crate) struct TokenStreamChain<TTokenStream: TokenStream> {
+pub(crate) struct TokenStreamChain<'a> {
     offsets: Vec<usize>,
-    token_streams: Vec<TTokenStream>,
+    token_streams: Vec<BoxTokenStream<'a>>,
     position_shift: usize,
     stream_idx: usize,
     token: Token,
 }
 
-impl<'a, TTokenStream> TokenStreamChain<TTokenStream>
-where
-    TTokenStream: TokenStream,
-{
+impl<'a> TokenStreamChain<'a> {
     pub fn new(
         offsets: Vec<usize>,
-        token_streams: Vec<TTokenStream>,
-    ) -> TokenStreamChain<TTokenStream> {
+        token_streams: Vec<BoxTokenStream<'a>>,
+    ) -> TokenStreamChain<'a> {
         TokenStreamChain {
             offsets,
             stream_idx: 0,
@@ -28,13 +26,10 @@ where
         }
     }
 }
 
-impl<'a, TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
-where
-    TTokenStream: TokenStream,
-{
+impl<'a> TokenStream for TokenStreamChain<'a> {
     fn advance(&mut self) -> bool {
         while self.stream_idx < self.token_streams.len() {
-            let token_stream = &mut self.token_streams[self.stream_idx];
+            let token_stream = self.token_streams[self.stream_idx].deref_mut();
             if token_stream.advance() {
                 let token = token_stream.token();
                 let offset_offset = self.offsets[self.stream_idx];
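`TokenStreamChain` now chains boxed streams, but its behavior is unchanged: it still shifts positions by `POSITION_GAP` (2) between consecutive streams so a `PhraseQuery` cannot match across two field values. The chain is typically reached through `TextAnalyzer::token_stream_texts`; a sketch:

```rust
use tantivy::tokenizer::{SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    let analyzer = TextAnalyzer::from(SimpleTokenizer);
    let field_values = ["west coast", "east coast"];
    // Tokenized as if concatenated, with a position gap between the values.
    let mut stream = analyzer.token_stream_texts(&field_values);
    while stream.advance() {
        let token = stream.token();
        println!("{} @ position {}", token.text, token.position);
    }
}
```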
@@ -1,4 +1,4 @@
-use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
+use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain};
 use std::cmp::Ordering;
 
 /// Struct representing pre-tokenized text
@@ -41,9 +41,9 @@ impl PreTokenizedStream {
     /// Creates a TokenStream from PreTokenizedString array
     pub fn chain_tokenized_strings<'a>(
         tok_strings: &'a [&'a PreTokenizedString],
-    ) -> Box<dyn TokenStream + 'a> {
+    ) -> BoxTokenStream {
         if tok_strings.len() == 1 {
-            Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()))
+            PreTokenizedStream::from((*tok_strings[0]).clone()).into()
         } else {
             let mut offsets = vec![];
             let mut total_offset = 0;
@@ -53,11 +53,12 @@ impl PreTokenizedStream {
                     total_offset += last_token.offset_to;
                 }
             }
-            let token_streams: Vec<_> = tok_strings
+            // TODO remove the string cloning.
+            let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
                 .iter()
-                .map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
+                .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
                 .collect();
-            Box::new(TokenStreamChain::new(offsets, token_streams))
+            TokenStreamChain::new(offsets, token_streams).into()
         }
     }
 }
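`chain_tokenized_strings` now returns a `BoxTokenStream` as well. For reference, a minimal sketch of building the `PreTokenizedString` input it consumes, assuming the public `text`/`tokens` fields used by the pre-tokenized-text example (the offsets and positions here are illustrative):

```rust
use tantivy::tokenizer::{PreTokenizedString, Token};

fn main() {
    let tok_string = PreTokenizedString {
        text: String::from("hello world"),
        tokens: vec![
            Token {
                offset_from: 0,
                offset_to: 5,
                position: 0,
                text: String::from("hello"),
                ..Token::default()
            },
            Token {
                offset_from: 6,
                offset_to: 11,
                position: 1,
                text: String::from("world"),
                ..Token::default()
            },
        ],
    };
    println!("{} tokens for {:?}", tok_string.tokens.len(), tok_string.text);
}
```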
@@ -2,6 +2,7 @@ use crate::tokenizer::TokenStreamChain;
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
 use std::borrow::{Borrow, BorrowMut};
+use std::ops::{Deref, DerefMut};
 
 /// Token
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -33,20 +34,31 @@ impl Default for Token {
     }
 }
 
-/// `Tokenizer` are in charge of splitting text into a stream of token
-/// before indexing.
+/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
-/// See the [module documentation](./index.html) for more detail.
-///
-/// # Warning
-///
-/// This API may change to use associated types.
-pub trait Tokenizer<'a>: Sized + Clone {
-    /// Type associated to the resulting tokenstream tokenstream.
-    type TokenStreamImpl: TokenStream;
+/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
+pub struct TextAnalyzer {
+    tokenizer: Box<dyn Tokenizer>,
+    token_filters: Vec<BoxTokenFilter>,
+}
 
-    /// Creates a token stream for a given `str`.
-    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl;
+impl<T: Tokenizer> From<T> for TextAnalyzer {
+    fn from(tokenizer: T) -> Self {
+        TextAnalyzer::new(tokenizer, Vec::new())
+    }
+}
+
+impl TextAnalyzer {
+    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
+    ///
+    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
+    /// `TextAnalyzer::from(tokenizer)`.
+    pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer {
+        TextAnalyzer {
+            tokenizer: Box::new(tokenizer),
+            token_filters,
+        }
+    }
 
     /// Appends a token filter to the current tokenizer.
     ///
@@ -58,90 +70,26 @@ pub trait Tokenizer<'a>: Sized + Clone {
     /// ```rust
     /// use tantivy::tokenizer::*;
     ///
-    /// let en_stem = SimpleTokenizer
+    /// let en_stem = TextAnalyzer::from(SimpleTokenizer)
     ///     .filter(RemoveLongFilter::limit(40))
     ///     .filter(LowerCaser)
     ///     .filter(Stemmer::default());
     /// ```
    ///
-    fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
-    where
-        NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>,
-    {
-        ChainTokenizer {
-            head: new_filter,
-            tail: self,
-        }
-    }
-}
-
-/// A boxed tokenizer
-trait BoxedTokenizerTrait: Send + Sync {
-    /// Tokenize a `&str`
-    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
-
-    /// Tokenize an array`&str`
-    ///
-    /// The resulting `TokenStream` is equivalent to what would be obtained if the &str were
-    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
-    /// to prevent accidental `PhraseQuery` to match accross two terms.
-    fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b>;
-
-    /// Return a boxed clone of the tokenizer
-    fn boxed_clone(&self) -> BoxedTokenizer;
-}
-
-/// A boxed tokenizer
-pub struct BoxedTokenizer(Box<dyn BoxedTokenizerTrait>);
-
-impl<T> From<T> for BoxedTokenizer
-where
-    T: 'static + Send + Sync + for<'a> Tokenizer<'a>,
-{
-    fn from(tokenizer: T) -> BoxedTokenizer {
-        BoxedTokenizer(Box::new(BoxableTokenizer(tokenizer)))
-    }
-}
-
-impl BoxedTokenizer {
-    /// Tokenize a `&str`
-    pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
-        self.0.token_stream(text)
+    pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self {
+        self.token_filters.push(token_filter.into());
+        self
     }
 
     /// Tokenize an array`&str`
     ///
-    /// The resulting `TokenStream` is equivalent to what would be obtained if the &str were
+    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
     /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
     /// to prevent accidental `PhraseQuery` to match accross two terms.
-    pub fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
-        self.0.token_stream_texts(texts)
-    }
-}
-
-impl Clone for BoxedTokenizer {
-    fn clone(&self) -> BoxedTokenizer {
-        self.0.boxed_clone()
-    }
-}
-
-#[derive(Clone)]
-struct BoxableTokenizer<A>(A)
-where
-    A: for<'a> Tokenizer<'a> + Send + Sync;
-
-impl<A> BoxedTokenizerTrait for BoxableTokenizer<A>
-where
-    A: 'static + Send + Sync + for<'a> Tokenizer<'a>,
-{
-    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
-        Box::new(self.0.token_stream(text))
-    }
-
-    fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box<dyn TokenStream + 'b> {
+    pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> {
         assert!(!texts.is_empty());
         if texts.len() == 1 {
-            Box::new(self.0.token_stream(texts[0]))
+            self.token_stream(texts[0])
         } else {
             let mut offsets = vec![];
             let mut total_offset = 0;
@@ -149,34 +97,124 @@ where
                 offsets.push(total_offset);
                 total_offset += text.len();
             }
-            let token_streams: Vec<_> =
-                texts.iter().map(|text| self.0.token_stream(text)).collect();
-            Box::new(TokenStreamChain::new(offsets, token_streams))
+            let token_streams: Vec<BoxTokenStream<'a>> = texts
+                .iter()
+                .cloned()
+                .map(|text| self.token_stream(text))
+                .collect();
+            From::from(TokenStreamChain::new(offsets, token_streams))
         }
     }
 
-    fn boxed_clone(&self) -> BoxedTokenizer {
-        self.0.clone().into()
+    /// Creates a token stream for a given `str`.
+    pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        let mut token_stream = self.tokenizer.token_stream(text);
+        for token_filter in &self.token_filters {
+            token_stream = token_filter.transform(token_stream);
+        }
+        token_stream
     }
 }
 
-impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
+impl Clone for TextAnalyzer {
+    fn clone(&self) -> Self {
+        TextAnalyzer {
+            tokenizer: self.tokenizer.box_clone(),
+            token_filters: self
+                .token_filters
+                .iter()
+                .map(|token_filter| token_filter.box_clone())
+                .collect(),
+        }
+    }
+}
+
+/// `Tokenizer` are in charge of splitting text into a stream of token
+/// before indexing.
+///
+/// See the [module documentation](./index.html) for more detail.
+///
+/// # Warning
+///
+/// This API may change to use associated types.
+pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+    /// Creates a token stream for a given `str`.
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+}
+
+pub trait TokenizerClone {
+    fn box_clone(&self) -> Box<dyn Tokenizer>;
+}
+
+impl<T: Tokenizer + Clone> TokenizerClone for T {
+    fn box_clone(&self) -> Box<dyn Tokenizer> {
+        Box::new(self.clone())
+    }
+}
+
+impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
     fn advance(&mut self) -> bool {
         let token_stream: &mut dyn TokenStream = self.borrow_mut();
         token_stream.advance()
     }
 
-    fn token(&self) -> &Token {
-        let token_stream: &dyn TokenStream = self.borrow();
+    fn token<'b>(&'b self) -> &'b Token {
+        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
         token_stream.token()
     }
 
-    fn token_mut(&mut self) -> &mut Token {
-        let token_stream: &mut dyn TokenStream = self.borrow_mut();
+    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
+        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
        token_stream.token_mut()
     }
 }
+
+/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
+///
+/// See `TokenStream` for more information.
+pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+
+impl<'a, T> From<T> for BoxTokenStream<'a>
+where
+    T: TokenStream + 'a,
+{
+    fn from(token_stream: T) -> BoxTokenStream<'a> {
+        BoxTokenStream(Box::new(token_stream))
+    }
+}
+
+impl<'a> Deref for BoxTokenStream<'a> {
+    type Target = dyn TokenStream + 'a;
+
+    fn deref(&self) -> &Self::Target {
+        &*self.0
+    }
+}
+impl<'a> DerefMut for BoxTokenStream<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut *self.0
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
+///
+/// See `TokenStream` for more information.
+pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn TokenFilter;
+
+    fn deref(&self) -> &dyn TokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
 
 /// `TokenStream` is the result of the tokenization.
 ///
 /// It consists consumable stream of `Token`s.
@@ -186,7 +224,7 @@ impl<'b> TokenStream for Box<dyn TokenStream + 'b> {
 /// ```
 /// use tantivy::tokenizer::*;
 ///
-/// let tokenizer = SimpleTokenizer
+/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
 ///     .filter(RemoveLongFilter::limit(40))
 ///     .filter(LowerCaser);
 /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
@@ -225,7 +263,7 @@ pub trait TokenStream {
 /// ```
 /// use tantivy::tokenizer::*;
 ///
-/// let tokenizer = SimpleTokenizer
+/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
 ///     .filter(RemoveLongFilter::limit(40))
 ///     .filter(LowerCaser);
 /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
@@ -243,6 +281,8 @@ pub trait TokenStream {
 
     /// Helper function to consume the entire `TokenStream`
     /// and push the tokens to a sink function.
+    ///
+    /// Remove this.
     fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
         let mut num_tokens_pushed = 0u32;
         while self.advance() {
@@ -253,33 +293,20 @@ pub trait TokenStream {
         }
     }
 
-#[derive(Clone)]
-pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
-    head: HeadTokenFilterFactory,
-    tail: TailTokenizer,
-}
-
-impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
-    for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
-where
-    HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
-    TailTokenizer: Tokenizer<'a>,
-{
-    type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
-
-    fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
-        let tail_token_stream = self.tail.token_stream(text);
-        self.head.transform(tail_token_stream)
-    }
+pub trait TokenFilterClone {
+    fn box_clone(&self) -> BoxTokenFilter;
 }
 
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
-    /// The resulting `TokenStream` type.
-    type ResultTokenStream: TokenStream;
-
+pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
     /// Wraps a token stream and returns the modified one.
-    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
+}
+
+impl<T: TokenFilter + Clone> TokenFilterClone for T {
+    fn box_clone(&self) -> BoxTokenFilter {
+        BoxTokenFilter::from(self.clone())
+    }
 }
 
 #[cfg(test)]
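The trait itself is now object safe: `Tokenizer: 'static + Send + Sync + TokenizerClone`, with `token_stream` returning a `BoxTokenStream<'a>`. A custom tokenizer therefore boxes its own stream. A hypothetical single-token tokenizer in the spirit of `RawTokenizer` (`WholeInputTokenizer` is our name, not tantivy's):

```rust
use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

// Emits the entire input as a single token.
#[derive(Clone)]
struct WholeInputTokenizer;

impl Tokenizer for WholeInputTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(WholeInputStream {
            text,
            emitted: false,
            token: Token::default(),
        })
    }
}

struct WholeInputStream<'a> {
    text: &'a str,
    emitted: bool,
    token: Token,
}

impl<'a> TokenStream for WholeInputStream<'a> {
    fn advance(&mut self) -> bool {
        if self.emitted {
            return false;
        }
        self.emitted = true;
        self.token.position = 0;
        self.token.offset_from = 0;
        self.token.offset_to = self.text.len();
        self.token.text.push_str(self.text);
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
```

Deriving `Clone` is what satisfies the `TokenizerClone` supertrait through the blanket impl, so `Box<dyn Tokenizer>` values stay cloneable inside `TextAnalyzer`.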
@@ -1,11 +1,10 @@
 use crate::tokenizer::stemmer::Language;
-use crate::tokenizer::BoxedTokenizer;
+use crate::tokenizer::tokenizer::TextAnalyzer;
 use crate::tokenizer::LowerCaser;
 use crate::tokenizer::RawTokenizer;
 use crate::tokenizer::RemoveLongFilter;
 use crate::tokenizer::SimpleTokenizer;
 use crate::tokenizer::Stemmer;
-use crate::tokenizer::Tokenizer;
 use std::collections::HashMap;
 use std::sync::{Arc, RwLock};
 
@@ -23,16 +22,16 @@ use std::sync::{Arc, RwLock};
 /// search engine.
 #[derive(Clone)]
 pub struct TokenizerManager {
-    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
+    tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
 }
 
 impl TokenizerManager {
     /// Registers a new tokenizer associated with a given name.
-    pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
+    pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
     where
-        A: Into<BoxedTokenizer>,
+        TextAnalyzer: From<T>,
     {
-        let boxed_tokenizer = tokenizer.into();
+        let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
             .write()
             .expect("Acquiring the lock should never fail")
@@ -40,7 +39,7 @@ impl TokenizerManager {
     }
 
     /// Accessing a tokenizer given its name.
-    pub fn get(&self, tokenizer_name: &str) -> Option<BoxedTokenizer> {
+    pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
         self.tokenizers
             .read()
             .expect("Acquiring the lock should never fail")
@@ -62,13 +61,13 @@ impl Default for TokenizerManager {
         manager.register("raw", RawTokenizer);
         manager.register(
             "default",
-            SimpleTokenizer
+            TextAnalyzer::from(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser),
         );
        manager.register(
             "en_stem",
-            SimpleTokenizer
+            TextAnalyzer::from(SimpleTokenizer)
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
                 .filter(Stemmer::new(Language::English)),
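Registration is now bounded by `TextAnalyzer: From<T>`, so both bare tokenizers and fully configured analyzers can be registered under a name. A sketch (the tokenizer name "lowercase_only" is illustrative):

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};

fn main() {
    let manager = TokenizerManager::default();
    manager.register(
        "lowercase_only",
        TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser),
    );
    // `get` returns an owned TextAnalyzer if the name is registered.
    assert!(manager.get("lowercase_only").is_some());
}
```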