mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Closes #387
This commit is contained in:
@@ -15,7 +15,6 @@ keywords = ["search", "information", "retrieval"]
|
||||
base64 = "0.9.1"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "1.0"
|
||||
fst = {version="0.3", default-features=false}
|
||||
fst-regex = { version="0.2" }
|
||||
|
||||
@@ -168,8 +168,6 @@ extern crate rand;
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
extern crate test;
|
||||
|
||||
extern crate tinysegmenter;
|
||||
|
||||
#[macro_use]
|
||||
extern crate downcast;
|
||||
|
||||
|
||||
@@ -1,94 +0,0 @@
|
||||
use super::{Token, TokenStream, Tokenizer};
|
||||
use tinysegmenter;
|
||||
|
||||
/// Simple Japanese tokenizer based on the `tinysegmenter` crate.
///
/// The tokenizer itself carries no state: all the work happens in
/// `token_stream`, which segments the whole input eagerly.
#[derive(Clone, Debug)]
pub struct JapaneseTokenizer;
|
||||
|
||||
/// Iteration state of a `JapaneseTokenizerStream`.
///
/// The enum only wraps a `usize`, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Cursor {
    /// `advance()` has not been called yet; there is no current token.
    HasNotStarted,
    /// Index, in the stream's token vector, of the current token.
    Cursor(usize),
    /// Every token has been consumed; there is no current token anymore.
    Terminated,
}
|
||||
|
||||
pub struct JapaneseTokenizerStream {
|
||||
tokens: Vec<Token>,
|
||||
cursor: Cursor,
|
||||
}
|
||||
|
||||
impl<'a> Tokenizer<'a> for JapaneseTokenizer {
|
||||
type TokenStreamImpl = JapaneseTokenizerStream;
|
||||
|
||||
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let mut tokens = vec![];
|
||||
let mut offset_from;
|
||||
let mut offset_to = 0;
|
||||
for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
|
||||
offset_from = offset_to;
|
||||
offset_to = offset_from + term.len();
|
||||
if term.chars().all(char::is_alphanumeric) {
|
||||
tokens.push(Token {
|
||||
offset_from,
|
||||
offset_to,
|
||||
position: pos,
|
||||
text: term,
|
||||
position_length: 1
|
||||
});
|
||||
}
|
||||
}
|
||||
JapaneseTokenizerStream {
|
||||
tokens,
|
||||
cursor: Cursor::HasNotStarted,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for JapaneseTokenizerStream {
|
||||
fn advance(&mut self) -> bool {
|
||||
let new_cursor = match self.cursor {
|
||||
Cursor::HasNotStarted => {
|
||||
if self.tokens.is_empty() {
|
||||
Cursor::Terminated
|
||||
} else {
|
||||
Cursor::Cursor(0)
|
||||
}
|
||||
}
|
||||
Cursor::Cursor(pos) => {
|
||||
let new_pos = pos + 1;
|
||||
if new_pos >= self.tokens.len() {
|
||||
Cursor::Terminated
|
||||
} else {
|
||||
Cursor::Cursor(new_pos)
|
||||
}
|
||||
}
|
||||
Cursor::Terminated => Cursor::Terminated,
|
||||
};
|
||||
self.cursor = new_cursor;
|
||||
self.cursor != Cursor::Terminated
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
match self.cursor {
|
||||
Cursor::Terminated => {
|
||||
panic!("You called .token(), after the end of the token stream has been reached");
|
||||
}
|
||||
Cursor::Cursor(i) => &self.tokens[i],
|
||||
Cursor::HasNotStarted => {
|
||||
panic!("You called .token(), before having called `.advance()`.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
match self.cursor {
|
||||
Cursor::Terminated => {
|
||||
panic!("You called .token(), after the end of the token stream has been reached");
|
||||
}
|
||||
Cursor::Cursor(i) => &mut self.tokens[i],
|
||||
Cursor::HasNotStarted => {
|
||||
panic!("You called .token(), before having called `.advance()`.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -130,7 +130,6 @@
|
||||
//!
|
||||
mod alphanum_only;
|
||||
mod facet_tokenizer;
|
||||
mod japanese_tokenizer;
|
||||
mod lower_caser;
|
||||
mod ngram_tokenizer;
|
||||
mod raw_tokenizer;
|
||||
@@ -144,7 +143,6 @@ mod tokenizer_manager;
|
||||
|
||||
pub use self::alphanum_only::AlphaNumOnlyFilter;
|
||||
pub use self::facet_tokenizer::FacetTokenizer;
|
||||
pub use self::japanese_tokenizer::JapaneseTokenizer;
|
||||
pub use self::lower_caser::LowerCaser;
|
||||
pub use self::ngram_tokenizer::NgramTokenizer;
|
||||
pub use self::raw_tokenizer::RawTokenizer;
|
||||
@@ -224,28 +222,6 @@ pub mod test {
|
||||
assert_token(&tokens[3], 3, "payer", 17, 22);
|
||||
}
|
||||
|
||||
#[test]
fn test_jp_tokenizer() {
    // The default manager registers the Japanese tokenizer under "ja".
    let manager = TokenizerManager::default();
    let ja_tokenizer = manager.get("ja").unwrap();

    let mut collected: Vec<Token> = vec![];
    {
        // Scoped so that the mutable borrow of `collected` ends before the
        // assertions below.
        let mut sink = |token: &Token| {
            collected.push(token.clone());
        };
        ja_tokenizer
            .token_stream("野菜食べないとやばい!")
            .process(&mut sink);
    }

    // Five tokens are expected; the trailing "!" is not one of them.
    assert_eq!(collected.len(), 5);
    assert_token(&collected[0], 0, "野菜", 0, 6);
    assert_token(&collected[1], 1, "食べ", 6, 12);
    assert_token(&collected[2], 2, "ない", 12, 18);
    assert_token(&collected[3], 3, "と", 18, 21);
    assert_token(&collected[4], 4, "やばい", 21, 30);
}
|
||||
|
||||
#[test]
|
||||
fn test_ngram_tokenizer() {
|
||||
use super::{LowerCaser, NgramTokenizer};
|
||||
|
||||
@@ -2,7 +2,6 @@ use std::collections::HashMap;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tokenizer::tokenizer::box_tokenizer;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::JapaneseTokenizer;
|
||||
use tokenizer::LowerCaser;
|
||||
use tokenizer::RawTokenizer;
|
||||
use tokenizer::RemoveLongFilter;
|
||||
@@ -74,7 +73,6 @@ impl Default for TokenizerManager {
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new()),
|
||||
);
|
||||
manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
|
||||
manager
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user