This commit is contained in:
Paul Masurel
2018-09-09 16:23:56 +09:00
parent 23e97da9f6
commit 7e5f697d00
5 changed files with 0 additions and 123 deletions

View File

@@ -15,7 +15,6 @@ keywords = ["search", "information", "retrieval"]
base64 = "0.9.1"
byteorder = "1.0"
lazy_static = "1"
tinysegmenter = "0.1.0"
regex = "1.0"
fst = {version="0.3", default-features=false}
fst-regex = { version="0.2" }

View File

@@ -168,8 +168,6 @@ extern crate rand;
#[cfg(all(test, feature = "unstable"))]
extern crate test;
extern crate tinysegmenter;
#[macro_use]
extern crate downcast;

View File

@@ -1,94 +0,0 @@
use super::{Token, TokenStream, Tokenizer};
use tinysegmenter;
/// Simple Japanese tokenizer based on the `tinysegmenter` crate.
///
/// The input text is segmented with `tinysegmenter::tokenize`. Only the
/// segments made exclusively of alphanumeric characters are emitted as
/// tokens; the other segments are dropped.
#[derive(Clone)]
pub struct JapaneseTokenizer;
/// Progress of a `JapaneseTokenizerStream` through its list of tokens.
#[derive(PartialEq, Eq)]
enum Cursor {
    /// `advance()` has not been called yet: there is no current token.
    HasNotStarted,
    /// Index, in the token list, of the current token.
    Cursor(usize),
    /// The stream went past its last token: there is no current token.
    Terminated,
}
/// Token stream returned by `JapaneseTokenizer`.
///
/// All the tokens are computed when the stream is created; the stream then
/// simply walks through the stored list.
pub struct JapaneseTokenizerStream {
/// Tokens of the text, in the order in which they were produced.
tokens: Vec<Token>,
/// Where the stream currently stands in `tokens`.
cursor: Cursor,
}
impl<'a> Tokenizer<'a> for JapaneseTokenizer {
type TokenStreamImpl = JapaneseTokenizerStream;
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
let mut tokens = vec![];
let mut offset_from;
let mut offset_to = 0;
for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
offset_from = offset_to;
offset_to = offset_from + term.len();
if term.chars().all(char::is_alphanumeric) {
tokens.push(Token {
offset_from,
offset_to,
position: pos,
text: term,
position_length: 1
});
}
}
JapaneseTokenizerStream {
tokens,
cursor: Cursor::HasNotStarted,
}
}
}
impl TokenStream for JapaneseTokenizerStream {
    /// Moves to the next token.
    ///
    /// Returns `true` if a token is now available through `token()` /
    /// `token_mut()`, and `false` once the stream is exhausted. Calling
    /// `advance()` again after the end keeps returning `false`.
    fn advance(&mut self) -> bool {
        let next_index = match self.cursor {
            Cursor::HasNotStarted => 0,
            Cursor::Cursor(current) => current + 1,
            // Already exhausted: nothing to update.
            Cursor::Terminated => return false,
        };
        self.cursor = if next_index < self.tokens.len() {
            Cursor::Cursor(next_index)
        } else {
            Cursor::Terminated
        };
        self.cursor != Cursor::Terminated
    }

    /// Returns a reference to the current token.
    ///
    /// # Panics
    ///
    /// Panics if `advance()` has not been called yet, or if the end of the
    /// stream has been reached.
    fn token(&self) -> &Token {
        match self.cursor {
            Cursor::Cursor(index) => &self.tokens[index],
            Cursor::HasNotStarted => {
                panic!("You called .token(), before having called `.advance()`.");
            }
            Cursor::Terminated => {
                panic!("You called .token(), after the end of the token stream has been reached");
            }
        }
    }

    /// Returns a mutable reference to the current token.
    ///
    /// # Panics
    ///
    /// Panics if `advance()` has not been called yet, or if the end of the
    /// stream has been reached.
    fn token_mut(&mut self) -> &mut Token {
        match self.cursor {
            Cursor::Cursor(index) => &mut self.tokens[index],
            Cursor::HasNotStarted => {
                panic!("You called .token_mut(), before having called `.advance()`.");
            }
            Cursor::Terminated => {
                panic!(
                    "You called .token_mut(), after the end of the token stream has been reached"
                );
            }
        }
    }
}

View File

@@ -130,7 +130,6 @@
//!
mod alphanum_only;
mod facet_tokenizer;
mod japanese_tokenizer;
mod lower_caser;
mod ngram_tokenizer;
mod raw_tokenizer;
@@ -144,7 +143,6 @@ mod tokenizer_manager;
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::facet_tokenizer::FacetTokenizer;
pub use self::japanese_tokenizer::JapaneseTokenizer;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
@@ -224,28 +222,6 @@ pub mod test {
assert_token(&tokens[3], 3, "payer", 17, 22);
}
/// Checks the tokens produced by the tokenizer registered under `"ja"` on a
/// short Japanese sentence.
#[test]
fn test_jp_tokenizer() {
    let tokenizer_manager = TokenizerManager::default();
    let ja_tokenizer = tokenizer_manager.get("ja").unwrap();
    let mut tokens: Vec<Token> = vec![];
    {
        // Scoped so that the mutable borrow of `tokens` held by the closure
        // ends before the assertions below.
        let mut add_token = |token: &Token| {
            tokens.push(token.clone());
        };
        ja_tokenizer
            .token_stream("野菜食べないとやばい!")
            .process(&mut add_token);
    }
    assert_eq!(tokens.len(), 5);
    // Offsets are byte offsets: every kanji / kana here is 3 bytes in UTF-8.
    assert_token(&tokens[0], 0, "野菜", 0, 6);
    assert_token(&tokens[1], 1, "食べ", 6, 12);
    assert_token(&tokens[2], 2, "ない", 12, 18);
    assert_token(&tokens[3], 3, "と", 18, 21);
    assert_token(&tokens[4], 4, "やばい", 21, 30);
}
#[test]
fn test_ngram_tokenizer() {
use super::{LowerCaser, NgramTokenizer};

View File

@@ -2,7 +2,6 @@ use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use tokenizer::tokenizer::box_tokenizer;
use tokenizer::BoxedTokenizer;
use tokenizer::JapaneseTokenizer;
use tokenizer::LowerCaser;
use tokenizer::RawTokenizer;
use tokenizer::RemoveLongFilter;
@@ -74,7 +73,6 @@ impl Default for TokenizerManager {
.filter(LowerCaser)
.filter(Stemmer::new()),
);
manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
manager
}
}