Paul Masurel
2016-05-01 15:08:44 +09:00
parent 3a2af1aa65
commit 389cdffb4b
7 changed files with 7 additions and 305 deletions

View File

@@ -1,95 +0,0 @@
extern crate regex;
use std::str::Chars;
pub struct TokenIter<'a> {
chars: Chars<'a>,
term_buffer: String,
}
fn append_char_lowercase(c: char, term_buffer: &mut String) {
for c_lower in c.to_lowercase() {
term_buffer.push(c_lower);
}
}
pub trait StreamingIterator<'a, T> {
fn next(&'a mut self) -> Option<T>;
}
impl<'a, 'b> TokenIter<'b> {
fn consume_token(&'a mut self) -> Option<&'a str> {
loop {
match self.chars.next() {
Some(c) => {
if c.is_alphanumeric() {
append_char_lowercase(c, &mut self.term_buffer);
}
else {
break;
}
},
None => {
break;
}
}
}
return Some(&self.term_buffer);
}
}
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
fn next(&'a mut self,) -> Option<&'a str> {
self.term_buffer.clear();
// skipping non-letter characters.
loop {
match self.chars.next() {
Some(c) => {
if c.is_alphanumeric() {
append_char_lowercase(c, &mut self.term_buffer);
return self.consume_token();
}
}
None => { return None; }
}
}
}
}
pub struct SimpleTokenizer;
impl SimpleTokenizer {
pub fn new() -> SimpleTokenizer {
SimpleTokenizer
}
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
TokenIter {
term_buffer: String::new(),
chars: text.chars(),
}
}
}
#[test]
fn test_tokenizer() {
let simple_tokenizer = SimpleTokenizer::new();
let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
assert_eq!(term_reader.next().unwrap(), "hello");
assert_eq!(term_reader.next().unwrap(), "happy");
assert_eq!(term_reader.next().unwrap(), "tax");
assert_eq!(term_reader.next().unwrap(), "payer");
assert_eq!(term_reader.next(), None);
}
#[test]
fn test_tokenizer_empty() {
let simple_tokenizer = SimpleTokenizer::new();
let mut term_reader = simple_tokenizer.tokenize("");
assert_eq!(term_reader.next(), None);
}
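
The file removed above is the analyzer module; per the lib.rs hunk at the end of this commit it now lives at the crate root (pub mod analyzer;) instead of under core. As a rough usage sketch of the streaming tokenizer it defines — assuming the crate is named tantivy, which this diff does not state, and a hypothetical helper collect_terms — a caller copies each token out of the iterator's internal buffer before advancing:

// Hypothetical call site; the crate name `tantivy` and the helper
// `collect_terms` are illustrative, not part of this commit.
use tantivy::analyzer::{SimpleTokenizer, StreamingIterator};

fn collect_terms(text: &str) -> Vec<String> {
    let tokenizer = SimpleTokenizer::new();
    let mut token_iter = tokenizer.tokenize(text);
    let mut terms = Vec::new();
    // StreamingIterator::next returns a &str borrowing the iterator's
    // term_buffer, so each token must be copied out before the next call.
    while let Some(term) = token_iter.next() {
        terms.push(term.to_string());
    }
    terms
}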

View File

@@ -1,201 +0,0 @@
use DocId;
use core::reader::SegmentReader;
use core::searcher::SegmentLocalId;
use core::searcher::DocAddress;
use fastfield::U32FastFieldReader;
use schema::U32Field;
use std::io;
pub trait Collector {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
fn collect(&mut self, doc_id: DocId);
}
pub struct FirstNCollector {
docs: Vec<DocAddress>,
current_segment: u32,
limit: usize,
}
impl FirstNCollector {
pub fn with_limit(limit: usize) -> FirstNCollector {
FirstNCollector {
docs: Vec::new(),
limit: limit,
current_segment: 0,
}
}
pub fn docs(self,) -> Vec<DocAddress> {
self.docs
}
}
impl Collector for FirstNCollector {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
self.current_segment = segment_local_id;
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
if self.docs.len() < self.limit {
self.docs.push(DocAddress(self.current_segment.clone(), doc_id));
}
}
}
pub struct CountCollector {
count: usize,
}
impl CountCollector {
pub fn new() -> CountCollector {
CountCollector {
count: 0,
}
}
pub fn count(&self,) -> usize {
self.count
}
}
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
Ok(())
}
fn collect(&mut self, _: DocId) {
self.count += 1;
}
}
pub struct MultiCollector<'a> {
collectors: Vec<&'a mut Collector>,
}
impl<'a> MultiCollector<'a> {
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
MultiCollector {
collectors: collectors,
}
}
}
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
for collector in self.collectors.iter_mut() {
try!(collector.set_segment(segment_local_id, segment));
}
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
for collector in self.collectors.iter_mut() {
collector.collect(doc_id);
}
}
}
pub struct TestCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
}
impl TestCollector {
pub fn new() -> TestCollector {
TestCollector {
docs: Vec::new(),
offset: 0,
segment_max_doc: 0,
}
}
pub fn docs(self,) -> Vec<DocId> {
self.docs
}
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
self.docs.push(doc_id + self.offset);
}
}
pub struct FastFieldTestCollector {
vals: Vec<u32>,
u32_field: U32Field,
ff_reader: Option<U32FastFieldReader>,
}
impl FastFieldTestCollector {
pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
FastFieldTestCollector {
vals: Vec::new(),
u32_field: u32_field,
ff_reader: None,
}
}
pub fn vals(&self,) -> &Vec<u32> {
&self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field)));
Ok(())
}
fn collect(&mut self, doc_id: DocId) {
let val = self.ff_reader.as_ref().unwrap().get(doc_id);
self.vals.push(val);
}
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::new();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc);
}
count_collector.count()
});
}
// #[bench]
// fn build_first_3_collector(b: &mut Bencher) {
// b.iter(|| {
// let mut first3collector = FirstNCollector::with_limit(3);
// let docs: Vec<u32> = (0..1_000_000).collect();
// for doc in docs {
// first3collector.collect(doc);
// }
// first3collector.docs()
// });
// }
}
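
The second removed file defines the Collector trait and its built-in implementations; it likewise moves to a root-level collector module. The driving contract is visible in the trait: set_segment is called once per segment with that segment's reader, then collect is called once per matching DocId local to that segment. A minimal custom collector against that contract might look like the following sketch — the struct itself and the crate name tantivy are illustrative, not taken from this commit:

// Hypothetical collector counting hits per segment, written against the
// Collector trait shown above. Crate name and import paths are assumptions.
use std::io;
use tantivy::{DocId, SegmentLocalId, SegmentReader};
use tantivy::collector::Collector;

struct PerSegmentCounts {
    counts: Vec<(SegmentLocalId, usize)>,
}

impl Collector for PerSegmentCounts {
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
        // Invoked once before the documents of each segment are collected.
        self.counts.push((segment_local_id, 0));
        Ok(())
    }
    fn collect(&mut self, _: DocId) {
        // Invoked once per matching document within the current segment.
        if let Some(last) = self.counts.last_mut() {
            last.1 += 1;
        }
    }
}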

View File

@@ -215,8 +215,8 @@ mod tests {
use schema::Term;
use core::index::Index;
use core::searcher::DocAddress;
use core::collector::FastFieldTestCollector;
use core::collector::TestCollector;
use collector::FastFieldTestCollector;
use collector::TestCollector;
#[test]
fn test_index_merger() {

View File

@@ -1,9 +1,7 @@
pub mod writer;
pub mod analyzer;
pub mod reader;
pub mod codec;
pub mod searcher;
pub mod collector;
pub mod index;
pub mod merger;

View File

@@ -3,7 +3,7 @@ use core::index::Index;
use core::index::Segment;
use DocId;
use schema::{Document, Term};
use core::collector::Collector;
use collector::Collector;
use std::io;
use common::TimerTree;

View File

@@ -5,9 +5,9 @@ use schema::Term;
use schema::TextFieldValue;
use core::codec::*;
use core::index::Index;
use core::analyzer::SimpleTokenizer;
use analyzer::SimpleTokenizer;
use core::index::SerializableSegment;
use core::analyzer::StreamingIterator;
use analyzer::StreamingIterator;
use core::index::Segment;
use core::index::SegmentInfo;
use postings::PostingsWriter;

View File

@@ -35,16 +35,16 @@ mod compression;
mod fastfield;
mod store;
mod common;
pub mod analyzer;
pub mod collector;
pub mod schema;
pub use directory::Directory;
pub use core::analyzer;
pub use core::searcher::Searcher;
pub use core::index::Index;
pub use schema::Term;
pub use schema::Document;
pub use core::collector;
pub use core::reader::SegmentReader;
pub use core::searcher::SegmentLocalId;
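
Net effect of this last hunk: analyzer and collector become first-class root modules (pub mod) rather than re-exports of core submodules (pub use core::...), and the remaining hunks update the crate-internal imports to match. Inside the crate the change is just the dropped core:: prefix, as in this small before/after taken from the hunks above:

// Before this commit (crate-internal imports):
use core::analyzer::SimpleTokenizer;
use core::collector::Collector;

// After this commit:
use analyzer::SimpleTokenizer;
use collector::Collector;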