mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-14 04:52:54 +00:00
blop
This commit is contained in:
@@ -1,95 +0,0 @@
|
||||
extern crate regex;
|
||||
|
||||
use std::str::Chars;
|
||||
|
||||
/// Streaming token source over a borrowed slice of text.
///
/// Holds a character cursor into the underlying text plus a single scratch
/// buffer that each token is lowercased into, so tokenization allocates one
/// buffer for the whole text rather than one `String` per token.
pub struct TokenIter<'a> {
    // Scratch buffer holding the lowercased text of the current token;
    // cleared and refilled on every call to `next`.
    term_buffer: String,
    // Remaining, not-yet-consumed characters of the text being tokenized.
    chars: Chars<'a>,
}
|
||||
|
||||
/// Appends the lowercase form of `c` to `term_buffer`.
///
/// Unicode lowercasing may expand one character into several (e.g. 'İ',
/// U+0130, lowercases to "i" followed by a combining dot above), so the
/// whole `to_lowercase` iterator is appended, not a single char.
fn append_char_lowercase(c: char, term_buffer: &mut String) {
    // `String: Extend<char>` consumes the lowercasing iterator directly,
    // replacing the manual per-char push loop.
    term_buffer.extend(c.to_lowercase());
}
|
||||
|
||||
/// A lending-style iterator: the item returned by `next` may borrow from the
/// iterator itself (here, a `&str` slice of the tokenizer's internal buffer),
/// which a plain `Iterator` cannot express.
pub trait StreamingIterator<'a, T> {
    /// Advances the iterator and returns the next item, or `None` when
    /// exhausted. The returned item borrows `self` for `'a`.
    fn next(&'a mut self) -> Option<T>;
}
|
||||
|
||||
impl<'a, 'b> TokenIter<'b> {
|
||||
fn consume_token(&'a mut self) -> Option<&'a str> {
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
append_char_lowercase(c, &mut self.term_buffer);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Some(&self.term_buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
|
||||
|
||||
fn next(&'a mut self,) -> Option<&'a str> {
|
||||
self.term_buffer.clear();
|
||||
// skipping non-letter characters.
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
append_char_lowercase(c, &mut self.term_buffer);
|
||||
return self.consume_token();
|
||||
}
|
||||
}
|
||||
None => { return None; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tokenizer that splits text on non-alphanumeric characters and lowercases
/// every token. Stateless, so one instance can be reused across documents.
pub struct SimpleTokenizer;
|
||||
|
||||
|
||||
impl SimpleTokenizer {
|
||||
pub fn new() -> SimpleTokenizer {
|
||||
SimpleTokenizer
|
||||
}
|
||||
|
||||
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
|
||||
TokenIter {
|
||||
term_buffer: String::new(),
|
||||
chars: text.chars(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
fn test_tokenizer() {
    // Punctuation and whitespace act as separators and are dropped.
    let tokenizer = SimpleTokenizer::new();
    let mut token_stream = tokenizer.tokenize("hello, happy tax payer!");
    for expected in ["hello", "happy", "tax", "payer"].iter() {
        assert_eq!(token_stream.next().unwrap(), *expected);
    }
    assert_eq!(token_stream.next(), None);
}
|
||||
|
||||
|
||||
#[test]
fn test_tokenizer_empty() {
    // An empty input yields no tokens at all.
    let tokenizer = SimpleTokenizer::new();
    let mut token_stream = tokenizer.tokenize("");
    assert_eq!(token_stream.next(), None);
}
|
||||
@@ -1,201 +0,0 @@
|
||||
use DocId;
|
||||
use core::reader::SegmentReader;
|
||||
use core::searcher::SegmentLocalId;
|
||||
use core::searcher::DocAddress;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use schema::U32Field;
|
||||
use std::io;
|
||||
|
||||
/// Visitor invoked for every document matching a query.
pub trait Collector {
    /// Called when collection moves to a new segment, before any `collect`
    /// call for that segment. `segment_local_id` identifies the segment
    /// within the searcher; `segment` gives access to the segment's data.
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
    /// Called once per matching document; `doc_id` is local to the segment
    /// installed by the latest `set_segment` call.
    fn collect(&mut self, doc_id: DocId);
}
|
||||
|
||||
/// Collector that keeps the addresses of the first `limit` documents it is
/// handed and silently ignores the rest.
pub struct FirstNCollector {
    // Addresses collected so far (at most `limit` entries).
    docs: Vec<DocAddress>,
    // Ordinal of the segment currently being collected; combined with the
    // segment-local doc id to form a `DocAddress`.
    current_segment: u32,
    // Maximum number of documents to retain.
    limit: usize,
}
|
||||
|
||||
impl FirstNCollector {
|
||||
pub fn with_limit(limit: usize) -> FirstNCollector {
|
||||
FirstNCollector {
|
||||
docs: Vec::new(),
|
||||
limit: limit,
|
||||
current_segment: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn docs(self,) -> Vec<DocAddress> {
|
||||
self.docs
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FirstNCollector {
|
||||
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
|
||||
self.current_segment = segment_local_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
if self.docs.len() < self.limit {
|
||||
self.docs.push(DocAddress(self.current_segment.clone(), doc_id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector that only counts how many documents matched the query.
pub struct CountCollector {
    // Number of documents collected so far.
    count: usize,
}
|
||||
|
||||
impl CountCollector {
|
||||
pub fn new() -> CountCollector {
|
||||
CountCollector {
|
||||
count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn count(&self,) -> usize {
|
||||
self.count
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for CountCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, _: DocId) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector that fans every event out to a list of child collectors, so
/// several collections can run over a single query execution.
pub struct MultiCollector<'a> {
    // Child collectors; each receives every `set_segment`/`collect` call.
    collectors: Vec<&'a mut Collector>,
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector {
|
||||
collectors: collectors,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
|
||||
for collector in self.collectors.iter_mut() {
|
||||
try!(collector.set_segment(segment_local_id, segment));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
for collector in self.collectors.iter_mut() {
|
||||
collector.collect(doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test-only collector that records the ids of all collected documents,
/// translating segment-local ids into globally increasing ids.
pub struct TestCollector {
    // Sum of the `max_doc` of all previously visited segments; added to each
    // segment-local doc id to globalize it.
    offset: DocId,
    // `max_doc` of the segment currently being collected.
    segment_max_doc: DocId,
    // Globalized ids of every document collected so far.
    docs: Vec<DocId>,
}
|
||||
|
||||
impl TestCollector {
|
||||
pub fn new() -> TestCollector {
|
||||
TestCollector {
|
||||
docs: Vec::new(),
|
||||
offset: 0,
|
||||
segment_max_doc: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn docs(self,) -> Vec<DocId> {
|
||||
self.docs
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
self.docs.push(doc_id + self.offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Test-only collector that reads a u32 fast-field value for every
/// collected document.
pub struct FastFieldTestCollector {
    // Fast-field values gathered so far, in collection order.
    vals: Vec<u32>,
    // Field whose fast-field values are read.
    u32_field: U32Field,
    // Reader for the current segment; `None` until `set_segment` runs.
    ff_reader: Option<U32FastFieldReader>,
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
u32_field: u32_field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(&self,) -> &Vec<u32> {
|
||||
&self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
|
||||
self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc_id: DocId) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc_id);
|
||||
self.vals.push(val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {

    use super::*;
    use test::Bencher;

    #[bench]
    fn build_collector(b: &mut Bencher) {
        b.iter(|| {
            // Feed 1M doc ids through a fresh CountCollector per iteration;
            // materializing the Vec first is part of the measured work.
            let mut counter = CountCollector::new();
            let doc_ids: Vec<u32> = (0..1_000_000).collect();
            for doc_id in doc_ids {
                counter.collect(doc_id);
            }
            counter.count()
        });
    }

    // Disabled benchmark kept for reference: same drill against
    // `FirstNCollector::with_limit(3)`.
    //
    // #[bench]
    // fn build_first_3_collector(b: &mut Bencher) {
    //     b.iter(|| {
    //         let mut first3collector = FirstNCollector::with_limit(3);
    //         let docs: Vec<u32> = (0..1_000_000).collect();
    //         for doc in docs {
    //             first3collector.collect(doc);
    //         }
    //         first3collector.docs()
    //     });
    // }
}
|
||||
@@ -215,8 +215,8 @@ mod tests {
|
||||
use schema::Term;
|
||||
use core::index::Index;
|
||||
use core::searcher::DocAddress;
|
||||
use core::collector::FastFieldTestCollector;
|
||||
use core::collector::TestCollector;
|
||||
use collector::FastFieldTestCollector;
|
||||
use collector::TestCollector;
|
||||
|
||||
#[test]
|
||||
fn test_index_merger() {
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
pub mod writer;
|
||||
pub mod analyzer;
|
||||
pub mod reader;
|
||||
pub mod codec;
|
||||
pub mod searcher;
|
||||
pub mod collector;
|
||||
pub mod index;
|
||||
pub mod merger;
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use core::index::Index;
|
||||
use core::index::Segment;
|
||||
use DocId;
|
||||
use schema::{Document, Term};
|
||||
use core::collector::Collector;
|
||||
use collector::Collector;
|
||||
use std::io;
|
||||
use common::TimerTree;
|
||||
|
||||
|
||||
@@ -5,9 +5,9 @@ use schema::Term;
|
||||
use schema::TextFieldValue;
|
||||
use core::codec::*;
|
||||
use core::index::Index;
|
||||
use core::analyzer::SimpleTokenizer;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use core::index::SerializableSegment;
|
||||
use core::analyzer::StreamingIterator;
|
||||
use analyzer::StreamingIterator;
|
||||
use core::index::Segment;
|
||||
use core::index::SegmentInfo;
|
||||
use postings::PostingsWriter;
|
||||
|
||||
@@ -35,16 +35,16 @@ mod compression;
|
||||
mod fastfield;
|
||||
mod store;
|
||||
mod common;
|
||||
pub mod analyzer;
|
||||
pub mod collector;
|
||||
|
||||
pub mod schema;
|
||||
|
||||
pub use directory::Directory;
|
||||
pub use core::analyzer;
|
||||
pub use core::searcher::Searcher;
|
||||
pub use core::index::Index;
|
||||
pub use schema::Term;
|
||||
pub use schema::Document;
|
||||
pub use core::collector;
|
||||
pub use core::reader::SegmentReader;
|
||||
pub use core::searcher::SegmentLocalId;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user