mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 01:52:54 +00:00
added
This commit is contained in:
@@ -1,7 +1,3 @@
|
||||
|
||||
|
||||
// /usr/bin/c++ -Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -I/Users/pmasurel/github/FastPFor/headers -o CMakeFiles/example.dir/example.cpp.o -c /Users/pmasurel/github/FastPFor/example.cpp
|
||||
|
||||
#include <iostream>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -15,10 +11,6 @@ static shared_ptr<IntegerCODEC> codec = CODECFactory::getFromName("s4-bp128-dm"
|
||||
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
|
||||
|
||||
size_t encode_native(
|
||||
uint32_t* begin,
|
||||
const size_t num_els,
|
||||
|
||||
@@ -12,14 +12,23 @@ pub struct TokenIter<'a> {
|
||||
chars: Chars<'a>,
|
||||
}
|
||||
|
||||
|
||||
/// Appends the lowercase form of `c` to `term_buffer`.
///
/// `char::to_lowercase` returns an iterator because a single character may
/// lowercase to several characters; all of them are appended, in order.
fn append_char(c: char, term_buffer: &mut String) {
    term_buffer.extend(c.to_lowercase());
}
|
||||
|
||||
impl<'a> TokenIter<'a> {
|
||||
|
||||
|
||||
pub fn read_one(&mut self, term_buffer: &mut String) -> bool {
|
||||
term_buffer.clear();
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
term_buffer.push(c);
|
||||
append_char(c, term_buffer);
|
||||
break;
|
||||
}
|
||||
else {
|
||||
@@ -35,7 +44,7 @@ impl<'a> TokenIter<'a> {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
term_buffer.push(c);
|
||||
append_char(c, term_buffer);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
|
||||
@@ -37,7 +37,6 @@ impl SegmentPostings {
|
||||
pub fn from_data(data: &[u8]) -> SegmentPostings {
|
||||
let mut cursor = Cursor::new(data);
|
||||
let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
|
||||
println!("doc_freq {}", doc_freq);
|
||||
let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize;
|
||||
// TODO remove allocs
|
||||
let mut data = Vec::with_capacity(data_size);
|
||||
@@ -47,9 +46,6 @@ impl SegmentPostings {
|
||||
let mut doc_ids: Vec<u32> = (0..doc_freq as u32 ).collect();
|
||||
let decoder = Decoder::new();
|
||||
decoder.decode(&data, &mut doc_ids);
|
||||
for a in doc_ids.iter() {
|
||||
println!("uncompressed {}", a);
|
||||
}
|
||||
SegmentPostings {
|
||||
doc_ids: doc_ids,
|
||||
doc_id: 0,
|
||||
@@ -125,10 +121,8 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings> {
|
||||
println!("Term {:?}", term);
|
||||
match self.term_offsets.get(term.as_slice()) {
|
||||
Some(offset) => {
|
||||
println!("offset {}", offset);
|
||||
Some(self.read_postings(offset as usize))
|
||||
},
|
||||
None => None,
|
||||
@@ -136,10 +130,19 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
pub fn search(&self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings> {
|
||||
let segment_postings: Vec<SegmentPostings> = terms
|
||||
.iter()
|
||||
.map(|term| self.get_term(term).unwrap())
|
||||
.collect();
|
||||
|
||||
let mut segment_postings: Vec<SegmentPostings> = Vec::new();
|
||||
for term in terms.iter() {
|
||||
match self.get_term(term) {
|
||||
Some(segment_posting) => {
|
||||
segment_postings.push(segment_posting);
|
||||
}
|
||||
None => {
|
||||
segment_postings.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
IntersectionPostings::from_postings(segment_postings)
|
||||
}
|
||||
|
||||
|
||||
@@ -10,11 +10,22 @@ pub struct Searcher {
|
||||
|
||||
impl Searcher {
|
||||
pub fn for_directory(directory: Directory) -> Searcher {
|
||||
let mut segment_readers: Vec<SegmentReader> = Vec::new();
|
||||
for segment in directory.segments().into_iter() {
|
||||
println!("{:?}", segment);
|
||||
match SegmentReader::open(segment.clone()) {
|
||||
Ok(segment_reader) => {
|
||||
segment_readers.push(segment_reader);
|
||||
println!("opened {:?}", segment);
|
||||
}
|
||||
Err(err) => {
|
||||
// TODO return err
|
||||
println!("Error while opening {:?}, {:?}", segment, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
Searcher {
|
||||
segments: directory.segments()
|
||||
.into_iter()
|
||||
.map(|segment| SegmentReader::open(segment).unwrap() ) // TODO error handling
|
||||
.collect()
|
||||
segments: segment_readers
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,16 +26,14 @@ impl Encoder {
|
||||
pub fn encode(&mut self, input: &[u32]) -> &[u32] {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len > self.input_buffer.len() {
|
||||
println!("resising {}", input_len);
|
||||
self.input_buffer = (0..input_len as u32 + 10 ).collect();
|
||||
self.output_buffer = (0..input_len as u32 + 10).collect();
|
||||
if input_len >= self.input_buffer.len() {
|
||||
self.input_buffer = (0..input_len as u32).collect();
|
||||
self.output_buffer = (0..input_len as u32 + 1000).collect();
|
||||
// TODO use resize when available
|
||||
}
|
||||
println!("self.input_buffer {}", self.input_buffer.len());
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
// TODO use clone_from when available
|
||||
let written_size = encode_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
@@ -61,43 +59,28 @@ impl Decoder {
|
||||
compressed_data: &[u32],
|
||||
uncompressed_values: &mut [u32]) -> size_t {
|
||||
unsafe {
|
||||
let num_elements = decode_native(
|
||||
return decode_native(
|
||||
compressed_data.as_ptr(),
|
||||
compressed_data.len() as size_t,
|
||||
uncompressed_values.as_mut_ptr(),
|
||||
uncompressed_values.len() as size_t);
|
||||
return num_elements;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Encodes nine small primes and checks the length of the compressed output.
#[test]
fn test_encode_decode() {
    let primes: Vec<u32> = vec![2, 3, 5, 7, 11, 13, 17, 19, 23];
    let mut encoder = Encoder::new();
    let compressed = encoder.encode(&primes);
    assert_eq!(compressed.len(), 4);
    // The decode half of the round trip is disabled in the original:
    // let decoder = Decoder::new();
    // let mut data_output: Vec<u32> = (0..100).collect();
    // assert_eq!(9, decoder.decode(&data[0..4], &mut data_output));
    // for i in 0..9 {
    //     assert_eq!(data_output[i], input[i]) ;
    // }
}
|
||||
|
||||
|
||||
|
||||
/// Round-trips 100000 consecutive integers through `Encoder` and `Decoder`.
///
/// Checks the compressed length, the number of decoded elements, and that
/// every decoded value matches the input.
#[test]
fn test_encode_big() {
    let mut encoder = Encoder::new();
    let input: Vec<u32> = (0..100000).collect();
    let data = encoder.encode(&input);
    assert_eq!(data.len(), 9578);
    let decoder = Decoder::new();
    // Start from zeros rather than `0..100000`: a buffer pre-filled with the
    // expected values would make the comparison below pass even if `decode`
    // wrote nothing.
    let mut data_output: Vec<u32> = vec![0u32; input.len()];
    assert_eq!(100000, decoder.decode(&data[0..9578], &mut data_output));
    for i in 0..100000 {
        assert_eq!(data_output[i], input[i]);
    }
}
|
||||
|
||||
@@ -115,7 +115,6 @@ impl SegmentWriter {
|
||||
let mut tokens = self.tokenizer.tokenize(&field_value.text);
|
||||
while tokens.read_one(&mut term_buffer) {
|
||||
let term = Term::from_field_text(&field_value.field, term_buffer.as_ref());
|
||||
println!("token {:?}", term);
|
||||
self.suscribe(doc_id, term);
|
||||
self.num_tokens += 1;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user