This commit is contained in:
Paul Masurel
2016-05-01 15:23:35 +09:00
55 changed files with 2809 additions and 2430 deletions

View File

@@ -1,94 +0,0 @@
extern crate regex;
use std::str::Chars;
use std::ascii::AsciiExt;
/// Streaming tokenizer state over a borrowed text.
///
/// The token currently being produced is accumulated in `term_buffer`,
/// which is reused from one token to the next.
pub struct TokenIter<'a> {
    /// Characters of the input text that have not been consumed yet.
    chars: Chars<'a>,
    /// Scratch buffer holding the token being assembled.
    term_buffer: String,
}
/// Appends the lowercase form of `c` to `term_buffer`.
///
/// Lowercasing is Unicode-aware. The tokenizer selects token characters with
/// `char::is_alphanumeric`, which accepts non-ASCII letters; an ASCII-only
/// conversion would leave characters such as `É` in uppercase, so the same
/// word could be emitted as two different terms depending on its casing.
/// A character whose lowercase form spans several characters contributes
/// all of them.
fn append_char_lowercase(c: char, term_buffer: &mut String) {
    term_buffer.extend(c.to_lowercase());
}
/// Iterator-like trait whose items may borrow from the iterator itself.
///
/// The lifetime `'a` is the lifetime of the mutable borrow taken by `next`,
/// which lets `T` be a reference into the iterator's own state.
pub trait StreamingIterator<'a, T> {
    /// Advances the iterator and returns the next item, if any.
    fn next(&'a mut self) -> Option<T>;
}
impl<'a, 'b> TokenIter<'b> {
    /// Appends the rest of the current token to `term_buffer` and returns it.
    ///
    /// Characters are consumed until the first non-alphanumeric character
    /// (which is consumed as well and dropped) or the end of the input.
    /// Always returns `Some`.
    fn consume_token(&'a mut self) -> Option<&'a str> {
        while let Some(c) = self.chars.next() {
            if !c.is_alphanumeric() {
                break;
            }
            append_char_lowercase(c, &mut self.term_buffer);
        }
        Some(&self.term_buffer)
    }
}
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
    /// Returns the next lowercased token, or `None` once the text is exhausted.
    ///
    /// The returned slice points into the internal buffer, which is cleared
    /// at the start of every call.
    fn next(&'a mut self) -> Option<&'a str> {
        self.term_buffer.clear();
        // Skip separators until the first character of a token shows up.
        while let Some(c) = self.chars.next() {
            if c.is_alphanumeric() {
                append_char_lowercase(c, &mut self.term_buffer);
                return self.consume_token();
            }
        }
        None
    }
}
/// Stateless tokenizer splitting text on non-alphanumeric characters.
pub struct SimpleTokenizer;

impl SimpleTokenizer {
    /// Creates a tokenizer.
    pub fn new() -> SimpleTokenizer {
        SimpleTokenizer
    }

    /// Returns a streaming iterator over the tokens of `text`.
    pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
        let chars = text.chars();
        let term_buffer = String::new();
        TokenIter {
            chars: chars,
            term_buffer: term_buffer,
        }
    }
}
#[test]
fn test_tokenizer() {
    // Punctuation and whitespace separate tokens and are never emitted.
    let tokenizer = SimpleTokenizer::new();
    let mut tokens = tokenizer.tokenize("hello, happy tax payer!");
    for expected in ["hello", "happy", "tax", "payer"].iter() {
        assert_eq!(tokens.next().unwrap(), *expected);
    }
    assert_eq!(tokens.next(), None);
}
#[test]
fn test_tokenizer_empty() {
    // An empty text yields no token at all.
    let tokenizer = SimpleTokenizer::new();
    let mut tokens = tokenizer.tokenize("");
    assert_eq!(tokens.next(), None);
}

View File

@@ -4,11 +4,12 @@ use rustc_serialize::json;
use core::index::Segment;
use core::index::SegmentInfo;
use core::index::SegmentComponent;
use core::fastfield::FastFieldSerializer;
use core::store::StoreWriter;
use core::postings::PostingsSerializer;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use core::convert_to_ioerror;
use postings::PostingsSerializer;
pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter,

View File

@@ -1,190 +0,0 @@
use core::schema::DocId;
use core::reader::SegmentReader;
use core::searcher::SegmentLocalId;
use core::searcher::DocAddress;
use core::fastfield::U32FastFieldReader;
use core::schema::U32Field;
use std::io;
/// Receives matching documents, segment by segment.
pub trait Collector {
    /// Tells the collector which segment the following `collect` calls refer to.
    ///
    /// NOTE(review): presumably invoked once per segment before its documents
    /// are collected — confirm against the searcher.
    fn set_segment(
        &mut self,
        segment_local_id: SegmentLocalId,
        segment: &SegmentReader,
    ) -> io::Result<()>;

    /// Records one matching document of the current segment.
    fn collect(&mut self, doc_id: DocId);
}
/// Collector keeping the addresses of the first `limit` documents it is given.
pub struct FirstNCollector {
    /// Addresses collected so far, in collection order.
    docs: Vec<DocAddress>,
    /// Local id of the segment currently being collected.
    current_segment: u32,
    /// Maximum number of addresses to keep.
    limit: usize,
}

impl FirstNCollector {
    /// Creates a collector that keeps at most `limit` documents.
    pub fn with_limit(limit: usize) -> FirstNCollector {
        let collected: Vec<DocAddress> = Vec::new();
        FirstNCollector {
            docs: collected,
            current_segment: 0,
            limit: limit,
        }
    }

    /// Consumes the collector and returns the collected addresses.
    pub fn docs(self) -> Vec<DocAddress> {
        let FirstNCollector { docs, .. } = self;
        docs
    }
}
impl Collector for FirstNCollector {
    /// Remembers the segment id used to build the following addresses.
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
        self.current_segment = segment_local_id;
        Ok(())
    }

    /// Stores the document address unless the limit has already been reached.
    fn collect(&mut self, doc_id: DocId) {
        if self.docs.len() >= self.limit {
            return;
        }
        let address = DocAddress(self.current_segment, doc_id);
        self.docs.push(address);
    }
}
/// Collector that only counts the documents it is given.
pub struct CountCollector {
    /// Number of documents seen so far.
    count: usize,
}

impl CountCollector {
    /// Creates a collector whose count starts at zero.
    pub fn new() -> CountCollector {
        CountCollector { count: 0 }
    }

    /// Returns the number of documents collected so far.
    pub fn count(&self) -> usize {
        self.count
    }
}
impl Collector for CountCollector {
    /// Segment changes are irrelevant for counting; always succeeds.
    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
        Ok(())
    }

    /// Counts one more document, whatever its id.
    fn collect(&mut self, _: DocId) {
        self.count = self.count + 1;
    }
}
/// Collector forwarding every call to two borrowed collectors.
pub struct PairCollector<'a, 'b, CollectorLeft, CollectorRight>
    where CollectorLeft: Collector + 'a,
          CollectorRight: Collector + 'b
{
    /// Collector notified first.
    left: &'a mut CollectorLeft,
    /// Collector notified second.
    right: &'b mut CollectorRight,
}

impl<'a, 'b, CollectorLeft, CollectorRight> PairCollector<'a, 'b, CollectorLeft, CollectorRight>
    where CollectorLeft: Collector + 'a,
          CollectorRight: Collector + 'b
{
    /// Builds a pair out of two mutable borrows.
    pub fn from(left: &'a mut CollectorLeft,
                right: &'b mut CollectorRight)
                -> PairCollector<'a, 'b, CollectorLeft, CollectorRight> {
        PairCollector {
            left: left,
            right: right,
        }
    }
}
impl<'a, 'b, CollectorLeft: Collector + 'a, CollectorRight: Collector + 'b>
    Collector for PairCollector<'a, 'b, CollectorLeft, CollectorRight>
{
    /// Forwards the segment change to the left collector, then to the right one.
    ///
    /// If the left collector fails, the right one is not notified.
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
        try!(self.left.set_segment(segment_local_id, segment));
        self.right.set_segment(segment_local_id, segment)
    }

    /// Forwards the document to both collectors, left first.
    fn collect(&mut self, doc_id: DocId) {
        self.left.collect(doc_id);
        self.right.collect(doc_id);
    }
}
/// Collector recording document ids shifted by a per-segment offset.
pub struct TestCollector {
    /// Sum of `max_doc` over the segments seen before the current one.
    offset: DocId,
    /// `max_doc` of the current segment.
    segment_max_doc: DocId,
    /// Shifted document ids, in collection order.
    docs: Vec<DocId>,
}

impl TestCollector {
    /// Creates an empty collector.
    pub fn new() -> TestCollector {
        TestCollector {
            offset: 0,
            segment_max_doc: 0,
            docs: Vec::new(),
        }
    }

    /// Consumes the collector and returns the recorded ids.
    pub fn docs(self) -> Vec<DocId> {
        let TestCollector { docs, .. } = self;
        docs
    }
}
impl Collector for TestCollector {
    /// Moves the offset past the previous segment and records the new segment's `max_doc`.
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
        let previous_max_doc = self.segment_max_doc;
        self.offset += previous_max_doc;
        self.segment_max_doc = reader.max_doc();
        Ok(())
    }

    /// Records the document id shifted by the current offset.
    fn collect(&mut self, doc_id: DocId) {
        let shifted = doc_id + self.offset;
        self.docs.push(shifted);
    }
}
/// Collector recording the value of one u32 fast field for every document.
pub struct FastFieldTestCollector {
    /// Values read so far, in collection order.
    vals: Vec<u32>,
    /// Field whose values are read.
    u32_field: U32Field,
    /// Reader for the current segment; `None` until `set_segment` is called.
    ff_reader: Option<U32FastFieldReader>,
}

impl FastFieldTestCollector {
    /// Creates a collector reading `u32_field`.
    pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector {
        FastFieldTestCollector {
            ff_reader: None,
            u32_field: u32_field,
            vals: Vec::new(),
        }
    }

    /// Returns the values collected so far.
    pub fn vals(&self) -> &Vec<u32> {
        let FastFieldTestCollector { ref vals, .. } = *self;
        vals
    }
}
impl Collector for FastFieldTestCollector {
    /// Opens the fast field reader of the new segment.
    ///
    /// On error the previously stored reader is left untouched.
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
        let ff_reader = try!(reader.get_fast_field_reader(&self.u32_field));
        self.ff_reader = Some(ff_reader);
        Ok(())
    }

    /// Reads and stores the field value of `doc_id`.
    ///
    /// Panics if `set_segment` has not successfully been called before.
    fn collect(&mut self, doc_id: DocId) {
        let val = {
            let ff_reader = self.ff_reader.as_ref().unwrap();
            ff_reader.get(doc_id)
        };
        self.vals.push(val);
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::Bencher;

    /// Measures building a vector of one million doc ids and counting them.
    #[bench]
    fn build_collector(b: &mut Bencher) {
        b.iter(|| {
            let mut counter = CountCollector::new();
            let doc_ids: Vec<u32> = (0..1_000_000).collect();
            for doc_id in doc_ids {
                counter.collect(doc_id);
            }
            counter.count()
        });
    }
}

View File

@@ -1,305 +0,0 @@
use std::io::BufWriter;
use std::marker::Send;
use std::marker::Sync;
use std::io;
use std::io::Cursor;
use std::io::Write;
use std::io::Seek;
use std::io::SeekFrom;
use std::fs::File;
use std::fmt;
use std::collections::HashMap;
use std::collections::hash_map::Entry as HashMapEntry;
use fst::raw::MmapReadOnly;
use atomicwrites;
use std::sync::Arc;
use std::sync::RwLock;
use tempdir::TempDir;
use std::ops::Deref;
use std::path::{Path, PathBuf};
///////////////////////////////////////////////////////////////
/// Read-only bytes of a file held by a `Directory`.
pub enum ReadOnlySource {
    /// Bytes backed by a read-only memory map.
    Mmap(MmapReadOnly),
    /// Bytes held in an owned in-memory buffer.
    Anonymous(Vec<u8>),
}
impl Deref for ReadOnlySource {
    type Target = [u8];

    /// Dereferences to the underlying bytes.
    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}
impl ReadOnlySource {
pub fn len(&self,) -> usize {
self.as_slice().len()
}
pub fn as_slice(&self,) -> &[u8] {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}
pub fn cursor<'a>(&'a self) -> Cursor<&'a [u8]> {
Cursor::new(&self.deref())
}
pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
}
ReadOnlySource::Anonymous(ref shared_vec) => {
let sliced_data: Vec<u8> = Vec::from(&shared_vec[from_offset..to_offset]);
ReadOnlySource::Anonymous(sliced_data)
},
}
}
}
impl Clone for ReadOnlySource {
    /// Clones the source by slicing its full range.
    fn clone(&self) -> Self {
        let end = self.len();
        self.slice(0, end)
    }
}
/// Marker trait for writers that also support seeking.
pub trait SeekableWrite: Seek + Write {}

/// Every type that can both seek and write is a `SeekableWrite`.
impl<T> SeekableWrite for T where T: Seek + Write {}

/// Boxed, type-erased seekable writer, as returned by `Directory::open_write`.
pub type WritePtr = Box<SeekableWrite>;
//
// #[derive(Debug)]
// pub enum CreateError {
// RootDirectoryDoesNotExist,
// DirectoryAlreadyExists,
// CannotCreateTempDirectory(io::Error),
// }
/// Storage abstraction: a flat set of files addressed by relative path.
pub trait Directory: fmt::Debug + Send + Sync {
    /// Opens the file at `path` for reading.
    fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource>;

    /// Opens the file at `path` for writing and returns a seekable writer.
    fn open_write(&mut self, path: &Path) -> io::Result<WritePtr>;

    /// Writes `data` to the file at `path`.
    ///
    /// NOTE(review): the name implies all-or-nothing replacement of the file;
    /// the actual guarantee depends on the implementation.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;

    /// Syncs the file at `path`.
    fn sync(&self, path: &Path) -> io::Result<()>;

    /// Syncs the directory itself.
    fn sync_directory(&self) -> io::Result<()>;
}
////////////////////////////////////////////////////////////////
// MmapDirectory
/// Directory backed by files on disk, read through memory maps.
pub struct MmapDirectory {
    /// Directory all relative paths are resolved against.
    root_path: PathBuf,
    /// Memory maps already opened, keyed by resolved path.
    mmap_cache: RwLock<HashMap<PathBuf, MmapReadOnly>>,
    /// Keeps the temporary directory alive when the directory was created by
    /// `create_from_tempdir`; `None` otherwise.
    _temp_directory: Option<TempDir>,
}

impl fmt::Debug for MmapDirectory {
    /// Formats as `MmapDirectory(<root path>)`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let root_path = &self.root_path;
        write!(f, "MmapDirectory({:?})", root_path)
    }
}

impl MmapDirectory {
    /// Creates a directory rooted in a fresh temporary directory, which is
    /// owned by the returned value.
    pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
        // TODO error management
        let tempdir = try!(TempDir::new("index"));
        let root_path = tempdir.path().to_path_buf();
        Ok(MmapDirectory {
            root_path: root_path,
            mmap_cache: RwLock::new(HashMap::new()),
            _temp_directory: Some(tempdir),
        })
    }

    /// Creates a directory rooted at `filepath`. The path is not checked.
    pub fn create(filepath: &Path) -> io::Result<MmapDirectory> {
        let directory = MmapDirectory {
            root_path: filepath.to_path_buf(),
            mmap_cache: RwLock::new(HashMap::new()),
            _temp_directory: None,
        };
        Ok(directory)
    }

    /// Joins `relative_path` onto the root path.
    fn resolve_path(&self, relative_path: &Path) -> PathBuf {
        let root: &Path = &self.root_path;
        root.join(relative_path)
    }
}
impl Directory for MmapDirectory {
    /// Returns a memory-mapped view of the file, reusing a cached map if the
    /// path has been opened before.
    ///
    /// Panics if the cache lock is poisoned.
    fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource> {
        let full_path = self.resolve_path(path);
        let mut mmap_cache = self.mmap_cache.write().unwrap();
        if let Some(cached) = mmap_cache.get(&full_path) {
            return Ok(ReadOnlySource::Mmap(cached.clone()));
        }
        // Not cached yet: map the file and remember the mapping.
        let new_mmap = try!(MmapReadOnly::open_path(full_path.clone()));
        mmap_cache.insert(full_path, new_mmap.clone());
        Ok(ReadOnlySource::Mmap(new_mmap))
    }

    /// Creates (or truncates) the file and returns a buffered writer on it.
    fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
        let file = try!(File::create(self.resolve_path(path)));
        Ok(Box::new(BufWriter::new(file)))
    }

    /// Writes `data` through `atomicwrites`, overwriting any existing file.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
        let full_path = self.resolve_path(path);
        atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite)
            .write(|f| f.write_all(data))
    }

    /// Calls `sync_all` on the file at `path`.
    fn sync(&self, path: &Path) -> io::Result<()> {
        let full_path = self.resolve_path(path);
        let fd = try!(File::open(&full_path));
        fd.sync_all()
    }

    /// Calls `sync_all` on the root directory.
    fn sync_directory(&self) -> io::Result<()> {
        let fd = try!(File::open(&self.root_path));
        fd.sync_all()
    }
}
////////////////////////////////////////////////////////////////
// RAMDirectory
/// Growable byte buffer with a write position, shared between its clones.
#[derive(Clone)]
struct SharedVec(Arc<RwLock<Cursor<Vec<u8>>>>);

/// Directory keeping all of its files in memory.
pub struct RAMDirectory {
    /// File contents keyed by path.
    fs: HashMap<PathBuf, SharedVec>,
}

impl SharedVec {
    /// Creates an empty buffer positioned at offset 0.
    fn new() -> SharedVec {
        let cursor = Cursor::new(Vec::new());
        SharedVec(Arc::new(RwLock::new(cursor)))
    }
}

impl Write for SharedVec {
    /// Writes `buf` at the current position and reports it fully written.
    ///
    /// Panics if the lock is poisoned.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let mut cursor = self.0.write().unwrap();
        cursor.write(buf).map(|_| buf.len())
    }

    /// Nothing is buffered, so flushing is a no-op.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

impl Seek for SharedVec {
    /// Moves the shared write position.
    ///
    /// Panics if the lock is poisoned.
    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
        let mut cursor = self.0.write().unwrap();
        cursor.seek(pos)
    }
}
impl fmt::Debug for RAMDirectory {
    /// Formats as the constant string `RAMDirectory`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "RAMDirectory")
    }
}

impl RAMDirectory {
    /// Creates an empty in-memory directory.
    pub fn create() -> RAMDirectory {
        let files = HashMap::new();
        RAMDirectory { fs: files }
    }
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource> {
match self.fs.get(path) {
Some(ref data) => {
let data_copy = (*data).0.read().unwrap().clone();
Ok(ReadOnlySource::Anonymous(data_copy.into_inner()))
},
None =>
Err(io::Error::new(io::ErrorKind::NotFound, format!("File has never been created. {:?}", path)))
}
}
fn open_write(&mut self, path: &Path) -> io::Result<WritePtr> {
let full_path = PathBuf::from(&path);
let data = SharedVec::new();
self.fs.insert(full_path, data.clone());
Ok(Box::new(data))
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let meta_file = atomicwrites::AtomicFile::new(PathBuf::from(path), atomicwrites::AllowOverwrite);
meta_file.write(|f| {
f.write_all(data)
})
}
fn sync(&self, _: &Path) -> io::Result<()> {
Ok(())
}
fn sync_directory(&self,) -> io::Result<()> {
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    #[test]
    fn test_ram_directory() {
        let mut directory = RAMDirectory::create();
        test_directory(&mut directory);
    }

    #[test]
    fn test_mmap_directory() {
        let mut directory = MmapDirectory::create_from_tempdir().unwrap();
        test_directory(&mut directory);
    }

    /// Writes five bytes in three calls, then checks they can be read back.
    fn test_directory(directory: &mut Directory) {
        let path = Path::new("toto");
        {
            // The writer is dropped at the end of this scope, before reading.
            let mut write_file = directory.open_write(path).unwrap();
            write_file.write_all(&[4]).unwrap();
            write_file.write_all(&[3]).unwrap();
            write_file.write_all(&[7, 3, 5]).unwrap();
        }
        let read_file = directory.open_read(path).unwrap();
        let data: &[u8] = &*read_file;
        let expected: &[u8] = &[4, 3, 7, 3, 5];
        assert_eq!(data, expected);
    }
}

View File

@@ -1,112 +0,0 @@
use std::num::Wrapping;
// ported from libdivide.h by ridiculous_fish
// Mask selecting the shift amount stored in the low five bits of `DividerU32::more`.
const LIBDIVIDE_32_SHIFT_MASK: u8 = 0x1F;
// Flag set in `more` when `DividerU32::divide` must take the extra add-and-halve step.
const LIBDIVIDE_ADD_MARKER: u8 = 0x40;
// Flag set in `more` when the divisor is a power of two, so dividing is a plain right shift.
const LIBDIVIDE_U32_SHIFT_PATH: u8 = 0x80;
/// Number of zero bits above the most significant set bit of `val`.
///
/// Returns 32 when `val` is 0.
pub fn count_leading_zeros(val: u32) -> u8 {
    val.leading_zeros() as u8
}
/// Number of zero bits below the least significant set bit of `val`.
///
/// Returns 32 when `val` is 0, mirroring `count_leading_zeros`. The standard
/// library routine is used because the `val - 1` trick underflows for 0.
pub fn count_trailing_zeros(val: u32) -> u8 {
    val.trailing_zeros() as u8
}
/// Precomputed state for dividing `u32` values by a fixed divisor.
#[derive(Debug)]
pub struct DividerU32 {
    /// Multiplier used on the multiply-and-shift path; 0 on the pure shift path.
    magic: u32,
    /// Shift amount in the low five bits, combined with the path flags.
    more: u8,
}
/// Divides the 64-bit `n` by the 32-bit `d`.
///
/// Returns `(quotient, remainder)`, each truncated to 32 bits.
/// Panics if `d` is 0.
fn divide_64_div_32_to_32(n: u64, d: u32) -> (u32, u32) {
    let divisor = d as u64;
    let quotient = n / divisor;
    let Wrapping(remainder) = Wrapping(n) - Wrapping(quotient) * Wrapping(divisor);
    (quotient as u32, remainder as u32)
}
impl DividerU32 {
    /// Precomputes the constants needed to divide by `d`.
    ///
    /// Powers of two (including 1) are reduced to a right shift; every other
    /// divisor gets a 32-bit magic multiplier plus a shift, with an extra
    /// add step when the multiplier needs a 33rd bit.
    ///
    /// # Panics
    ///
    /// Panics if `d` is 0. Without this check `d - 1` underflows: a debug
    /// build panics with an unrelated overflow message and a release build
    /// silently returns a divider producing meaningless results.
    pub fn divide_by(d: u32) -> DividerU32 {
        assert!(d != 0, "DividerU32 cannot divide by zero");
        if (d & (d - 1)) == 0 {
            // Power of two: only the shift amount is needed.
            DividerU32 {
                magic: 0,
                more: count_trailing_zeros(d) | LIBDIVIDE_U32_SHIFT_PATH,
            }
        }
        else {
            let floor_log_2_d: u8 = 31 - count_leading_zeros(d);
            let more: u8;
            // proposed_m = floor(2^(32 + floor_log_2_d) / d)
            let (mut proposed_m, rem) = divide_64_div_32_to_32((1u64 << floor_log_2_d) << 32, d);
            assert!(rem > 0 && rem < d);
            let e = d - rem;
            if e < (1u32 << floor_log_2_d) {
                // The 32-bit multiplier is precise enough.
                more = floor_log_2_d;
            }
            else {
                // One more bit of precision is required: double the multiplier
                // (its top bit is dropped on purpose) and mark the add path.
                proposed_m = proposed_m << 1;
                let twice_rem: u32 = rem * 2;
                if twice_rem >= d || twice_rem < rem {
                    proposed_m += 1;
                }
                more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
            }
            DividerU32 {
                magic: 1 + proposed_m,
                more: more,
            }
        }
    }

    /// Returns `n / d`, where `d` is the divisor given to `divide_by`.
    pub fn divide(&self, n: u32) -> u32 {
        if self.more & LIBDIVIDE_U32_SHIFT_PATH != 0 {
            n >> (self.more & LIBDIVIDE_32_SHIFT_MASK)
        }
        else {
            // High 32 bits of magic * n.
            let q_shifted = (self.magic as u64) * (n as u64);
            let q = (q_shifted >> 32) as u32;
            if self.more & LIBDIVIDE_ADD_MARKER != 0 {
                // Computes (n + q) / 2 without overflowing 32 bits.
                let t = ((n - q) >> 1) + q;
                t >> (self.more & LIBDIVIDE_32_SHIFT_MASK)
            }
            else {
                q >> self.more
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::DividerU32;

    /// Compares the precomputed division with `/` for small divisors.
    #[test]
    fn test_libdivide() {
        for divisor in 1..32 {
            let divider = DividerU32::divide_by(divisor);
            for numerator in 0..100_000 {
                let expected = numerator / divisor;
                assert_eq!(divider.divide(numerator), expected);
            }
        }
    }
}

View File

@@ -1,512 +0,0 @@
use std::io::Write;
use std::io;
use std::io::SeekFrom;
use std::io::Seek;
use core::directory::WritePtr;
use core::serialize::BinarySerializable;
use core::directory::ReadOnlySource;
use std::collections::HashMap;
use core::schema::DocId;
use core::schema::Schema;
use core::schema::Document;
use std::ops::Deref;
use core::fastdivide::count_leading_zeros;
use core::fastdivide::DividerU32;
use core::schema::U32Field;
/// Number of bits needed to represent every value in `0..=amplitude`.
///
/// Returns 0 when `amplitude` is 0.
pub fn compute_num_bits(amplitude: u32) -> u8 {
    let unused_high_bits: u8 = count_leading_zeros(amplitude);
    32u8 - unused_high_bits
}
/// Writes u32 fast fields to a single seekable output.
///
/// As written by this type, the output starts with a `u32` slot holding the
/// offset of the field table, followed for each field by its minimum value,
/// its amplitude (`max - min`) and the bit-packed values, and ends with the
/// field table itself.
pub struct FastFieldSerializer {
    /// Destination of the serialized data.
    write: WritePtr,
    /// Sum of the sizes reported by `serialize` calls so far.
    written_size: usize,
    /// Fields written so far, with the offset at which each one starts.
    fields: Vec<(U32Field, u32)>,
    /// Number of bits used per value of the field being written.
    num_bits: u8,
    /// Minimum value of the field being written; subtracted from every value.
    min_value: u32,
    /// Whether a field has been opened and not closed yet.
    field_open: bool,
    /// Number of bits of `mini_buffer` already used.
    mini_buffer_written: usize,
    /// 64-bit word in which values are packed before being written out.
    mini_buffer: u64,
}

impl FastFieldSerializer {
    /// Creates a serializer and reserves room for the field table offset.
    pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
        // just making room for the pointer to header.
        let written_size: usize = try!(0u32.serialize(&mut write));
        Ok(FastFieldSerializer {
            write: write,
            written_size: written_size,
            fields: Vec::new(),
            num_bits: 0u8,
            field_open: false,
            mini_buffer_written: 0,
            mini_buffer: 0,
            min_value: 0,
        })
    }

    /// Starts a new field whose values all lie in `min_value..=max_value`.
    ///
    /// Fails if the previous field has not been closed.
    /// Panics on overflow (debug builds) if `max_value < min_value`.
    pub fn new_u32_fast_field(&mut self, field: U32Field, min_value: u32, max_value: u32) -> io::Result<()> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
        }
        self.min_value = min_value;
        self.field_open = true;
        self.fields.push((field, self.written_size as u32));
        let write: &mut Write = &mut self.write;
        self.written_size += try!(min_value.serialize(write));
        let amplitude = max_value - min_value;
        self.written_size += try!(amplitude.serialize(write));
        self.num_bits = compute_num_bits(amplitude);
        Ok(())
    }

    /// Appends one value to the current field.
    ///
    /// The value is stored as `val - min_value` on `num_bits` bits; the
    /// 64-bit word is written out once the next value would not fit in it.
    pub fn add_val(&mut self, val: u32) -> io::Result<()> {
        let write: &mut Write = &mut self.write;
        if self.mini_buffer_written + (self.num_bits as usize) > 64 {
            self.written_size += try!(self.mini_buffer.serialize(write));
            self.mini_buffer = 0;
            self.mini_buffer_written = 0;
        }
        self.mini_buffer |= ((val - self.min_value) as u64) << self.mini_buffer_written;
        self.mini_buffer_written += self.num_bits as usize;
        Ok(())
    }

    /// Ends the current field, writing out the partially filled word if any.
    ///
    /// Fails if no field is open.
    pub fn close_field(&mut self,) -> io::Result<()> {
        if !self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
        }
        self.field_open = false;
        if self.mini_buffer_written > 0 {
            self.mini_buffer_written = 0;
            self.written_size += try!(self.mini_buffer.serialize(&mut self.write));
        }
        self.mini_buffer = 0;
        Ok(())
    }

    /// Writes the field table, patches its offset into the first bytes of the
    /// output and flushes the writer.
    ///
    /// Returns the accumulated written size. Fails if a field is still open.
    pub fn close(mut self,) -> io::Result<usize> {
        if self.field_open {
            return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
        }
        let header_offset: usize = self.written_size;
        self.written_size += try!(self.fields.serialize(&mut self.write));
        try!(self.write.seek(SeekFrom::Start(0)));
        try!((header_offset as u32).serialize(&mut self.write));
        // Flush explicitly: a buffered writer would otherwise only flush when
        // dropped, where a failure cannot be reported to the caller.
        try!(self.write.flush());
        Ok(self.written_size)
    }
}
/// Accumulates the values of several u32 fast fields.
pub struct U32FastFieldsWriter {
    /// One writer per fast field.
    field_writers: Vec<U32FastFieldWriter>,
}

impl U32FastFieldsWriter {
    /// Creates a writer for every u32 field of `schema` flagged as fast.
    ///
    /// The field id is the position of the field among the schema's u32 fields.
    pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
        let mut fast_fields: Vec<U32Field> = Vec::new();
        for (field_id, field_entry) in schema.get_u32_fields().iter().enumerate() {
            if field_entry.option.is_fast() {
                fast_fields.push(U32Field(field_id as u8));
            }
        }
        U32FastFieldsWriter::new(fast_fields)
    }

    /// Creates one writer per given field.
    pub fn new(fields: Vec<U32Field>) -> U32FastFieldsWriter {
        let mut field_writers = Vec::with_capacity(fields.len());
        for field in fields.iter() {
            field_writers.push(U32FastFieldWriter::new(field));
        }
        U32FastFieldsWriter { field_writers: field_writers }
    }

    /// Records the values of `doc` in every field writer.
    pub fn add_document(&mut self, doc: &Document) {
        for field_writer in &mut self.field_writers {
            field_writer.add_document(doc);
        }
    }

    /// Serializes every field in turn, stopping at the first error.
    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
        for field_writer in &self.field_writers {
            try!(field_writer.serialize(serializer));
        }
        Ok(())
    }
}
/// Accumulates the values of a single u32 fast field, one per document.
pub struct U32FastFieldWriter {
    /// Field being written.
    field: U32Field,
    /// Values in document order.
    vals: Vec<u32>,
}

impl U32FastFieldWriter {
    /// Creates an empty writer for `field`.
    pub fn new(field: &U32Field) -> U32FastFieldWriter {
        U32FastFieldWriter {
            vals: Vec::new(),
            field: field.clone(),
        }
    }

    /// Appends a value.
    pub fn add_val(&mut self, val: u32) {
        self.vals.push(val);
    }

    /// Appends the document's value for this field, or 0 if it has none.
    pub fn add_document(&mut self, doc: &Document) {
        let val = match doc.get_u32(&self.field) {
            Some(val) => val,
            None => 0u32,
        };
        self.add_val(val);
    }

    /// Writes the field through `serializer`: header, values, then close.
    ///
    /// An empty field is written with a minimum and a maximum of 0.
    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
        let min = self.vals.iter().cloned().min().unwrap_or(0);
        let max = self.vals.iter().cloned().max().unwrap_or(min);
        try!(serializer.new_u32_fast_field(self.field.clone(), min, max));
        for &val in self.vals.iter() {
            try!(serializer.add_val(val));
        }
        serializer.close_field()
    }
}
/// Random-access reader over the bit-packed values of one u32 fast field.
pub struct U32FastFieldReader {
    /// Owns the bytes `data_ptr` points into.
    _data: ReadOnlySource,
    /// Start of the packed 64-bit words, 8 bytes into `_data`.
    /// The address is not necessarily aligned for `u64`.
    data_ptr: *const u64,
    /// Smallest value of the field.
    min_val: u32,
    /// Largest value of the field (`min_val + amplitude`).
    max_val: u32,
    /// Number of bits used per value; 0 when all values are equal.
    num_bits: u8,
    /// Mask keeping the low `num_bits` bits.
    mask: u32,
    /// Number of values packed in each 64-bit word.
    num_in_pack: u32,
    /// Precomputed division by `num_in_pack`.
    divider: DividerU32,
}

impl U32FastFieldReader {
    /// Smallest value of the field.
    pub fn min_val(&self,) -> u32 {
        self.min_val
    }

    /// Largest value of the field.
    pub fn max_val(&self,) -> u32 {
        self.max_val
    }

    /// Opens a field from its serialized bytes: minimum value, amplitude,
    /// then the packed words.
    ///
    /// Fails if the two header values cannot be read.
    pub fn open(data: ReadOnlySource) -> io::Result<U32FastFieldReader> {
        let min_val;
        let amplitude;
        {
            let mut cursor = data.cursor();
            min_val = try!(u32::deserialize(&mut cursor));
            amplitude = try!(u32::deserialize(&mut cursor));
        }
        if data.len() < 8 {
            return Err(io::Error::new(io::ErrorKind::InvalidData,
                                      "Fast field data is shorter than its header"));
        }
        let num_bits = compute_num_bits(amplitude);
        // Computed on 64 bits: a 32-bit `1 << 32` overflows when num_bits is 32.
        let mask = ((1u64 << num_bits) - 1) as u32;
        // A constant field uses 0 bits per value and stores no packed word;
        // `get` never divides in that case, any non-zero value works here.
        let num_in_pack = if num_bits == 0 {
            64u32
        } else {
            64u32 / (num_bits as u32)
        };
        // Slicing (rather than indexing byte 8) stays valid when no packed
        // word follows the header.
        let ptr: *const u8 = data.deref()[8..].as_ptr();
        Ok(U32FastFieldReader {
            _data: data,
            data_ptr: ptr as *const u64,
            min_val: min_val,
            max_val: min_val + amplitude,
            num_bits: num_bits,
            mask: mask,
            num_in_pack: num_in_pack,
            divider: DividerU32::divide_by(num_in_pack),
        })
    }

    /// Returns the value stored for `doc`.
    ///
    /// `doc` must be one of the documents serialized in this field: the
    /// lookup is unchecked in release builds.
    ///
    /// NOTE(review): the packed word is read in native byte order — confirm
    /// it matches the byte order used by `BinarySerializable` for `u64`.
    pub fn get(&self, doc: DocId) -> u32 {
        if self.num_bits == 0 {
            // Every value equals the minimum and no packed data exists.
            return self.min_val;
        }
        let long_addr = self.divider.divide(doc);
        let ord_within_long = doc - long_addr * self.num_in_pack;
        let bit_shift = (self.num_bits as u32) * ord_within_long;
        debug_assert!(8 + ((long_addr as usize) + 1) * 8 <= self._data.len(),
                      "doc id out of the fast field's range");
        let mut val_unshifted_unmasked: u64 = 0;
        // The packed words are not guaranteed to be 8-byte aligned inside the
        // source, so the word is copied byte-wise instead of dereferencing a
        // `*const u64`.
        unsafe {
            ::std::ptr::copy_nonoverlapping(
                self.data_ptr.offset(long_addr as isize) as *const u8,
                &mut val_unshifted_unmasked as *mut u64 as *mut u8,
                8);
        }
        let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
        self.min_val + (val_shifted & self.mask)
    }
}
pub struct U32FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<U32Field, (u32, u32)>,
}
impl U32FastFieldsReader {
pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(U32Field, u32)>;
{
let mut cursor = source.cursor();
header_offset = try!(u32::deserialize(&mut cursor));
try!(cursor.seek(SeekFrom::Start(header_offset as u64)));
field_offsets = try!(Vec::deserialize(&mut cursor));
}
let mut end_offsets: Vec<u32> = field_offsets
.iter()
.map(|&(_, offset)| offset.clone())
.collect();
end_offsets.push(header_offset);
let mut field_offsets_map: HashMap<U32Field, (u32, u32)> = HashMap::new();
for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
let (field, start_offset) = field_start_offsets.clone();
field_offsets_map.insert(field.clone(), (start_offset.clone(), stop_offset.clone()));
}
Ok(U32FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
pub fn get_field(&self, field: &U32Field) -> io::Result<U32FastFieldReader> {
match self.field_offsets.get(field) {
Some(&(start, stop)) => {
let field_source = self.source.slice(start as usize, stop as usize);
U32FastFieldReader::open(field_source)
}
None => {
Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field, has it been set as a fast field?"))
}
}
}
}
#[cfg(test)]
mod tests {
    use super::compute_num_bits;
    use super::U32FastFieldsReader;
    use super::U32FastFieldsWriter;
    use core::schema::U32Field;
    use std::path::Path;
    use core::directory::WritePtr;
    use core::directory::Directory;
    use core::schema::Document;
    use core::directory::RAMDirectory;
    use core::schema::Schema;
    use core::schema::FAST_U32;
    use core::fastfield::FastFieldSerializer;
    use test::Bencher;
    use test;
    use rand::Rng;
    use rand::SeedableRng;
    use rand::XorShiftRng;

    #[test]
    fn test_compute_num_bits() {
        assert_eq!(compute_num_bits(1), 1u8);
        assert_eq!(compute_num_bits(0), 0u8);
        assert_eq!(compute_num_bits(2), 2u8);
        assert_eq!(compute_num_bits(3), 2u8);
        assert_eq!(compute_num_bits(4), 3u8);
        assert_eq!(compute_num_bits(255), 8u8);
        assert_eq!(compute_num_bits(256), 9u8);
    }

    /// Adds a document holding only `value` for `field`.
    fn add_single_field_doc(fast_field_writers: &mut U32FastFieldsWriter, field: &U32Field, value: u32) {
        let mut doc = Document::new();
        doc.set_u32(field, value);
        fast_field_writers.add_document(&doc);
    }

    /// Serializes `values`, one document per value, as the fast field `field`
    /// into the file `path` of `directory`.
    ///
    /// The writer is closed before returning, so the file can be read back
    /// right away.
    fn write_fast_field_file(directory: &mut RAMDirectory,
                             path: &Path,
                             schema: &Schema,
                             field: &U32Field,
                             values: &[u32]) {
        let write: WritePtr = directory.open_write(path).unwrap();
        let mut serializer = FastFieldSerializer::new(write).unwrap();
        let mut fast_field_writers = U32FastFieldsWriter::from_schema(schema);
        for value in values.iter() {
            add_single_field_doc(&mut fast_field_writers, field, *value);
        }
        fast_field_writers.serialize(&mut serializer).unwrap();
        serializer.close().unwrap();
    }

    #[test]
    fn test_intfastfield_small() {
        let path = Path::new("test");
        let mut directory: RAMDirectory = RAMDirectory::create();
        let mut schema = Schema::new();
        let field = schema.add_u32_field("field", FAST_U32);
        let values = [13u32, 14u32, 2u32];
        write_fast_field_file(&mut directory, path, &schema, &field, &values);
        let source = directory.open_read(&path).unwrap();
        {
            assert_eq!(source.len(), 29 as usize);
        }
        {
            let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
            let fast_field_reader = fast_field_readers.get_field(&field).unwrap();
            for (doc, expected) in values.iter().enumerate() {
                assert_eq!(fast_field_reader.get(doc as u32), *expected);
            }
        }
    }

    #[test]
    fn test_intfastfield_large() {
        let path = Path::new("test");
        let mut directory: RAMDirectory = RAMDirectory::create();
        let mut schema = Schema::new();
        let field = schema.add_u32_field("field", FAST_U32);
        let values = [4u32,
                      14_082_001u32,
                      3_052u32,
                      9002u32,
                      15_001u32,
                      777u32,
                      1_002u32,
                      1_501u32,
                      215u32];
        write_fast_field_file(&mut directory, path, &schema, &field, &values);
        let source = directory.open_read(&path).unwrap();
        {
            assert_eq!(source.len(), 61 as usize);
        }
        {
            let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
            let fast_field_reader = fast_field_readers.get_field(&field).unwrap();
            for (doc, expected) in values.iter().enumerate() {
                assert_eq!(fast_field_reader.get(doc as u32), *expected);
            }
        }
    }

    /// Deterministic shuffle of `0..1_000_000`.
    fn generate_permutation() -> Vec<u32> {
        let seed: &[u32; 4] = &[1, 2, 3, 4];
        let mut rng = XorShiftRng::from_seed(*seed);
        let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
        rng.shuffle(&mut permutation);
        permutation
    }

    #[test]
    fn test_intfastfield_permutation() {
        let path = Path::new("test");
        let permutation = generate_permutation();
        let n = permutation.len();
        let mut directory = RAMDirectory::create();
        let mut schema = Schema::new();
        let field = schema.add_u32_field("field", FAST_U32);
        write_fast_field_file(&mut directory, path, &schema, &field, &permutation);
        let source = directory.open_read(&path).unwrap();
        {
            let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
            let fast_field_reader = fast_field_readers.get_field(&field).unwrap();
            // Follow the permutation from 0, checking every lookup on the way.
            let mut a = 0u32;
            for _ in 0..n {
                assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
                a = fast_field_reader.get(a as u32);
            }
        }
    }

    #[bench]
    fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
        let permutation = generate_permutation();
        b.iter(|| {
            let n = test::black_box(7000u32);
            let mut a = 0u32;
            for i in (0u32..n).step_by(7) {
                a ^= permutation[i as usize];
            }
            a
        });
    }

    #[bench]
    fn bench_intfastfield_veclookup(b: &mut Bencher) {
        let permutation = generate_permutation();
        b.iter(|| {
            let n = test::black_box(1000u32);
            let mut a = 0u32;
            for _ in 0u32..n {
                a = permutation[a as usize];
            }
            a
        });
    }

    #[bench]
    fn bench_intfastfield_linear_fflookup(b: &mut Bencher) {
        let path = Path::new("test");
        let permutation = generate_permutation();
        let mut directory: RAMDirectory = RAMDirectory::create();
        let mut schema = Schema::new();
        let field = schema.add_u32_field("field", FAST_U32);
        write_fast_field_file(&mut directory, path, &schema, &field, &permutation);
        let source = directory.open_read(&path).unwrap();
        {
            let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
            let fast_field_reader = fast_field_readers.get_field(&field).unwrap();
            b.iter(|| {
                let n = test::black_box(7000u32);
                let mut a = 0u32;
                for i in (0u32..n).step_by(7) {
                    a ^= fast_field_reader.get(i);
                }
                a
            });
        }
    }

    #[bench]
    fn bench_intfastfield_fflookup(b: &mut Bencher) {
        let path = Path::new("test");
        let permutation = generate_permutation();
        let mut directory: RAMDirectory = RAMDirectory::create();
        let mut schema = Schema::new();
        let field = schema.add_u32_field("field", FAST_U32);
        write_fast_field_file(&mut directory, path, &schema, &field, &permutation);
        let source = directory.open_read(&path).unwrap();
        {
            let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
            let fast_field_reader = fast_field_readers.get_field(&field).unwrap();
            b.iter(|| {
                let n = test::black_box(1000u32);
                let mut a = 0u32;
                for _ in 0u32..n {
                    a = fast_field_reader.get(a);
                }
                a
            });
        }
    }
}

View File

@@ -1,154 +0,0 @@
use std::io;
use std::io::Seek;
use std::io::Write;
use std::io::Cursor;
use fst;
use fst::raw::Fst;
use fst::Streamer;
use core::directory::ReadOnlySource;
use core::serialize::BinarySerializable;
use std::marker::PhantomData;
/// Wraps an fst error into an `io::Error` of kind `Other`.
fn convert_fst_error(e: fst::Error) -> io::Error {
    let kind = io::ErrorKind::Other;
    io::Error::new(kind, e)
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()>{
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
try!(value.serialize(&mut self.data));
Ok(())
}
pub fn finish(self,) -> io::Result<W> {
let mut file = try!(
self.fst_builder
.into_inner()
.map_err(convert_fst_error));
let footer_size = self.data.len() as u32;
try!(file.write_all(&self.data));
try!((footer_size as u32).serialize(&mut file));
try!(file.flush());
Ok(file)
}
}
/// Read-only map from byte keys to deserialized values of type `V`.
pub struct FstMap<V: BinarySerializable> {
    /// Maps each key to the offset of its value in `values_mmap`.
    fst_index: fst::Map,
    /// Serialized values.
    values_mmap: ReadOnlySource,
    _phantom_: PhantomData<V>,
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => try!(Fst::from_bytes(data).map_err(convert_fst_error)),
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
}))
}
/// Streams the `(key, value)` pairs of an `FstMap`.
pub struct FstMapIter<'a, V: 'static + BinarySerializable> {
    /// Stream over the `(key, offset)` pairs of the fst.
    streamer: fst::map::Stream<'a>,
    /// Map the values are read from.
    fst_map: &'a FstMap<V>,
    __phantom__: PhantomData<V>
}

impl<'a, V: 'static + BinarySerializable> FstMapIter<'a, V> {
    /// Returns the next key with its deserialized value, or `None` at the end.
    ///
    /// Panics if the value cannot be deserialized.
    pub fn next(&mut self) -> Option<(&[u8], V)> {
        // Copied out so the closure does not borrow `self` while the stream
        // is mutably borrowed.
        let fst_map = self.fst_map;
        self.streamer
            .next()
            .map(|(key, offset)| (key, fst_map.read_value(offset)))
    }
}
impl<V: BinarySerializable> FstMap<V> {
    /// Returns a cursor over all `(key, value)` pairs of the map.
    pub fn stream<'a>(&'a self,) -> FstMapIter<'a, V> {
        FstMapIter {
            streamer: self.fst_index.stream(),
            fst_map: self,
            __phantom__: PhantomData,
        }
    }

    /// Opens a map from the layout written by `FstMapBuilder::finish`:
    /// the fst, then the serialized values, then a 4-byte `u32` footer
    /// holding the length of the value block.
    ///
    /// # Errors
    /// Fails if the footer cannot be read, if the footer declares more value
    /// bytes than the source contains (`InvalidData`), or if the fst itself
    /// cannot be opened.
    pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
        let mut cursor = Cursor::new(source.as_slice());
        try!(cursor.seek(io::SeekFrom::End(-4)));
        let footer_size = try!(u32::deserialize(&mut cursor)) as usize;
        // The footer comes from the file: validate it instead of letting the
        // subtraction underflow on a truncated or corrupt source.
        let values_end = source.len().checked_sub(4);
        let split_len = values_end.and_then(|end| end.checked_sub(footer_size));
        let (split_len, values_end) = match (split_len, values_end) {
            (Some(split_len), Some(values_end)) => (split_len, values_end),
            _ => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "FstMap footer is inconsistent with the size of its source"));
            }
        };
        let fst_source = source.slice(0, split_len);
        let values_source = source.slice(split_len, values_end);
        let fst_index = try!(open_fst_index(fst_source));
        Ok(FstMap {
            fst_index: fst_index,
            values_mmap: values_source,
            _phantom_: PhantomData,
        })
    }

    /// Deserializes the value that starts `offset` bytes into the value block.
    ///
    /// # Panics
    /// Panics if `offset` lies past the end of the value block or if the
    /// bytes found there cannot be deserialized as a `V`.
    fn read_value(&self, offset: u64) -> V {
        let buffer = self.values_mmap.as_slice();
        let mut cursor = Cursor::new(&buffer[(offset as usize)..]);
        V::deserialize(&mut cursor).unwrap()
    }

    /// Returns the value associated with `key`, if the key is present.
    ///
    /// # Panics
    /// Panics under the same conditions as `read_value` when the stored
    /// offset or value bytes are invalid.
    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
        self.fst_index
            .get(key)
            .map(|offset| self.read_value(offset))
    }
}
#[cfg(test)]
mod tests {
use super::*;
use core::directory::{RAMDirectory, Directory};
use std::path::PathBuf;
use fst::Streamer;
// Round trip: build a two-entry map in a RAM directory, reopen it, then
// check both point lookups and ordered streaming.
#[test]
fn test_fstmap() {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("fstmap");
{
// Scoped so that the writer is finished before the file is read back.
let write = directory.open_write(&path).unwrap();
let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
fstmap_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
assert_eq!(fstmap.get("abc"), Some(34u32));
assert_eq!(fstmap.get("abcd"), Some(346u32));
let mut stream = fstmap.stream();
assert_eq!(stream.next().unwrap(), ("abc".as_bytes(), 34u32));
assert_eq!(stream.next().unwrap(), ("abcd".as_bytes(), 346u32));
assert_eq!(stream.next(), None);
}
}

View File

@@ -1,14 +1,14 @@
use std::path::{PathBuf, Path};
use std::io;
use core::schema::Schema;
use core::schema::DocId;
use schema::Schema;
use DocId;
use std::io::Write;
use std::sync::{Arc, RwLock, RwLockWriteGuard, RwLockReadGuard};
use std::fmt;
use rustc_serialize::json;
use std::io::Read;
use std::io::ErrorKind as IOErrorKind;
use core::directory::{Directory, MmapDirectory, RAMDirectory, ReadOnlySource, WritePtr};
use directory::{Directory, MmapDirectory, RAMDirectory, ReadOnlySource, WritePtr};
use core::writer::IndexWriter;
use core::searcher::Searcher;
use uuid::Uuid;
@@ -238,7 +238,7 @@ pub struct SegmentInfo {
pub enum SegmentComponent {
INFO,
POSTINGS,
// POSITIONS,
POSITIONS,
FASTFIELDS,
TERMS,
STORE,
@@ -264,7 +264,7 @@ impl Segment {
fn path_suffix(component: &SegmentComponent)-> &'static str {
match *component {
// SegmentComponent::POSITIONS => ".pos",
SegmentComponent::POSITIONS => ".pos",
SegmentComponent::INFO => ".info",
SegmentComponent::POSTINGS => ".idx",
SegmentComponent::TERMS => ".term",

View File

@@ -1,22 +1,20 @@
use std::io;
use core::reader::SegmentReader;
use core::index::Segment;
use core::schema::DocId;
use DocId;
use core::index::SerializableSegment;
use core::codec::SegmentSerializer;
use core::postings::PostingsSerializer;
use core::postings::TermInfo;
use postings::PostingsSerializer;
use postings::TermInfo;
use std::collections::BinaryHeap;
use core::fstmap::FstMapIter;
use core::schema::Term;
use core::schema::Schema;
use core::fastfield::FastFieldSerializer;
use core::store::StoreWriter;
use datastruct::FstMapIter;
use schema::{Term, Schema, U32Field};
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use core::index::SegmentInfo;
use std::cmp::Ordering;
use core::schema::U32Field;
use std::cmp::min;
use std::cmp::max;
use std::cmp::{min, max, Ordering};
struct PostingsMerger<'a> {
doc_ids: Vec<DocId>,
@@ -181,7 +179,9 @@ impl IndexMerger {
match postings_merger.next() {
Some((term, doc_ids)) => {
try!(postings_serializer.new_term(&Term::from(&term), doc_ids.len() as DocId));
try!(postings_serializer.write_docs(doc_ids));
for doc_id in doc_ids.iter() {
try!(postings_serializer.write_doc(doc_id.clone(), None));
}
}
None => { break; }
}
@@ -210,13 +210,13 @@ impl SerializableSegment for IndexMerger {
#[cfg(test)]
mod tests {
use core::schema;
use core::schema::Document;
use schema;
use schema::Document;
use schema::Term;
use core::index::Index;
use core::schema::Term;
use core::searcher::DocAddress;
use core::collector::FastFieldTestCollector;
use core::collector::TestCollector;
use collector::FastFieldTestCollector;
use collector::TestCollector;
#[test]
fn test_index_merger() {

View File

@@ -1,21 +1,9 @@
pub mod postings;
pub mod schema;
pub mod directory;
pub mod writer;
pub mod analyzer;
pub mod reader;
pub mod codec;
pub mod searcher;
pub mod collector;
pub mod serialize;
pub mod store;
pub mod simdcompression;
pub mod fstmap;
pub mod index;
pub mod fastfield;
pub mod fastdivide;
pub mod merger;
pub mod timer;
use std::error;
use std::io;

View File

@@ -1,279 +0,0 @@
use core::schema::DocId;
use std::ptr;
use core::schema::Term;
use core::fstmap::FstMapBuilder;
use core::index::Segment;
use core::directory::WritePtr;
use core::index::SegmentComponent;
use core::simdcompression;
use core::serialize::BinarySerializable;
use std::io::{Read, Write};
use std::io;
use std::collections::HashMap;
/// Per-term metadata stored in the term dictionary.
#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)]
pub struct TermInfo {
    /// Number of documents recorded for the term.
    pub doc_freq: u32,
    /// Byte offset of the term's postings, as counted by the serializer
    /// that wrote them.
    pub postings_offset: u32,
}

impl BinarySerializable for TermInfo {
    /// Writes `doc_freq` then `postings_offset`; returns the bytes written.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let mut written = try!(self.doc_freq.serialize(writer));
        written += try!(self.postings_offset.serialize(writer));
        Ok(written)
    }

    /// Reads the two fields back in the order `serialize` wrote them.
    fn deserialize(reader: &mut Read) -> io::Result<Self> {
        let doc_freq = try!(u32::deserialize(reader));
        let postings_offset = try!(u32::deserialize(reader));
        Ok(TermInfo {
            doc_freq: doc_freq,
            postings_offset: postings_offset,
        })
    }
}
/// In-memory accumulator of postings lists, one list of doc ids per term.
pub struct PostingsWriter {
    // `postings[i]` holds the doc ids of the term whose index is `i`.
    postings: Vec<Vec<DocId>>,
    // Maps each term to the index of its list in `postings`.
    term_index: HashMap<Term, usize>,
}

impl PostingsWriter {
    /// Creates an empty writer.
    pub fn new() -> PostingsWriter {
        PostingsWriter {
            postings: Vec::new(),
            term_index: HashMap::new(),
        }
    }

    /// Records that `doc` contains `term`.
    ///
    /// The doc id is appended only when the term's list is empty or when
    /// `doc` is greater than the last id recorded, so repeated or
    /// out-of-order ids are ignored.
    pub fn suscribe(&mut self, doc: DocId, term: Term) {
        let doc_ids = self.get_term_postings(term);
        let is_new_doc = doc_ids.last().map_or(true, |&last_doc| last_doc < doc);
        if is_new_doc {
            doc_ids.push(doc);
        }
    }

    /// Returns the postings list of `term`, creating an empty one the first
    /// time the term is seen.
    fn get_term_postings(&mut self, term: Term) -> &mut Vec<DocId> {
        // Index a brand new term would receive: terms are numbered in the
        // order in which they are first seen.
        let next_id = self.term_index.len();
        let postings = &mut self.postings;
        let unord_id = *self.term_index.entry(term).or_insert_with(|| {
            postings.push(Vec::new());
            next_id
        });
        &mut postings[unord_id]
    }

    /// Feeds every term, in sorted term order, to `serializer` together
    /// with its doc ids.
    pub fn serialize(&self, serializer: &mut PostingsSerializer) -> io::Result<()> {
        let mut sorted_terms: Vec<(&Term, &usize)> = self.term_index.iter().collect();
        sorted_terms.sort();
        for (term, postings_id) in sorted_terms {
            let doc_ids = &self.postings[*postings_id];
            try!(serializer.new_term(term, doc_ids.len() as u32));
            try!(serializer.write_docs(doc_ids));
        }
        Ok(())
    }
}
//////////////////////////////////
/// A postings list: an iterator over doc ids that can also skip forward.
pub trait Postings: Iterator<Item=DocId> {
/// Advances the postings towards `target`.
///
/// `IntersectionPostings` relies on the returned value being the first
/// remaining doc id that is greater than or equal to `target`, or `None`
/// once the postings are exhausted.
///
/// NOTE(review): the previous comment described the position left for the
/// following `next()` call instead; the test implementation in this file
/// consumes the doc id it returns. Confirm the intended contract.
fn skip_next(&mut self, target: DocId) -> Option<DocId>;
}
/// Iterator over the doc ids that are present in every one of the wrapped
/// postings lists.
pub struct IntersectionPostings<T: Postings> {
    // `postings[0]` is the list that produced the current candidate; the
    // lists are reordered while iterating.
    postings: Vec<T>,
}

impl<T: Postings> IntersectionPostings<T> {
    /// Wraps `postings`; each list is expected to yield increasing doc ids.
    pub fn from_postings(postings: Vec<T>) -> IntersectionPostings<T> {
        IntersectionPostings {
            postings: postings,
        }
    }
}

impl<T: Postings> Iterator for IntersectionPostings<T> {
    type Item = DocId;

    /// Returns the next doc id shared by all lists, or `None` as soon as one
    /// of them is exhausted. With no list at all, nothing is yielded.
    fn next(&mut self,) -> Option<DocId> {
        // The first list proposes a candidate...
        let mut candidate = match self.postings.first_mut() {
            Some(leader) => match leader.next() {
                Some(doc) => doc,
                None => return None,
            },
            None => return None,
        };
        'outer: loop {
            // ...and every other list must confirm it.
            for i in 1..self.postings.len() {
                match self.postings[i].skip_next(candidate) {
                    None => return None,
                    Some(doc) if doc == candidate => {}
                    Some(greater) => {
                        // This list jumped past the candidate: it becomes the
                        // leader and its doc id the new candidate, which all
                        // the other lists have to confirm again.
                        self.postings.swap(0, i);
                        candidate = greater;
                        continue 'outer;
                    }
                }
            }
            return Some(candidate);
        }
    }
}
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>, // TODO find an alternative to work around the "move"
postings_write: WritePtr,
written_bytes_postings: usize,
encoder: simdcompression::Encoder,
}
impl PostingsSerializer {
pub fn open(segment: &Segment) -> io::Result<PostingsSerializer> {
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: postings_write,
written_bytes_postings: 0,
encoder: simdcompression::Encoder::new(),
})
}
pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
let term_info = TermInfo {
doc_freq: doc_freq,
postings_offset: self.written_bytes_postings as u32,
};
self.terms_fst_builder
.insert(term.as_slice(), &term_info)
}
pub fn write_docs(&mut self, doc_ids: &[DocId]) -> io::Result<()> {
let docs_data = self.encoder.encode_sorted(doc_ids);
self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write));
for num in docs_data {
self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
}
Ok(())
}
pub fn close(mut self,) -> io::Result<()> {
try!(self.terms_fst_builder.finish());
try!(self.postings_write.flush());
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
use core::schema::DocId;
// Postings list backed by a plain vector of doc ids, used as a test double.
#[derive(Debug)]
pub struct VecPostings {
doc_ids: Vec<DocId>,
// Index of the next doc id to return.
cursor: usize,
}
impl VecPostings {
pub fn new(vals: Vec<DocId>) -> VecPostings {
VecPostings {
doc_ids: vals,
cursor: 0,
}
}
}
impl Postings for VecPostings {
// Consumes doc ids until one that is greater than or equal to `target`
// is found and returns it; returns `None` if the list runs out first.
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
loop {
match Iterator::next(self) {
Some(val) if val >= target => {
return Some(val);
},
None => {
return None;
},
_ => {}
}
}
}
}
impl Iterator for VecPostings {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
if self.cursor >= self.doc_ids.len() {
None
}
else {
self.cursor += 1;
Some(self.doc_ids[self.cursor - 1])
}
}
}
// Intersection of two, then of three postings lists.
#[test]
fn test_intersection() {
{
let left = VecPostings::new(vec!(1, 3, 9));
let right = VecPostings::new(vec!(3, 4, 9, 18));
let inter = IntersectionPostings::from_postings(vec!(left, right));
let vals: Vec<DocId> = inter.collect();
assert_eq!(vals, vec!(3, 9));
}
{
let a = VecPostings::new(vec!(1, 3, 9));
let b = VecPostings::new(vec!(3, 4, 9, 18));
let c = VecPostings::new(vec!(1, 5, 9, 111));
let inter = IntersectionPostings::from_postings(vec!(a, b, c));
let vals: Vec<DocId> = inter.collect();
assert_eq!(vals, vec!(9));
}
}
// Baseline: iterating an "intersection" made of a single list of one
// million doc ids.
#[bench]
fn bench_single_intersection(b: &mut Bencher) {
b.iter(|| {
let docs = VecPostings::new((0..1_000_000).collect());
let intersection = IntersectionPostings::from_postings(vec!(docs));
intersection.count()
});
}
}

View File

@@ -1,27 +1,27 @@
use core::index::{Segment, SegmentId};
use core::schema::Term;
use core::store::StoreReader;
use core::schema::Document;
use core::postings::IntersectionPostings;
use core::directory::ReadOnlySource;
use schema::Term;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use std::io::Cursor;
use core::schema::DocId;
use DocId;
use core::index::SegmentComponent;
use core::simdcompression::Decoder;
use std::io;
use std::str;
use core::postings::TermInfo;
use core::fstmap::FstMap;
use postings::TermInfo;
use datastruct::FstMap;
use std::fmt;
use rustc_serialize::json;
use core::index::SegmentInfo;
use core::timer::TimerHandle;
use core::schema::U32Field;
use common::TimerTree;
use common::Timing;
use common::OpenTimer;
use schema::U32Field;
use core::convert_to_ioerror;
use core::serialize::BinarySerializable;
use core::fastfield::U32FastFieldsReader;
use core::fastfield::U32FastFieldReader;
use core::simdcompression;
use common::BinarySerializable;
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
use compression;
use compression::S4BP128Decoder;
use std::mem;
impl fmt::Debug for SegmentReader {
@@ -43,7 +43,7 @@ pub fn intersection(mut postings: Vec<SegmentPostings>) -> SegmentPostings {
let mut pair = (output, buffer);
for posting in postings.iter() {
pair = (pair.1, pair.0);
let output_len = simdcompression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice());
let output_len = compression::intersection(posting.0.as_slice(), pair.0.as_slice(), pair.1.as_mut_slice());
unsafe { pair.1.set_len(output_len); }
}
SegmentPostings(pair.1)
@@ -77,8 +77,8 @@ impl SegmentPostings {
let mut doc_ids: Vec<u32> = Vec::with_capacity(doc_freq as usize);
unsafe { doc_ids.set_len(doc_freq as usize); }
{
let decoder = Decoder::new();
let num_doc_ids = decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids);
let decoder = S4BP128Decoder::new();
decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids);
SegmentPostings(doc_ids)
}
}
@@ -194,7 +194,7 @@ impl SegmentReader {
/// Returns the list of doc ids containing all of the
/// given terms.
pub fn search<'a>(&self, terms: &Vec<Term>, mut timer: TimerHandle<'a>) -> SegmentPostings {
pub fn search<'a>(&self, terms: &Vec<Term>, mut timer: OpenTimer<'a>) -> SegmentPostings {
if terms.len() == 1 {
match self.get_term(&terms[0]) {
Some(term_info) => {
@@ -212,7 +212,7 @@ impl SegmentReader {
for term in terms.iter() {
match self.get_term(term) {
Some(term_info) => {
let decode_one_timer = decode_timer.open("decode_one");
let _decode_one_timer = decode_timer.open("decode_one");
let segment_posting = self.read_postings(&term_info);
segment_postings.push(segment_posting);
}
@@ -224,7 +224,7 @@ impl SegmentReader {
}
}
{
let intersection_time = timer.open("intersection");
let _intersection_time = timer.open("intersection");
intersection(segment_postings)
}
}

View File

@@ -1,563 +0,0 @@
use std::io::Write;
use std::collections::HashMap;
use std::slice;
use std::fmt;
use std::io;
use std::io::Read;
use core::serialize::BinarySerializable;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use std::ops::BitOr;
use std::borrow::Borrow;
use std::convert::AsRef;
/// `u32` identifying a document within a segment.
/// Documents get their doc id assigned incrementally,
/// as they are added to the segment.
pub type DocId = u32;
/// Set of flags describing how a text field is handled.
///
/// The flags can be combined with `|` (see the `BitOr` implementation) or
/// enabled one at a time with the `set_*` methods.
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct TextOptions {
// The field is tokenized and indexed.
tokenized_indexed: bool,
// The field is stored.
stored: bool,
// The field is a fast field.
fast: bool,
}
/// Set of flags describing how a u32 field is handled.
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct U32Options {
// The field is indexed.
indexed: bool,
// The field is a fast field.
fast: bool,
// The field is stored.
stored: bool,
}
/// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions {
tokenized_indexed: true,
stored: false,
fast: false,
};
/// The u32 field will be a fast field; it is neither indexed nor stored.
pub const FAST_U32: U32Options = U32Options {
indexed: false,
stored: false,
fast: true,
};
/// The stored fields of a document can be retrieved given its DocId.
/// Stored fields are stored together and LZ4 compressed.
/// Reading the stored fields of a document is relatively slow
/// (100 microsecs).
pub const STORED: TextOptions = TextOptions {
tokenized_indexed: false,
stored: true,
fast: false,
};
/// Fast fields are meant for fields you need to access many times during
/// collection (e.g. for sorting or aggregates).
pub const FAST: TextOptions = TextOptions {
tokenized_indexed: false,
stored: false,
fast: true
};
impl BitOr for TextOptions {
type Output = TextOptions;
fn bitor(self, other: TextOptions) -> TextOptions {
let mut res = TextOptions::new();
res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
res.stored = self.stored || other.stored;
res.fast = self.fast || other.fast;
res
}
}
/// Handle of a u32 field: wraps the `u8` identifier that `Schema` assigned
/// to the field when it was added.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct U32Field(pub u8);
/// Handle of a text field: wraps the `u8` identifier that `Schema` assigned
/// to the field when it was added.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct TextField(pub u8);
impl U32Options {
    /// Creates options with every flag disabled.
    pub fn new() -> U32Options {
        U32Options {
            indexed: false,
            fast: false,
            stored: false,
        }
    }

    /// Tells whether the field is indexed.
    pub fn is_indexed(&self,) -> bool {
        self.indexed
    }

    /// Returns the same options with the `indexed` flag enabled.
    pub fn set_indexed(self,) -> U32Options {
        U32Options { indexed: true, ..self }
    }

    /// Tells whether the field is a fast field.
    pub fn is_fast(&self,) -> bool {
        self.fast
    }

    /// Returns the same options with the `fast` flag enabled.
    pub fn set_fast(self,) -> U32Options {
        U32Options { fast: true, ..self }
    }
}
impl TextOptions {
    /// Tells whether the field is tokenized and indexed.
    pub fn is_tokenized_indexed(&self,) -> bool {
        self.tokenized_indexed
    }

    /// Tells whether the field is stored.
    pub fn is_stored(&self,) -> bool {
        self.stored
    }

    /// Tells whether the field is a fast field.
    pub fn is_fast(&self,) -> bool {
        self.fast
    }

    /// Returns the same options with the `stored` flag enabled.
    pub fn set_stored(self,) -> TextOptions {
        TextOptions { stored: true, ..self }
    }

    /// Returns the same options with the `fast` flag enabled.
    pub fn set_fast(self,) -> TextOptions {
        TextOptions { fast: true, ..self }
    }

    /// Returns the same options with the `tokenized_indexed` flag enabled.
    pub fn set_tokenized_indexed(self,) -> TextOptions {
        TextOptions { tokenized_indexed: true, ..self }
    }

    /// Creates options with every flag disabled.
    pub fn new() -> TextOptions {
        TextOptions {
            fast: false,
            tokenized_indexed: false,
            stored: false,
        }
    }
}
/// A u32 value attached to a u32 field of a document.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct U32FieldValue {
pub field: U32Field,
pub value: u32,
}
/// A text attached to a text field of a document.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct TextFieldValue {
pub field: TextField,
pub text: String,
}
impl BinarySerializable for TextField {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let TextField(field_id) = *self;
field_id.serialize(writer)
}
fn deserialize(reader: &mut Read) -> io::Result<TextField> {
u8::deserialize(reader).map(TextField)
}
}
impl BinarySerializable for U32Field {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let U32Field(field_id) = *self;
field_id.serialize(writer)
}
fn deserialize(reader: &mut Read) -> io::Result<U32Field> {
u8::deserialize(reader).map(U32Field)
}
}
impl BinarySerializable for TextFieldValue {
    /// Writes the field handle followed by the text; returns the number of
    /// bytes written.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let field_len = try!(self.field.serialize(writer));
        let text_len = try!(self.text.serialize(writer));
        Ok(field_len + text_len)
    }

    /// Reads the field handle and the text back, in the order `serialize`
    /// wrote them.
    fn deserialize(reader: &mut Read) -> io::Result<Self> {
        let field = try!(TextField::deserialize(reader));
        let text = try!(String::deserialize(reader));
        let field_value = TextFieldValue {
            field: field,
            text: text,
        };
        Ok(field_value)
    }
}
/// A term, kept as raw bytes.
///
/// The constructors of `Term` put a byte derived from the field identifier
/// first, followed by the bytes of the value.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term {
    data: Vec<u8>,
}

impl AsRef<[u8]> for Term {
    /// Gives access to the raw bytes of the term.
    fn as_ref(&self) -> &[u8] {
        &self.data[..]
    }
}
/// Name and options of a text field, as kept by `Schema`.
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
struct TextFieldEntry {
name: String,
option: TextOptions,
}
/// Name and options of a u32 field, as kept by `Schema`.
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
pub struct U32FieldEntry {
pub name: String,
pub option: U32Options,
}
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
/// It is for the moment impossible to remove fields.
///
/// # Examples
///
/// ```
/// use tantivy::schema::{Schema, TextOptions};
///
/// fn create_schema() -> Schema {
///     let mut schema = Schema::new();
///     let str_fieldtype = TextOptions::new();
///     let text_fieldtype = TextOptions::new().set_tokenized_indexed();
///     let id_field = schema.add_text_field("id", &str_fieldtype);
///     let url_field = schema.add_text_field("url", &str_fieldtype);
///     let title_field = schema.add_text_field("title", &text_fieldtype);
///     let body_field = schema.add_text_field("body", &text_fieldtype);
///     schema
/// }
///
/// let schema = create_schema();
/// ```
#[derive(Clone, Debug)]
pub struct Schema {
text_fields: Vec<TextFieldEntry>,
text_fields_map: HashMap<String, TextField>, // transient
u32_fields: Vec<U32FieldEntry>,
u32_fields_map: HashMap<String, U32Field>, // transient
}
/// Rebuilds a schema from a sequence of text field entries.
///
/// Only text fields are read here: each decoded entry is registered again
/// through `add_text_field`, which also rebuilds the name lookup map.
/// u32 fields are not part of the decoded data.
impl Decodable for Schema {
fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
let mut schema = Schema::new();
try!(d.read_seq(|d, num_fields| {
for _ in 0..num_fields {
let field_entry = try!(TextFieldEntry::decode(d));
let field_options: &TextOptions = &field_entry.option;
schema.add_text_field(&field_entry.name, field_options);
}
Ok(())
}));
Ok(schema)
}
}
/// Encodes the schema as the sequence of its text field entries.
///
/// NOTE(review): `u32_fields` is not emitted, so a schema holding u32 fields
/// does not survive an encode/decode round trip. Confirm whether this is
/// intended.
impl Encodable for Schema {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
try!(s.emit_seq(self.text_fields.len(),
|mut e| {
for (ord, field) in self.text_fields.iter().enumerate() {
try!(e.emit_seq_elt(ord, |e| field.encode(e)));
}
Ok(())
}));
Ok(())
}
}
impl Schema {
/// Creates a new, empty schema.
pub fn new() -> Schema {
Schema {
text_fields: Vec::new(),
text_fields_map: HashMap::new(),
u32_fields: Vec::new(),
u32_fields_map: HashMap::new(),
}
}
pub fn get_u32_fields(&self,) -> &Vec<U32FieldEntry> {
&self.u32_fields
}
/// Given a name, returns the field handle, as well as its associated TextOptions
pub fn get_text_field(&self, field_name: &str) -> Option<(TextField, TextOptions)> {
self.text_fields_map
.get(field_name)
.map(|&TextField(field_id)| {
let field_options = self.text_fields[field_id as usize].option.clone();
(TextField(field_id), field_options)
})
}
pub fn get_u32_field(&self, field_name: &str) -> Option<(U32Field, U32Options)> {
self.u32_fields_map
.get(field_name)
.map(|&U32Field(field_id)| {
let u32_field_options = self.u32_fields[field_id as usize].option.clone();
(U32Field(field_id), u32_field_options)
})
}
/// Returns the field options associated with a given name.
///
/// # Panics
/// Panics if the field name does not exist.
/// It is meant as an helper for user who created
/// and control the content of their schema.
///
/// If panicking is not an option for you,
/// you may use `get(&self, field_name: &str)`.
pub fn text_field(&self, fieldname: &str) -> TextField {
self.text_fields_map.get(fieldname).map(|field| field.clone()).unwrap()
}
pub fn u32_field(&self, fieldname: &str) -> U32Field {
self.u32_fields_map.get(fieldname).map(|field| field.clone()).unwrap()
}
/// Returns the field options associated to a field handle.
pub fn text_field_options(&self, field: &TextField) -> TextOptions {
let TextField(field_id) = *field;
self.text_fields[field_id as usize].option.clone()
}
pub fn u32_field_options(&self, field: &U32Field) -> U32Options {
let U32Field(field_id) = *field;
self.u32_fields[field_id as usize].option.clone()
}
/// Creates a new field.
/// Return the associated field handle.
pub fn add_text_field<RefTextOptions: Borrow<TextOptions>>(&mut self, field_name_str: &str, field_options: RefTextOptions) -> TextField {
let field = TextField(self.text_fields.len() as u8);
// TODO case if field already exists
let field_name = String::from(field_name_str);
self.text_fields.push(TextFieldEntry {
name: field_name.clone(),
option: field_options.borrow().clone(),
});
self.text_fields_map.insert(field_name, field.clone());
field
}
/// Creates a new field.
/// Return the associated field handle.
pub fn add_u32_field<RefU32Options: Borrow<U32Options>>(&mut self, field_name_str: &str, field_options: RefU32Options) -> U32Field {
let field = U32Field(self.u32_fields.len() as u8);
// TODO case if field already exists
let field_name = String::from(field_name_str);
self.u32_fields.push(U32FieldEntry {
name: field_name.clone(),
option: field_options.borrow().clone(),
});
self.u32_fields_map.insert(field_name, field.clone());
field
}
}
impl Term {
    /// Builds the term of a u32 value: one byte holding the field
    /// identifier with its high bit set, followed by the serialized value.
    pub fn from_field_u32(field: &U32Field, val: u32) -> Term {
        let mut data = Vec::with_capacity(1 + 4);
        // The high bit tells u32 terms apart from text terms.
        data.push(128 | field.0);
        val.serialize(&mut data).unwrap();
        Term {
            data: data,
        }
    }

    /// Builds the term of a text value: one byte holding the field
    /// identifier, followed by the UTF-8 bytes of `text`.
    pub fn from_field_text(field: &TextField, text: &str) -> Term {
        let text_bytes = text.as_bytes();
        let mut data = Vec::with_capacity(1 + text_bytes.len());
        data.push(field.0);
        data.extend_from_slice(text_bytes);
        Term {
            data: data,
        }
    }

    /// Builds a term from its raw bytes, copied as they are.
    pub fn from(data: &[u8]) -> Term {
        Term {
            data: data.to_vec(),
        }
    }

    /// Returns the raw bytes of the term.
    pub fn as_slice(&self,)->&[u8] {
        &self.data[..]
    }
}
impl fmt::Debug for Term {
    /// Prints the first byte of the term, i.e. the byte the constructors
    /// derive from the field identifier.
    ///
    /// A term built from an empty slice has no such byte; it is printed as
    /// `Term()` so that formatting never panics.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.data.first() {
            Some(field_byte) => write!(f, "Term({})", field_byte),
            None => write!(f, "Term()"),
        }
    }
}
///
/// A document is really just a list of field values:
/// one list for its text fields and one for its u32 fields.
///
/// # Examples
///
/// ```
/// use tantivy::schema::Schema;
/// use tantivy::schema::TEXT;
///
/// let mut schema = Schema::new();
/// schema.add_text_field("body", &TEXT);
/// let field_text = schema.text_field("body");
/// ```
///
#[derive(Debug)]
pub struct Document {
pub text_field_values: Vec<TextFieldValue>,
pub u32_field_values: Vec<U32FieldValue>,
}
impl Document {
pub fn new() -> Document {
Document {
text_field_values: Vec::new(),
u32_field_values: Vec::new(),
}
}
pub fn from(text_field_values: Vec<TextFieldValue>,
u32_field_values: Vec<U32FieldValue>) -> Document {
Document {
text_field_values: text_field_values,
u32_field_values: u32_field_values
}
}
pub fn len(&self,) -> usize {
self.text_field_values.len()
}
pub fn set(&mut self, field: &TextField, text: &str) {
self.add(TextFieldValue {
field: field.clone(),
text: String::from(text)
});
}
pub fn set_u32(&mut self, field: &U32Field, value: u32) {
self.u32_field_values.push(U32FieldValue {
field: field.clone(),
value: value
});
}
pub fn add(&mut self, field_value: TextFieldValue) {
self.text_field_values.push(field_value);
}
pub fn text_fields<'a>(&'a self,) -> slice::Iter<'a, TextFieldValue> {
self.text_field_values.iter()
}
pub fn u32_fields<'a>(&'a self,) -> slice::Iter<'a, U32FieldValue> {
self.u32_field_values.iter()
}
pub fn get_u32(&self, field: &U32Field) -> Option<u32> {
self.u32_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.value)
.cloned()
.next()
}
pub fn get_texts<'a>(&'a self, field: &TextField) -> Vec<&'a String> {
self.text_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.text)
.collect()
}
pub fn get_first_text<'a>(&'a self, field: &TextField) -> Option<&'a String> {
self.text_field_values
.iter()
.filter(|field_value| field_value.field == *field)
.map(|field_value| &field_value.text)
.next()
}
}
#[cfg(test)]
mod tests {
use super::*;
// Combining option constants with `|` enables the flags of both operands
// and leaves the others disabled.
#[test]
fn test_field_options() {
{
let field_options = STORED | FAST;
assert!(field_options.is_stored());
assert!(field_options.is_fast());
assert!(!field_options.is_tokenized_indexed());
}
{
let field_options = STORED | TEXT;
assert!(field_options.is_stored());
assert!(!field_options.is_fast());
assert!(field_options.is_tokenized_indexed());
}
}
// A field added to the schema can be looked up by name and keeps the
// options it was registered with.
#[test]
fn test_schema() {
{
let mut schema = Schema::new();
schema.add_text_field("body", &TEXT);
let field = schema.text_field("body");
assert!(schema.text_field_options(&field).is_tokenized_indexed());
}
}
}

View File

@@ -1,12 +1,11 @@
use core::reader::SegmentReader;
use core::index::Index;
use core::index::Segment;
use core::schema::DocId;
use core::schema::Document;
use core::collector::Collector;
use DocId;
use schema::{Document, Term};
use collector::Collector;
use std::io;
use core::timer::TimerTree;
use core::schema::Term;
use common::TimerTree;
#[derive(Debug)]
pub struct Searcher {
@@ -56,12 +55,12 @@ impl Searcher {
for (segment_ord, segment) in self.segments.iter().enumerate() {
let mut segment_search_timer = search_timer.open("segment_search");
{
let set_segment_timer = segment_search_timer.open("set_segment");
let _ = segment_search_timer.open("set_segment");
try!(collector.set_segment(segment_ord as SegmentLocalId, &segment));
}
let postings = segment.search(terms, segment_search_timer.open("get_postings"));
{
let collection_timer = segment_search_timer.open("collection");
let _collection_timer = segment_search_timer.open("collection");
for doc_id in postings {
collector.collect(doc_id);
}

View File

@@ -1,203 +0,0 @@
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use std::fmt;
use std::io::Write;
use std::io::Read;
use std::io;
use byteorder;
fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error {
match byteorder_error {
byteorder::Error::UnexpectedEOF => io::Error::new(io::ErrorKind::InvalidData, "Reached EOF unexpectedly"),
byteorder::Error::Io(e) => e,
}
}
/// Types that can be written to and read back from a binary stream.
pub trait BinarySerializable : fmt::Debug + Sized {
/// Writes `self` to `writer`.
/// The implementations in this file return the number of bytes written.
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
/// Reads a value from `reader`.
fn deserialize(reader: &mut Read) -> io::Result<Self>;
}
/// The unit type takes no space: nothing is written and nothing is read.
impl BinarySerializable for () {
fn serialize(&self, _: &mut Write) -> io::Result<usize> {
Ok(0)
}
fn deserialize(_: &mut Read) -> io::Result<Self> {
Ok(())
}
}
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
    /// Writes the number of items as a `u32`, then every item in order.
    /// Returns the total number of bytes written.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let header_len = try!((self.len() as u32).serialize(writer));
        let mut total_size = header_len;
        for item in self {
            total_size += try!(item.serialize(writer));
        }
        Ok(total_size)
    }

    /// Reads the `u32` item count, then that many items.
    fn deserialize(reader: &mut Read) -> io::Result<Vec<T>> {
        let num_items = try!(u32::deserialize(reader)) as usize;
        let mut items: Vec<T> = Vec::with_capacity(num_items);
        while items.len() < num_items {
            items.push(try!(T::deserialize(reader)));
        }
        Ok(items)
    }
}
impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for (Left, Right) {
    /// Writes the first element, then the second one; returns the total
    /// number of bytes written.
    fn serialize(&self, write: &mut Write) -> io::Result<usize> {
        let left_len = try!(self.0.serialize(write));
        let right_len = try!(self.1.serialize(write));
        Ok(left_len + right_len)
    }

    /// Reads the first element, then the second one.
    fn deserialize(reader: &mut Read) -> io::Result<Self> {
        let left = try!(Left::deserialize(reader));
        let right = try!(Right::deserialize(reader));
        Ok((left, right))
    }
}
impl BinarySerializable for u32 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
writer.write_u32::<NativeEndian>(self.clone())
.map(|_| 4)
.map_err(convert_byte_order_error)
}
fn deserialize(reader: &mut Read) -> io::Result<u32> {
reader.read_u32::<NativeEndian>()
.map_err(convert_byte_order_error)
}
}
impl BinarySerializable for u64 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
writer.write_u64::<NativeEndian>(self.clone())
.map(|_| 8)
.map_err(convert_byte_order_error)
}
fn deserialize(reader: &mut Read) -> io::Result<u64> {
reader.read_u64::<NativeEndian>()
.map_err(convert_byte_order_error)
}
}
impl BinarySerializable for u8 {
    /// Writes the value as a single byte.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        // TODO error
        writer.write_u8(*self)
              .map(|_| 1)
              .map_err(convert_byte_order_error)
    }

    /// Reads a single byte.
    fn deserialize(reader: &mut Read) -> io::Result<u8> {
        let value = try!(reader.read_u8().map_err(convert_byte_order_error));
        Ok(value)
    }
}
impl BinarySerializable for String {
    /// Writes the length of the string in bytes as a `u32`, followed by its
    /// UTF-8 bytes. Returns the total number of bytes written.
    ///
    /// # Errors
    /// Returns `InvalidInput` if the string is too long for its length to
    /// fit in the `u32` header; writer errors are propagated.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let data: &[u8] = self.as_bytes();
        // A silently truncated length would make the payload unreadable.
        if data.len() > u32::max_value() as usize {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                "String is too long for its length to fit in a u32"));
        }
        let mut size = try!((data.len() as u32).serialize(writer));
        size += data.len();
        try!(writer.write_all(data));
        Ok(size)
    }

    /// Reads the `u32` length header, then exactly that many bytes of UTF-8.
    ///
    /// # Errors
    /// Returns `InvalidData` if the input ends before the announced number
    /// of bytes could be read. Reader errors, including invalid UTF-8, are
    /// propagated.
    fn deserialize(reader: &mut Read) -> io::Result<String> {
        let string_length = try!(u32::deserialize(reader)) as usize;
        let mut result = String::with_capacity(string_length);
        let num_bytes_read = try!(reader.take(string_length as u64).read_to_string(&mut result));
        // `read_to_string` stops quietly at the end of the input: a short
        // read means that the payload was truncated.
        if num_bytes_read != string_length {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "Reached EOF unexpectedly"));
        }
        Ok(result)
    }
}
#[cfg(test)]
mod test {
use core::serialize::BinarySerializable;
use std::io::Cursor;
// Each u8 takes one byte; reading past the end is an error.
#[test]
fn test_serialize_u8() {
let mut buffer: Vec<u8> = Vec::new();
{
let x: u8 = 3;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 1);
}
{
let x: u8 = 5;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 2);
}
let mut cursor = Cursor::new(&buffer[..]);
assert_eq!(3, u8::deserialize(&mut cursor).unwrap());
assert_eq!(5, u8::deserialize(&mut cursor).unwrap());
assert!(u8::deserialize(&mut cursor).is_err());
}
// Each u32 takes four bytes; reading past the end is an error.
#[test]
fn test_serialize_u32() {
let mut buffer: Vec<u8> = Vec::new();
{
let x: u32 = 3;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 4);
}
{
let x: u32 = 5;
x.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 8);
}
let mut cursor = Cursor::new(&buffer[..]);
assert_eq!(3, u32::deserialize(&mut cursor).unwrap());
assert_eq!(5, u32::deserialize(&mut cursor).unwrap());
assert!(u32::deserialize(&mut cursor).is_err());
}
// A string takes a 4-byte length header plus its UTF-8 bytes; every
// character used here is 3 bytes long.
#[test]
fn test_serialize_string() {
let mut buffer: Vec<u8> = Vec::new();
let first_length = 4 + 3 * 4;
let second_length = 4 + 3 * 8;
{
let x: String = String::from("ぽよぽよ");
assert_eq!(x.serialize(&mut buffer).unwrap(), first_length);
assert_eq!(buffer.len(), first_length);
}
{
let x: String = String::from("富士さん見える。");
assert_eq!(x.serialize(&mut buffer).unwrap(), second_length);
assert_eq!(buffer.len(), first_length + second_length);
}
let mut cursor = Cursor::new(&buffer[..]);
assert_eq!("ぽよぽよ", String::deserialize(&mut cursor).unwrap());
assert_eq!("富士さん見える。", String::deserialize(&mut cursor).unwrap());
assert!(u32::deserialize(&mut cursor).is_err());
}
// A vector adds its own 4-byte item count in front of its items.
#[test]
fn test_serialize_vec() {
let mut buffer: Vec<u8> = Vec::new();
let first_length = 4 + 3 * 4;
let second_length = 4 + 3 * 8;
let vec = vec!(String::from("ぽよぽよ"), String::from("富士さん見える。"));
assert_eq!(vec.serialize(&mut buffer).unwrap(), first_length + second_length + 4);
let mut cursor = Cursor::new(&buffer[..]);
{
let deser: Vec<String> = Vec::deserialize(&mut cursor).unwrap();
assert_eq!(deser.len(), 2);
assert_eq!("ぽよぽよ", deser[0]);
assert_eq!("富士さん見える。", deser[1]);
}
}
}

View File

@@ -1,234 +0,0 @@
use libc::size_t;
use std::ptr;
// Declarations of the native routines wrapped by `intersection`, `Encoder`
// and `Decoder` below. Sizes and capacities are passed as `size_t`.
// NOTE(review): they are presumably counted in `u32` elements, not bytes,
// since the wrappers pass slice lengths directly — confirm against the
// native sources.
extern {
// The unsorted variants are currently disabled, together with their wrappers.
// fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
// fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
// Takes no output capacity: the caller must provide a large enough `output`.
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
// The returned value is used by `Encoder::encode_sorted` as the number of `u32` written to `output`.
fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
// The returned value is compared to the number of decoded integers in the tests.
fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
}
/// Intersects `left` and `right` with the native routine, writing the result
/// into `output`.
///
/// Returns the value reported by `intersection_native`, presumably the number
/// of elements written to the front of `output`.
/// NOTE(review): the inputs are presumably expected to be sorted — confirm
/// against the native implementation.
///
/// # Panics
/// Panics if `output` is shorter than `min(left.len(), right.len())`, the
/// largest possible intersection. The native routine receives no output
/// capacity, so a shorter buffer could be overrun from safe code.
pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize {
    let max_intersection_len = ::std::cmp::min(left.len(), right.len());
    assert!(output.len() >= max_intersection_len,
            "output buffer too small for the intersection");
    // SAFETY: the input pointers come from live slices and are passed with
    // their lengths; `output` holds at least as many elements as the largest
    // possible intersection (checked above).
    unsafe {
        intersection_native(
            left.as_ptr(), left.len(),
            right.as_ptr(), right.len(),
            output.as_mut_ptr())
    }
}
pub struct Encoder {
input_buffer: Vec<u32>,
output_buffer: Vec<u32>,
}
impl Encoder {
pub fn new() -> Encoder {
Encoder {
input_buffer: Vec::new(),
output_buffer: Vec::new(),
}
}
pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] {
self.input_buffer.clear();
let input_len = input.len();
if input_len + 10000 >= self.input_buffer.len() {
let target_length = input_len + 1024;
self.input_buffer.resize(target_length, 0);
self.output_buffer.resize(target_length, 0);
}
// TODO use clone_from when available
unsafe {
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
let written_size = encode_sorted_native(
self.input_buffer.as_mut_ptr(),
input_len as size_t,
self.output_buffer.as_mut_ptr(),
self.output_buffer.len() as size_t,
);
return &self.output_buffer[0..written_size];
}
}
// pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] {
// self.input_buffer.clear();
// let input_len = input.len();
// if input_len + 10000 >= self.input_buffer.len() {
// let target_length = input_len + 1024;
// self.input_buffer.resize(target_length, 0);
// self.output_buffer.resize(target_length, 0);
// }
// // TODO use clone_from when available
// unsafe {
// ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
// let written_size = encode_unsorted_native(
// self.input_buffer.as_mut_ptr(),
// input_len as size_t,
// self.output_buffer.as_mut_ptr(),
// self.output_buffer.len() as size_t,
// );
// return &self.output_buffer[0..written_size];
// }
// }
}
/// Stateless handle over the native sorted-integer decoder.
pub struct Decoder;

impl Decoder {

    /// Creates a decoder.
    pub fn new() -> Decoder {
        Decoder
    }

    /// Decodes `compressed_data` into `uncompressed_values`.
    ///
    /// The lengths of both slices are forwarded to `decode_sorted_native`,
    /// whose return value is handed back unchanged.
    pub fn decode_sorted(&self,
                         compressed_data: &[u32],
                         uncompressed_values: &mut [u32]) -> size_t {
        let compressed_size = compressed_data.len() as size_t;
        let output_capacity = uncompressed_values.len() as size_t;
        unsafe {
            decode_sorted_native(
                compressed_data.as_ptr(),
                compressed_size,
                uncompressed_values.as_mut_ptr(),
                output_capacity)
        }
    }

    // `decode_unsorted` is disabled, like the `decode_unsorted_native`
    // binding it relied on.
}
//
// pub struct Intersector {
// output_buffer: Vec<u32>,
// }
//
// impl Intersector {
// fn new() -> Intersector {
// Intersector::with_capacity(1_000_000)
// }
// fn with_capacity(capacity: usize) -> Intersector {
// Intersector {
// output_buffer: iter::repeat(0u32).take(capacity).collect()
// }
// }
// fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] {
// let max_intersection_length = min(left.len(), right.len());
// if self.output_buffer.len() < max_intersection_length {
// self.output_buffer.resize(max_intersection_length, 0);
// }
// unsafe {
// let intersection_len = intersection_native(
// left.as_ptr(), left.len() as size_t,
// right.as_ptr(), right.len() as size_t,
// self.output_buffer.as_mut_ptr());
// return &self.output_buffer[0..intersection_len];
// }
// }
// }
// Round-trip test and decoding benchmark for the native sorted codec.
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
// Builds an increasing sequence of at most `n` integers: every integer of
// `0..u32::max_value()` is kept when the next random `f32` is below `ratio`.
// The RNG seed is `[1, 2, 3, seed_val]`, so the output is reproducible.
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32()< ratio)
.take(n)
.collect()
}
// Same as `generate_array_with_seed`, with the seed value fixed to 4.
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
// Encodes 10000 increasing values (`i * 7 / 2`), checks the compressed
// length, then decodes and compares with the input.
#[test]
fn test_encode_big() {
let mut encoder = Encoder::new();
let num_ints = 10000 as usize;
// Compressed length, in `u32` words, expected from the native encoder for this input.
let expected_length = 1274;
let input: Vec<u32> = (0..num_ints as u32)
.map(|i| i * 7 / 2)
.into_iter().collect();
let encoded_data = encoder.encode_sorted(&input);
assert_eq!(encoded_data.len(), expected_length);
let decoder = Decoder::new();
// Pre-filled with values different from the input, so a no-op decode would fail the comparison below.
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data));
assert_eq!(decoded_data, input);
}
// #[test]
// fn test_encode_unsorted() {
// let mut encoder = Encoder::new();
// let num_ints = 10_000 as usize;
// let expected_length = 4361;
// let input: Vec<u32> = (0..num_ints as u32)
// .map(|i| i * 213_127 % 501)
// .into_iter().collect();
// assert_eq!(input.len(), 10_000);
// let encoded_data = encoder.encode_unsorted(&input);
// assert_eq!(encoded_data.len(), expected_length);
// let decoder = Decoder::new();
// let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
// assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data));
// assert_eq!(decoded_data, input);
// }
//
// #[test]
// fn test_simd_intersection() {
// let mut intersector = Intersector::new();
// let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
// let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
// let intersection = intersector.intersection(&arr1[..], &arr2[..]) ;
// assert_eq!(intersection.len(), 500_233);
// }
// Measures the decoding of one million integers; the encoding happens once, outside of the timed closure.
#[bench]
fn bench_decode(b: &mut Bencher) {
const TEST_SIZE: usize = 1_000_000;
let arr = generate_array(TEST_SIZE, 0.1);
let mut encoder = Encoder::new();
let encoded = encoder.encode_sorted(&arr);
let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
let decoder = Decoder;
b.iter(|| {
decoder.decode_sorted(&encoded, &mut uncompressed);
});
}
// #[bench]
// fn bench_simd_intersection(b: &mut Bencher) {
// let mut intersector = Intersector::new();
// let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
// let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
// b.iter(|| {
// intersector.intersection(&arr1[..], &arr2[..]).len()
// });
// }
}

View File

@@ -1,286 +0,0 @@
use core::directory::WritePtr;
use std::cell::RefCell;
use core::schema::DocId;
use core::schema::Document;
use core::schema::TextFieldValue;
use core::serialize::BinarySerializable;
use core::directory::ReadOnlySource;
use std::io::Write;
use std::io::Read;
use std::io::Cursor;
use std::io;
use std::io::SeekFrom;
use std::io::Seek;
use std::cmp::Ordering;
use lz4;
// TODO cache uncompressed pages
// Threshold, in bytes, on the uncompressed block buffer of `StoreWriter`:
// once the buffer grows past it, the block is compressed and written out.
const BLOCK_SIZE: usize = 131_072;
// Entry of the store's block index: `(doc, offset)`.
// `StoreWriter` pushes one entry after each compressed block, holding the
// number of documents stored so far and the number of bytes written so far.
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)]
struct OffsetIndex(DocId, u64);
// Writes documents into a store made of lz4-compressed blocks, followed by
// the block index (see `close`).
pub struct StoreWriter {
// Number of documents stored so far.
doc: DocId,
offsets: Vec<OffsetIndex>, // TODO have a better index.
// Number of bytes of block data written to `writer` so far.
written: u64,
writer: WritePtr,
// Scratch buffer: holds one serialized document, then one compressed block.
intermediary_buffer: Vec<u8>,
// Uncompressed content of the block being filled.
current_block: Vec<u8>,
}
/// An `OffsetIndex` is encoded as its `DocId` followed by its `u64` offset.
impl BinarySerializable for OffsetIndex {

    /// Writes both components and returns the sum of their encoded sizes.
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let doc_id_size = try!(self.0.serialize(writer));
        let offset_size = try!(self.1.serialize(writer));
        Ok(doc_id_size + offset_size)
    }

    /// Reads the doc id, then the offset, in the order `serialize` wrote them.
    fn deserialize(reader: &mut Read) -> io::Result<OffsetIndex> {
        let doc_id = try!(DocId::deserialize(reader));
        let offset = try!(u64::deserialize(reader));
        Ok(OffsetIndex(doc_id, offset))
    }
}
impl StoreWriter {

    /// Creates a store writer with no document, writing to `writer`.
    pub fn new(writer: WritePtr) -> StoreWriter {
        StoreWriter {
            doc: 0,
            written: 0,
            offsets: Vec::new(),
            writer: writer,
            intermediary_buffer: Vec::new(),
            current_block: Vec::new(),
        }
    }

    /// Appends all the blocks of an existing store after the blocks written
    /// so far.
    ///
    /// The pending block, if any, is flushed first. The compressed blocks of
    /// `reader` are then copied verbatim, and its index entries are added to
    /// this writer's index, shifted by the current document count and byte
    /// count.
    ///
    /// # Errors
    /// Forwards write errors, and fails if `reader` has no index entry.
    pub fn stack_reader(&mut self, reader: &StoreReader) -> io::Result<()> {
        if !self.current_block.is_empty() {
            try!(self.write_and_compress_block());
        }
        match reader.offsets.last() {
            // The last entry holds the reader's total doc count and body size.
            Some(&OffsetIndex(ref num_docs, ref body_size)) => {
                try!(self.writer.write_all(&reader.data.as_slice()[0..*body_size as usize]));
                for &OffsetIndex(doc, offset) in reader.offsets.iter() {
                    self.offsets.push(OffsetIndex(self.doc + doc, self.written + offset));
                }
                self.written += *body_size;
                self.doc += *num_docs;
                Ok(())
            },
            None => {
                Err(io::Error::new(io::ErrorKind::Other, "No offset for reader"))
            }
        }
    }

    /// Stores the field values of one document.
    ///
    /// The document is serialized as a `u32` field count followed by each
    /// field value, and appended to the current block prefixed by its `u32`
    /// byte length. The block is compressed and written out once it grows
    /// past `BLOCK_SIZE`.
    pub fn store<'a>(&mut self, field_values: &Vec<&'a TextFieldValue>) -> io::Result<()> {
        self.intermediary_buffer.clear();
        try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
        for field_value in field_values.iter() {
            try!((*field_value).serialize(&mut self.intermediary_buffer));
        }
        try!((self.intermediary_buffer.len() as u32).serialize(&mut self.current_block));
        try!(self.current_block.write_all(&self.intermediary_buffer[..]));
        self.doc += 1;
        if self.current_block.len() > BLOCK_SIZE {
            try!(self.write_and_compress_block());
        }
        Ok(())
    }

    /// Compresses the current block with lz4 and writes it, prefixed by its
    /// compressed size as a `u32`, then records an index entry and empties
    /// the block.
    ///
    /// # Errors
    /// Forwards compression and write errors. A failure to build the lz4
    /// encoder is reported as an error as well, instead of panicking.
    fn write_and_compress_block(&mut self,) -> io::Result<()> {
        self.intermediary_buffer.clear();
        {
            let mut encoder = try!(lz4::EncoderBuilder::new()
                .build(&mut self.intermediary_buffer));
            try!(encoder.write_all(&self.current_block));
            let (_, encoder_result) = encoder.finish();
            try!(encoder_result);
        }
        let compressed_block_size = self.intermediary_buffer.len() as u64;
        self.written += try!((compressed_block_size as u32).serialize(&mut self.writer)) as u64;
        try!(self.writer.write_all(&self.intermediary_buffer));
        self.written += compressed_block_size;
        // Entry = (documents stored so far, bytes written so far).
        self.offsets.push(OffsetIndex(self.doc, self.written));
        self.current_block.clear();
        Ok(())
    }

    /// Finalizes the store.
    ///
    /// Flushes the pending block, then writes the block index followed by the
    /// `u64` position at which the index starts, and flushes the writer.
    pub fn close(&mut self,) -> io::Result<()> {
        if !self.current_block.is_empty() {
            try!(self.write_and_compress_block());
        }
        let header_offset: u64 = self.written;
        try!(self.offsets.serialize(&mut self.writer));
        try!(header_offset.serialize(&mut self.writer));
        self.writer.flush()
    }
}
// Reads documents back from a store written by `StoreWriter`.
pub struct StoreReader {
// Whole content of the store: compressed blocks, then the block index.
data: ReadOnlySource,
// Block index, starting with an implicit `(0, 0)` entry (see `read_header`).
offsets: Vec<OffsetIndex>,
// Decompressed content of the last block read; overwritten by `read_block`.
current_block: RefCell<Vec<u8>>,
}
impl StoreReader {

    /// Reads the block index located at the end of `data`.
    ///
    /// The last 8 bytes hold the position of the serialized index. The
    /// returned vector starts with an extra `(0, 0)` entry for the first
    /// block.
    ///
    /// # Panics
    /// Panics if the trailer or the index cannot be read.
    fn read_header(data: &ReadOnlySource) -> Vec<OffsetIndex> {
        // TODO err
        // the first offset is implicitly (0, 0)
        let mut offsets = vec!(OffsetIndex(0, 0));
        let mut cursor = Cursor::new(data.as_slice());
        cursor.seek(SeekFrom::End(-8)).unwrap();
        let offset = u64::deserialize(&mut cursor).unwrap();
        cursor.seek(SeekFrom::Start(offset)).unwrap();
        offsets.append(&mut Vec::deserialize(&mut cursor).unwrap());
        offsets
    }

    /// Finds the index entry of the block holding `seek`: the entry whose
    /// doc id equals `seek`, or otherwise the one the bisection narrows
    /// down to.
    fn block_offset(&self, seek: &DocId) -> OffsetIndex {
        // Recursive bisection over the index entries.
        fn search(offsets: &[OffsetIndex], seek: &DocId) -> OffsetIndex {
            let m = offsets.len() / 2;
            let pivot_offset = &offsets[m];
            if offsets.len() <= 1 {
                return pivot_offset.clone()
            }
            match pivot_offset.0.cmp(seek) {
                // The pivot is kept in the slice: it may be the answer.
                Ordering::Less => search(&offsets[m..], seek),
                Ordering::Equal => pivot_offset.clone(),
                Ordering::Greater => search(&offsets[..m], seek),
            }
        }
        search(&self.offsets, seek)
    }

    /// Decompresses the block starting at `block_offset` into
    /// `current_block`.
    ///
    /// A block is a `u32` compressed length followed by that many bytes of
    /// lz4 data.
    ///
    /// # Errors
    /// Forwards the errors of reading the length prefix, creating the lz4
    /// decoder and decompressing, instead of panicking on the first two.
    fn read_block(&self, block_offset: usize) -> io::Result<()> {
        let mut current_block_mut = self.current_block.borrow_mut();
        current_block_mut.clear();
        let total_buffer = self.data.as_slice();
        let mut cursor = Cursor::new(&total_buffer[block_offset..]);
        let block_length = try!(u32::deserialize(&mut cursor)) as usize;
        // Skip the 4 bytes of the length prefix.
        let block_start = block_offset + 4;
        let block_array: &[u8] = &total_buffer[block_start..block_start + block_length];
        let mut lz4_decoder = try!(lz4::Decoder::new(Cursor::new(block_array)));
        lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())
    }

    /// Loads the stored fields of document `doc_id`.
    ///
    /// The block holding the document is decompressed, the documents stored
    /// before it in the block are skipped using their length prefixes, and
    /// its text field values are deserialized. `u32_field_values` is always
    /// left empty.
    pub fn get(&self, doc_id: &DocId) -> io::Result<Document> {
        let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id);
        try!(self.read_block(block_offset as usize));
        let mut current_block_mut = self.current_block.borrow_mut();
        let mut cursor = Cursor::new(&mut current_block_mut[..]);
        for _ in first_doc_id..*doc_id {
            let block_length = try!(u32::deserialize(&mut cursor));
            try!(cursor.seek(SeekFrom::Current(block_length as i64)));
        }
        // Length prefix of the requested document: read and ignored.
        try!(u32::deserialize(&mut cursor));
        let mut text_field_values = Vec::new();
        let num_fields = try!(u32::deserialize(&mut cursor));
        for _ in 0..num_fields {
            let text_field_value = try!(TextFieldValue::deserialize(&mut cursor));
            text_field_values.push(text_field_value);
        }
        let u32_field_values = Vec::new();
        Ok(Document {
            text_field_values: text_field_values,
            u32_field_values: u32_field_values,
        })
    }

    /// Opens a store from its raw content.
    ///
    /// # Panics
    /// Panics if the block index cannot be read (see `read_header`).
    pub fn new(data: ReadOnlySource) -> StoreReader {
        let offsets = StoreReader::read_header(&data);
        StoreReader {
            data: data,
            offsets: offsets,
            current_block: RefCell::new(Vec::new()),
        }
    }
}
// Round-trip test and benchmarks for the document store.
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
use std::path::PathBuf;
use core::schema::Schema;
use core::schema::TextOptions;
use core::schema::TextFieldValue;
use core::directory::{RAMDirectory, Directory, MmapDirectory, WritePtr};
// Writes a store of 1000 documents to `writer` and returns the schema used.
// Every document has the same lorem ipsum "body" and a "title" of the form "Doc <i>".
fn write_lorem_ipsum_store(writer: WritePtr) -> Schema {
let mut schema = Schema::new();
let field_body = schema.add_text_field("body", &TextOptions::new().set_stored());
let field_title = schema.add_text_field("title", &TextOptions::new().set_stored());
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
{
let mut store_writer = StoreWriter::new(writer);
for i in 0..1000 {
let mut fields: Vec<TextFieldValue> = Vec::new();
{
let field_value = TextFieldValue {
field: field_body.clone(),
text: lorem.clone(),
};
fields.push(field_value);
}
{
let title_text = format!("Doc {}", i);
let field_value = TextFieldValue {
field: field_title.clone(),
text: title_text,
};
fields.push(field_value);
}
// `StoreWriter::store` takes a vector of references to the field values.
let fields_refs: Vec<&TextFieldValue> = fields.iter().collect();
store_writer.store(&fields_refs).unwrap();
}
store_writer.close().unwrap();
}
schema
}
// Writes the store to a RAM directory and checks the title of a few documents.
#[test]
fn test_store() {
let path = PathBuf::from("store");
let mut directory = RAMDirectory::create();
let store_file = directory.open_write(&path).unwrap();
let schema = write_lorem_ipsum_store(store_file);
let field_title = schema.text_field("title");
let store_source = directory.open_read(&path).unwrap();
let store = StoreReader::new(store_source);
// Checked doc ids: 0, 1, 3, 4, 6, 7, 9, 10, 12, 13.
for i in (0..10).map(|i| i * 3 / 2) {
assert_eq!(*store.get(&i).unwrap().get_first_text(&field_title).unwrap(), format!("Doc {}", i));
}
}
// Measures writing the whole 1000-document store to a temporary mmap directory.
#[bench]
fn bench_store_encode(b: &mut Bencher) {
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
let path = PathBuf::from("store");
b.iter(|| {
write_lorem_ipsum_store(directory.open_write(&path).unwrap());
});
}
// Measures fetching document 12; the store is written once, outside of the timed closure.
#[bench]
fn bench_store_decode(b: &mut Bencher) {
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
let path = PathBuf::from("store");
write_lorem_ipsum_store(directory.open_write(&path).unwrap());
let store_source = directory.open_read(&path).unwrap();
let store = StoreReader::new(store_source);
b.iter(|| {
store.get(&12).unwrap();
});
}
}

View File

@@ -1,119 +0,0 @@
use time::PreciseTime;
use rustc_serialize::json::ToJson;
use rustc_serialize::json::Json;
use std::collections::BTreeMap;
// Running timer: records a `Timing` into its `TimerTree` when dropped.
pub struct TimerHandle<'a> {
// Label reported in the recorded `Timing`.
name: &'static str,
// Tree receiving the timing; borrowed mutably for the life of the handle.
timer_tree: &'a mut TimerTree,
// Instant at which the handle was opened.
start: PreciseTime,
// Nesting level: 0 for handles opened on the tree, parent's depth + 1 otherwise.
depth: u32,
}
impl<'a> TimerHandle<'a> {

    /// Starts a nested timer called `name`, one level deeper than `self`.
    ///
    /// The child borrows this handle mutably, so it has to be dropped
    /// before the parent can be used again.
    pub fn open(&mut self, name: &'static str) -> TimerHandle {
        let child_depth = self.depth + 1;
        TimerHandle {
            name: name,
            timer_tree: self.timer_tree,
            start: PreciseTime::now(),
            depth: child_depth,
        }
    }
}
/// Dropping a handle stops its timer and appends the measurement to the tree.
impl<'a> Drop for TimerHandle<'a> {
    fn drop(&mut self,) {
        let elapsed = self.start.to(PreciseTime::now());
        let timing = Timing {
            name: self.name,
            // Microseconds elapsed since the handle was opened.
            duration: elapsed.num_microseconds().unwrap(),
            depth: self.depth,
        };
        self.timer_tree.timings.push(timing);
    }
}
// One finished measurement, recorded when a `TimerHandle` is dropped.
#[derive(Debug)]
pub struct Timing {
name: &'static str,
// Elapsed time in microseconds.
duration: i64,
// Nesting level of the timer that produced this measurement.
depth: u32,
}
// Collection of nested timings.
#[derive(Debug)]
pub struct TimerTree {
// Measurements in the order the handles were dropped, so a nested timing is
// recorded before the enclosing one.
timings: Vec<Timing>,
}
impl TimerTree {
pub fn new() -> TimerTree {
TimerTree {
timings: Vec::new(),
}
}
pub fn open(&mut self, name: &'static str) -> TimerHandle {
TimerHandle {
name: name,
timer_tree: self,
start: PreciseTime::now(),
depth: 0,
}
}
}
/// Converts a run of timings ending with a timer of depth `root_depth` into
/// a JSON object.
///
/// The last timing provides the `name` and `duration` entries; the timings
/// before it, if any, become the `children` array, one level deeper.
/// Panics on an empty slice.
fn to_json_obj(timings: &[Timing], root_depth: u32) -> Json {
    let last = timings.len() - 1;
    let root = &timings[last];
    let nested = &timings[..last];
    let mut fields = BTreeMap::new();
    fields.insert("name".to_string(), root.name.to_json());
    fields.insert("duration".to_string(), root.duration.to_json());
    if !nested.is_empty() {
        fields.insert("children".to_string(), to_json_array(nested, root_depth + 1));
    }
    Json::Object(fields)
}
/// Converts a sequence of timings into a JSON array holding one object per
/// timer of depth `root_depth`.
///
/// The sequence is cut right after each timing of depth `root_depth`; each
/// piece is turned into an object by `to_json_obj`. Timings located after
/// the last cut are ignored.
fn to_json_array(timings: &[Timing], root_depth: u32) -> Json {
    let mut items: Vec<Json> = Vec::new();
    let mut start = 0;
    for (idx, timing) in timings.iter().enumerate() {
        if timing.depth == root_depth {
            let stop = idx + 1;
            items.push(to_json_obj(&timings[start..stop], root_depth));
            start = stop;
        }
    }
    Json::Array(items)
}
/// A timer tree is exported as the JSON array of its depth-0 timers, each one
/// nesting its children.
impl ToJson for TimerTree {
    fn to_json(&self) -> Json {
        let top_level_depth = 0;
        to_json_array(&self.timings, top_level_depth)
    }
}
#[cfg(test)]
mod tests {

    use super::*;

    /// Opens the timers a > b > {c, d}; each handle records one timing when
    /// it is dropped.
    #[test]
    fn test_timer() {
        let mut timer_tree = TimerTree::new();
        {
            let mut a = timer_tree.open("a");
            let mut ab = a.open("b");
            // The leaf timers are closed right away.
            drop(ab.open("c"));
            drop(ab.open("d"));
            // `ab` and then `a` are dropped at the end of this scope.
        }
        assert_eq!(timer_tree.timings.len(), 4);
    }
}

View File

@@ -1,13 +1,17 @@
use core::schema::*;
use DocId;
use schema::Schema;
use schema::Document;
use schema::Term;
use schema::TextFieldValue;
use core::codec::*;
use core::index::Index;
use core::analyzer::SimpleTokenizer;
use analyzer::SimpleTokenizer;
use core::index::SerializableSegment;
use core::analyzer::StreamingIterator;
use analyzer::StreamingIterator;
use core::index::Segment;
use core::index::SegmentInfo;
use core::postings::PostingsWriter;
use core::fastfield::U32FastFieldsWriter;
use postings::PostingsWriter;
use fastfield::U32FastFieldsWriter;
use std::clone::Clone;
use std::sync::mpsc;
use std::thread;
@@ -158,11 +162,13 @@ impl SegmentWriter {
let field_options = schema.text_field_options(&field_value.field);
if field_options.is_tokenized_indexed() {
let mut tokens = self.tokenizer.tokenize(&field_value.text);
let mut pos = 0u32;
loop {
match tokens.next() {
Some(token) => {
let term = Term::from_field_text(&field_value.field, token);
self.postings_writer.suscribe(doc_id, term);
self.postings_writer.suscribe(doc_id, pos.clone(), term);
pos += 1;
},
None => { break; }
}
@@ -173,7 +179,7 @@ impl SegmentWriter {
let field_options = schema.u32_field_options(&field_value.field);
if field_options.is_indexed() {
let term = Term::from_field_u32(&field_value.field, field_value.value);
self.postings_writer.suscribe(doc_id, term);
self.postings_writer.suscribe(doc_id, 0.clone(), term);
}
}
self.fast_field_writers.add_document(&doc);