diff --git a/Cargo.toml b/Cargo.toml index 6411054f7..ce2fe204b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,3 +9,4 @@ itertools = "0.4.5" memmap = "0.2.3" lazy_static = "0.1.*" regex = "0.1" +fst = "0.1.26" diff --git a/src/core/directory.rs b/src/core/directory.rs index 3caa90fa1..14f2e9733 100644 --- a/src/core/directory.rs +++ b/src/core/directory.rs @@ -81,8 +81,7 @@ impl Segment { // MemoryPointer pub trait MemoryPointer { - fn len(&self) -> usize; - fn ptr(&self) -> *const u8; + fn data(&self) -> &[u8]; } ///////////////////////////////////////////////////////// @@ -94,31 +93,25 @@ pub struct ResidentMemoryPointer { } impl MemoryPointer for ResidentMemoryPointer { - fn len(&self) -> usize { - self.len - } - fn ptr(&self) -> *const u8 { - &self.data[0] + fn data(&self) -> &[u8] { + self.data.deref() } } ///////////////////////////////////////////////////////// +// MmapMemory // -// - pub struct MmapMemory(Mmap); impl MemoryPointer for MmapMemory { - fn len(&self) -> usize { + fn data(&self) -> &[u8] { let &MmapMemory(ref mmap) = self; - mmap.len() - } - fn ptr(&self) -> *const u8 { - let &MmapMemory(ref mmap) = self; - mmap.ptr() + unsafe { + mmap.as_slice() + } } } diff --git a/src/core/global.rs b/src/core/global.rs index c3f986d63..ca66384e5 100644 --- a/src/core/global.rs +++ b/src/core/global.rs @@ -1,6 +1,13 @@ +use std::io::{BufWriter, Write}; +use std::io; pub type DocId = usize; // pub trait SeekableIterator: Iterator { // pub fn seek(&mut self, el: &T) -> bool; // } + + +pub trait Flushable { + fn flush(&self, writer: &mut W) -> Result; +} diff --git a/src/core/postings.rs b/src/core/postings.rs index b97a90064..72cb134cd 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -7,6 +7,18 @@ use core::global::DocId; // use std::slice; use std::vec; + +///////////////////////////// + + +pub trait PostingsWriter { + fn suscribe(&mut self, DocId); +} + + +//////////////////////////////////// + + pub trait Postings { type IteratorType: Iterator; fn iter(&self) -> Self::IteratorType; diff --git a/src/core/writer.rs b/src/core/writer.rs index a5e4ddca7..b711d9ea0 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -6,27 +6,57 @@ use core::directory::Directory; use core::analyzer::tokenize; use std::collections::{HashMap, BTreeMap}; use core::DocId; +use core::postings::PostingsWriter; +use core::global::Flushable; +use std::io::{BufWriter, Write}; +use std::mem; +use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; -pub struct PostingsWriter { + +pub struct SimplePostingsWriter { doc_ids: Vec, } -impl PostingsWriter { - pub fn new()->PostingsWriter { - PostingsWriter { +impl SimplePostingsWriter { + pub fn new() -> SimplePostingsWriter { + SimplePostingsWriter { doc_ids: Vec::new(), } } +} - pub fn suscribe(&mut self, doc_id: DocId) { +impl PostingsWriter for SimplePostingsWriter { + fn suscribe(&mut self, doc_id: DocId) { self.doc_ids.push(doc_id); } } +impl Flushable for SimplePostingsWriter { + fn flush(&self, writer: &mut W) -> Result { + let num_docs = self.doc_ids.len() as u64; + writer.write_u64::(num_docs); + for &doc_id in self.doc_ids.iter() { + writer.write_u64::(doc_id as u64); + } + Ok(1) + } +} + struct FieldWriter { - postings: Vec, + postings: Vec, term_index: BTreeMap, } +// +// impl Flushable for FieldWriter { +// fn flush(&self, writer: &mut W) -> Result { +// let num_docs = self.doc_ids.len() as u64; +// writer.write_u64::(num_docs); +// for &doc_id in self.doc_ids.iter() { +// writer.write_u64::(doc_id as u64); +// } +// Ok(1) +// } +// } impl FieldWriter { pub fn new() -> FieldWriter { @@ -36,7 +66,7 @@ impl FieldWriter { } } - pub fn get_postings_writer(&mut self, term_text: &str) -> &mut PostingsWriter { + pub fn get_postings_writer(&mut self, term_text: &str) -> &mut SimplePostingsWriter { match self.term_index.get(term_text) { Some(unord_id) => { return &mut self.postings[*unord_id]; @@ -44,7 +74,7 @@ impl FieldWriter { None => {} } let unord_id = self.term_index.len(); - self.postings.push(PostingsWriter::new()); + self.postings.push(SimplePostingsWriter::new()); self.term_index.insert(String::from(term_text), unord_id.clone()); &mut self.postings[unord_id] } diff --git a/src/lib.rs b/src/lib.rs index d6f4668ce..3f535347f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #[macro_use] extern crate lazy_static; +extern crate byteorder; pub mod core; diff --git a/tests/core.rs b/tests/core.rs index 6e16a0ace..2f422a483 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -1,5 +1,6 @@ extern crate tantivy; extern crate itertools; +extern crate byteorder; use tantivy::core::DocId; use tantivy::core::postings::{VecPostings, intersection}; @@ -9,6 +10,11 @@ use tantivy::core::writer::IndexWriter; use tantivy::core::directory::Directory; use tantivy::core::schema::{Field, Document}; use tantivy::core::reader::IndexReader; +use tantivy::core::writer::SimplePostingsWriter; +use tantivy::core::postings::PostingsWriter; +use tantivy::core::global::Flushable; +use std::io::{ BufWriter, Write }; +use std::convert::From; #[test] fn test_intersection() { @@ -39,3 +45,16 @@ fn test_indexing() { let index_reader = IndexReader::open(&directory); } } + +#[test] +fn test_postings_writer() { + let mut postings_writer = SimplePostingsWriter::new(); + postings_writer.suscribe(1); + postings_writer.suscribe(4); + postings_writer.suscribe(5); + postings_writer.suscribe(17); + let mut buffer: Vec = Vec::new(); + assert_eq!(buffer.len(), 0); + postings_writer.flush(&mut buffer); + assert_eq!(buffer.len(), 5 * 8); +}