This commit is contained in:
Paul Masurel
2016-01-13 00:47:24 +09:00
parent 8657c62ea9
commit f443ec59b3
7 changed files with 86 additions and 23 deletions

View File

@@ -9,3 +9,4 @@ itertools = "0.4.5"
memmap = "0.2.3"
lazy_static = "0.1.*"
regex = "0.1"
fst = "0.1.26"

View File

@@ -81,8 +81,7 @@ impl Segment {
// MemoryPointer
/// Read access to a region of bytes; implemented nearby by
/// `ResidentMemoryPointer` and `MmapMemory`.
//
// NOTE(review): this listing shows removed and added lines of the change side
// by side; `len`/`ptr` look like the accessors that `data` replaces — confirm
// against the committed file.
pub trait MemoryPointer {
/// Length of the region.
fn len(&self) -> usize;
/// Raw pointer to the first byte of the region.
fn ptr(&self) -> *const u8;
/// The region as a borrowed byte slice.
fn data(&self) -> &[u8];
}
/////////////////////////////////////////////////////////
@@ -94,31 +93,25 @@ pub struct ResidentMemoryPointer {
}
impl MemoryPointer for ResidentMemoryPointer {
fn len(&self) -> usize {
self.len
}
fn ptr(&self) -> *const u8 {
&self.data[0]
fn data(&self) -> &[u8] {
self.data.deref()
}
}
/////////////////////////////////////////////////////////
// MmapMemory
//
//
pub struct MmapMemory(Mmap);
impl MemoryPointer for MmapMemory {
fn len(&self) -> usize {
fn data(&self) -> &[u8] {
let &MmapMemory(ref mmap) = self;
mmap.len()
}
fn ptr(&self) -> *const u8 {
let &MmapMemory(ref mmap) = self;
mmap.ptr()
unsafe {
mmap.as_slice()
}
}
}

View File

@@ -1,6 +1,13 @@
use std::io::{BufWriter, Write};
use std::io;
pub type DocId = usize;
// pub trait SeekableIterator<T>: Iterator<T> {
// pub fn seek(&mut self, el: &T) -> bool;
// }
/// Implemented by values that can write themselves out to a byte sink.
pub trait Flushable {
    /// Writes `self` into `writer`.
    ///
    /// Returns a `usize` on success and the underlying `io::Error` on
    /// failure. What the `usize` stands for is decided by each implementor.
    fn flush<W>(&self, writer: &mut W) -> io::Result<usize>
    where
        W: Write;
}

View File

@@ -7,6 +7,18 @@ use core::global::DocId;
// use std::slice;
use std::vec;
/////////////////////////////
pub trait PostingsWriter {
fn suscribe(&mut self, DocId);
}
////////////////////////////////////
pub trait Postings {
type IteratorType: Iterator<Item=DocId>;
fn iter(&self) -> Self::IteratorType;

View File

@@ -6,27 +6,57 @@ use core::directory::Directory;
use core::analyzer::tokenize;
use std::collections::{HashMap, BTreeMap};
use core::DocId;
use core::postings::PostingsWriter;
use core::global::Flushable;
use std::io::{BufWriter, Write};
use std::mem;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
pub struct PostingsWriter {
/// Postings writer that keeps every doc id it is given in a `Vec`.
pub struct SimplePostingsWriter {
/// Doc ids, in the order they were passed to `suscribe`.
doc_ids: Vec<DocId>,
}
impl PostingsWriter {
pub fn new()->PostingsWriter {
PostingsWriter {
impl SimplePostingsWriter {
    /// Builds a writer that does not hold any doc id yet.
    pub fn new() -> SimplePostingsWriter {
        let doc_ids = Vec::new();
        SimplePostingsWriter { doc_ids: doc_ids }
    }
}
pub fn suscribe(&mut self, doc_id: DocId) {
impl PostingsWriter for SimplePostingsWriter {
    /// Appends `doc_id` after the doc ids already stored.
    fn suscribe(&mut self, doc_id: DocId) {
        let recorded = &mut self.doc_ids;
        recorded.push(doc_id);
    }
}
impl Flushable for SimplePostingsWriter {
fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error> {
let num_docs = self.doc_ids.len() as u64;
writer.write_u64::<NativeEndian>(num_docs);
for &doc_id in self.doc_ids.iter() {
writer.write_u64::<NativeEndian>(doc_id as u64);
}
Ok(1)
}
}
/// Keeps one postings writer per distinct term text, together with the
/// lookup from a term's text to the position of its writer.
struct FieldWriter {
// NOTE(review): the next two `postings` lines look like the before/after
// spelling of the same field in this change listing — confirm against the
// committed file.
postings: Vec<PostingsWriter>,
/// Writers, indexed by the `usize` values stored in `term_index`.
postings: Vec<SimplePostingsWriter>,
/// Maps a term's text to the index of its writer in `postings`.
term_index: BTreeMap<String, usize>,
}
//
// impl Flushable for FieldWriter {
// fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error> {
// let num_docs = self.doc_ids.len() as u64;
// writer.write_u64::<NativeEndian>(num_docs);
// for &doc_id in self.doc_ids.iter() {
// writer.write_u64::<NativeEndian>(doc_id as u64);
// }
// Ok(1)
// }
// }
impl FieldWriter {
pub fn new() -> FieldWriter {
@@ -36,7 +66,7 @@ impl FieldWriter {
}
}
pub fn get_postings_writer(&mut self, term_text: &str) -> &mut PostingsWriter {
pub fn get_postings_writer(&mut self, term_text: &str) -> &mut SimplePostingsWriter {
match self.term_index.get(term_text) {
Some(unord_id) => {
return &mut self.postings[*unord_id];
@@ -44,7 +74,7 @@ impl FieldWriter {
None => {}
}
let unord_id = self.term_index.len();
self.postings.push(PostingsWriter::new());
self.postings.push(SimplePostingsWriter::new());
self.term_index.insert(String::from(term_text), unord_id.clone());
&mut self.postings[unord_id]
}

View File

@@ -1,4 +1,5 @@
#[macro_use]
extern crate lazy_static;
extern crate byteorder;
pub mod core;

View File

@@ -1,5 +1,6 @@
extern crate tantivy;
extern crate itertools;
extern crate byteorder;
use tantivy::core::DocId;
use tantivy::core::postings::{VecPostings, intersection};
@@ -9,6 +10,11 @@ use tantivy::core::writer::IndexWriter;
use tantivy::core::directory::Directory;
use tantivy::core::schema::{Field, Document};
use tantivy::core::reader::IndexReader;
use tantivy::core::writer::SimplePostingsWriter;
use tantivy::core::postings::PostingsWriter;
use tantivy::core::global::Flushable;
use std::io::{ BufWriter, Write };
use std::convert::From;
#[test]
fn test_intersection() {
@@ -39,3 +45,16 @@ fn test_indexing() {
let index_reader = IndexReader::open(&directory);
}
}
/// Flushing a writer holding four doc ids must emit five 8-byte words:
/// one for the doc count and one per doc id.
#[test]
fn test_postings_writer() {
    let mut postings_writer = SimplePostingsWriter::new();
    postings_writer.suscribe(1);
    postings_writer.suscribe(4);
    postings_writer.suscribe(5);
    postings_writer.suscribe(17);
    let mut buffer: Vec<u8> = Vec::new();
    assert_eq!(buffer.len(), 0);
    // Check the outcome of the flush instead of dropping it, so a write
    // failure is reported as such rather than only as a length mismatch.
    postings_writer
        .flush(&mut buffer)
        .expect("flushing into an in-memory buffer should succeed");
    assert_eq!(buffer.len(), 5 * 8);
}