mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
beeeee
This commit is contained in:
@@ -17,3 +17,5 @@ rustc-serialize = "0.3.16"
|
||||
log = "0.3.5"
|
||||
combine = "1.2.0"
|
||||
tempdir = "0.3.4"
|
||||
bincode = "0.4.0"
|
||||
serde = "0.6.11"
|
||||
|
||||
@@ -3,7 +3,7 @@ use core::serial::*;
|
||||
use std::io::Write;
|
||||
use fst::MapBuilder;
|
||||
use core::error::*;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use core::directory::Segment;
|
||||
use core::directory::SegmentComponent;
|
||||
use core::reader::*;
|
||||
@@ -13,6 +13,9 @@ use std::fs::File;
|
||||
|
||||
pub struct SimpleCodec;
|
||||
|
||||
|
||||
// TODO should we vint?
|
||||
|
||||
pub struct SimpleSegmentSerializer {
|
||||
written_bytes_postings: usize,
|
||||
postings_write: File,
|
||||
@@ -25,7 +28,7 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
|
||||
self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
|
||||
self.cur_term_num_docs = doc_freq;
|
||||
// writing the size of the posting list
|
||||
match self.postings_write.write_u32::<LittleEndian>(doc_freq) {
|
||||
match self.postings_write.write_u32::<BigEndian>(doc_freq) {
|
||||
Ok(_) => {},
|
||||
Err(_) => {
|
||||
let msg = String::from("Failed writing posting list length");
|
||||
@@ -37,7 +40,7 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
|
||||
}
|
||||
|
||||
fn add_doc(&mut self, doc_id: DocId) -> Result<()> {
|
||||
match self.postings_write.write_u32::<LittleEndian>(doc_id as u32) {
|
||||
match self.postings_write.write_u32::<BigEndian>(doc_id as u32) {
|
||||
Ok(_) => {},
|
||||
Err(_) => {
|
||||
let msg = String::from("Failed while writing posting list");
|
||||
|
||||
@@ -9,7 +9,7 @@ pub trait Collector {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocAddress(SegmentId, DocId);
|
||||
pub struct DocAddress(pub SegmentId, pub DocId);
|
||||
|
||||
pub struct TestCollector {
|
||||
docs: Vec<DocAddress>,
|
||||
@@ -33,7 +33,6 @@ impl TestCollector {
|
||||
impl Collector for TestCollector {
|
||||
|
||||
fn set_segment(&mut self, segment: &SegmentReader) {
|
||||
println!("eee");
|
||||
self.current_segment = Some(segment.id());
|
||||
}
|
||||
|
||||
|
||||
@@ -11,4 +11,5 @@ pub mod codec;
|
||||
pub mod error;
|
||||
pub mod searcher;
|
||||
pub mod collector;
|
||||
pub mod skip;
|
||||
pub use core::global::DocId;
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::io;
|
||||
use core::postings::IntersectionPostings;
|
||||
use fst::raw::Fst;
|
||||
use std::cmp::{Eq,PartialEq,Ord,PartialOrd,Ordering};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use std::borrow::Borrow;
|
||||
use std::io::Cursor;
|
||||
use core::global::DocId;
|
||||
@@ -35,7 +35,7 @@ pub struct SegmentPostings<'a> {
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
pub fn from_data(data: &[u8]) -> SegmentPostings {
|
||||
let mut cursor = Cursor::new(data);
|
||||
let doc_freq = cursor.read_u32::<LittleEndian>().unwrap() as usize;
|
||||
let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
|
||||
SegmentPostings {
|
||||
cursor: cursor,
|
||||
num_docs_remaining: doc_freq,
|
||||
@@ -71,7 +71,7 @@ impl<'a> Iterator for SegmentPostings<'a> {
|
||||
}
|
||||
else {
|
||||
self.num_docs_remaining -= 1;
|
||||
Some(self.cursor.read_u32::<LittleEndian>().unwrap() as DocId)
|
||||
Some(self.cursor.read_u32::<BigEndian>().unwrap() as DocId)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -129,7 +129,7 @@ impl SegmentReader {
|
||||
|
||||
fn write_postings<R: io::Read, Output, SegSer: SegmentSerializer<Output>>(mut cursor: R, num_docs: DocId, serializer: &mut SegSer) -> Result<()> {
|
||||
for i in 0..num_docs {
|
||||
let doc_id = cursor.read_u32::<LittleEndian>().unwrap();
|
||||
let doc_id = cursor.read_u32::<BigEndian>().unwrap();
|
||||
try!(serializer.add_doc(doc_id));
|
||||
}
|
||||
Ok(())
|
||||
@@ -146,7 +146,7 @@ impl SerializableSegment for SegmentReader {
|
||||
let offset = offset_u64 as usize;
|
||||
let data = unsafe { &self.postings_data.as_slice()[offset..] };
|
||||
let mut cursor = Cursor::new(data);
|
||||
let num_docs = cursor.read_u32::<LittleEndian>().unwrap() as DocId;
|
||||
let num_docs = cursor.read_u32::<BigEndian>().unwrap() as DocId;
|
||||
try!(serializer.new_term(&term, num_docs));
|
||||
try!(write_postings(cursor, num_docs, &mut serializer));
|
||||
},
|
||||
|
||||
@@ -20,7 +20,6 @@ impl Searcher {
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl Searcher {
|
||||
|
||||
pub fn search(&self, terms: &Vec<Term>, collector: &mut Collector) {
|
||||
@@ -30,7 +29,6 @@ impl Searcher {
|
||||
for doc_id in postings {
|
||||
collector.collect(doc_id);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
130
src/core/skip.rs
Normal file
130
src/core/skip.rs
Normal file
@@ -0,0 +1,130 @@
|
||||
use std::io::Write;
|
||||
use std::io::BufWriter;
|
||||
use core::DocId;
|
||||
use std::ops::DerefMut;
|
||||
use serde::Serialize;
|
||||
use serde;
|
||||
use bincode;
|
||||
use byteorder;
|
||||
use core::error;
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
|
||||
|
||||
|
||||
// writer
|
||||
|
||||
/// In-memory accumulator for the serialized entries of one skip-list layer.
struct LayerBuilder {
    /// Number of insertions between two skip pointers reported by this layer.
    period: usize,
    /// Serialized entries written into this layer so far.
    buffer: Vec<u8>,
    /// Insertions left before the next skip pointer is reported.
    remaining: usize,
    /// Total number of entries inserted into this layer.
    len: usize,
}
|
||||
|
||||
impl LayerBuilder {
|
||||
|
||||
fn written_size(&self,) -> usize {
|
||||
self.buffer.len()
|
||||
}
|
||||
|
||||
fn write(&self, output: &mut Write) -> Result<(), byteorder::Error> {
|
||||
try!(output.write_u32::<BigEndian>(self.len() as u32));
|
||||
try!(output.write_u32::<BigEndian>(self.buffer.len() as u32));
|
||||
try!(output.write_all(&self.buffer));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn len(&self,) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
fn with_period(period: usize) -> LayerBuilder {
|
||||
LayerBuilder {
|
||||
period: period,
|
||||
buffer: Vec::new(),
|
||||
remaining: period,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn insert<S: Serialize>(&mut self, doc_id: DocId, dest: S) -> InsertResult {
|
||||
self.remaining -= 1;
|
||||
self.len += 1;
|
||||
if self.remaining == 0 {
|
||||
let offset = self.written_size();
|
||||
dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
|
||||
self.remaining = self.period;
|
||||
InsertResult::SkipPointer(offset)
|
||||
}
|
||||
else {
|
||||
doc_id.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
|
||||
dest.serialize(&mut bincode::serde::Serializer::new(&mut self.buffer));
|
||||
InsertResult::NoNeedForSkip
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct SkipListBuilder {
|
||||
period: usize,
|
||||
layers: Vec<LayerBuilder>,
|
||||
}
|
||||
|
||||
|
||||
/// Outcome of inserting an entry into a single layer.
enum InsertResult {
    /// The insertion completed a period; carries the byte offset, within the
    /// layer's buffer, at which the entry was written.
    SkipPointer(usize),
    /// The insertion did not complete a period; nothing to propagate.
    NoNeedForSkip,
}
|
||||
|
||||
impl SkipListBuilder {
|
||||
|
||||
pub fn new(period: usize) -> SkipListBuilder {
|
||||
SkipListBuilder {
|
||||
period: period,
|
||||
layers: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn get_layer<'a>(&'a mut self, layer_id: usize) -> &mut LayerBuilder {
|
||||
if layer_id == self.layers.len() {
|
||||
let layer_builder = LayerBuilder::with_period(self.period);
|
||||
self.layers.push(layer_builder);
|
||||
}
|
||||
&mut self.layers[layer_id]
|
||||
}
|
||||
|
||||
pub fn insert<S: Serialize>(&mut self, doc_id: DocId, dest: S) {
|
||||
let mut layer_id = 0;
|
||||
match self.get_layer(0).insert(doc_id, dest) {
|
||||
InsertResult::SkipPointer(mut offset) => {
|
||||
loop {
|
||||
layer_id += 1;
|
||||
let skip_result = self.get_layer(layer_id)
|
||||
.insert(doc_id, offset);
|
||||
match skip_result {
|
||||
InsertResult::SkipPointer(next_offset) => {
|
||||
offset = next_offset;
|
||||
},
|
||||
InsertResult::NoNeedForSkip => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
InsertResult::NoNeedForSkip => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<W: Write>(self, output: &mut Write) -> error::Result<()> {
|
||||
output.write_u8(self.layers.len() as u8);
|
||||
for layer in self.layers.iter() {
|
||||
match layer.write(output) {
|
||||
Ok(())=> {},
|
||||
Err(someerr)=> { return Err(error::Error::WriteError(format!("Could not write skiplist {:?}", someerr) )) }
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -15,5 +15,7 @@ extern crate rustc_serialize;
|
||||
extern crate combine;
|
||||
extern crate atomicwrites;
|
||||
extern crate tempdir;
|
||||
extern crate bincode;
|
||||
extern crate serde;
|
||||
|
||||
pub mod core;
|
||||
|
||||
@@ -118,20 +118,11 @@ fn test_searcher() {
|
||||
let terms = vec!(Term::from_field_text(Field(1), "a"), Term::from_field_text(Field(1), "b"), );
|
||||
let mut collector = TestCollector::new();
|
||||
searcher.search(&terms, &mut collector);
|
||||
let vals = format!("{:?}", collector.docs());
|
||||
println!("{}",vals);
|
||||
assert_eq!(vals, "");
|
||||
let vals: Vec<DocId> = collector.docs().iter()
|
||||
.map(|doc| doc.1)
|
||||
.collect::<Vec<DocId>>();
|
||||
assert_eq!(vals, [1, 2]);
|
||||
}
|
||||
|
||||
//
|
||||
// let debug_serializer = DebugSegmentSerializer::new();
|
||||
// let segment_str_before_writing = DebugSegmentSerializer::debug_string(index_writer.current_segment_writer());
|
||||
// let commit_result = index_writer.commit();
|
||||
// assert!(commit_result.is_ok());
|
||||
// let segment = commit_result.unwrap();
|
||||
// let segment_reader = SegmentReader::open(segment).unwrap();
|
||||
// let segment_str_after_reading = DebugSegmentSerializer::debug_string(&segment_reader);
|
||||
// assert_eq!(segment_str_before_writing, segment_str_after_reading);
|
||||
}
|
||||
|
||||
|
||||
|
||||
37
tests/skip.rs
Normal file
37
tests/skip.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
extern crate tantivy;
|
||||
|
||||
use std::io::Write;
|
||||
use tantivy::core::skip::SkipListBuilder;
|
||||
|
||||
/// Checks the number of layers and the total serialized size produced by
/// `SkipListBuilder` for three small inputs.
#[test]
fn test_skip_list_builder() {
    {
        // A single insertion with a period of 10 stays in one layer.
        let mut output: Vec<u8> = Vec::new();
        let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(10);
        skip_list_builder.insert(2, 3);
        // The write must succeed before the size checks mean anything.
        assert!(skip_list_builder.write::<Vec<u8>>(&mut output).is_ok());
        assert_eq!(output.len(), 17);
        // First byte is the number of layers.
        assert_eq!(output[0], 1);
    }
    {
        // Nine insertions with a period of 3 fill three layers.
        let mut output: Vec<u8> = Vec::new();
        let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3);
        for i in 0..9 {
            skip_list_builder.insert(i, i);
        }
        assert!(skip_list_builder.write::<Vec<u8>>(&mut output).is_ok());
        assert_eq!(output.len(), 129);
        assert_eq!(output[0], 3);
    }
    {
        // checking that void gets serialized to nothing.
        let mut output: Vec<u8> = Vec::new();
        let mut skip_list_builder: SkipListBuilder = SkipListBuilder::new(3);
        for i in 0..9 {
            skip_list_builder.insert(i, ());
        }
        assert!(skip_list_builder.write::<Vec<u8>>(&mut output).is_ok());
        assert_eq!(output.len(), 93);
        assert_eq!(output[0], 3);
    }
}
|
||||
Reference in New Issue
Block a user