bugfix
@@ -10,6 +10,7 @@ use core::schema::Term;
use core::schema::DocId;
use core::store::StoreWriter;
use std::fs::File;
+use core::serialize::BinarySerializable;
use fst;
use core::simdcompression;
use std::convert::From;
@@ -49,18 +50,16 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
        self.term_fst_builder.insert(term.as_slice(), self.written_bytes_postings as u64);
        self.cur_term_num_docs = doc_freq;
        // writing the size of the posting list
-       try!(self.postings_write.write_u32::<BigEndian>(doc_freq));
-       self.written_bytes_postings += 4;
+       self.written_bytes_postings += try!((doc_freq as u32).serialize(&mut self.postings_write));
        Ok(())
    }

    fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<(), IOError> {
        // TODO write_all transmuted [u8]
        let docs_data = self.encoder.encode(doc_ids);
-       try!(self.postings_write.write_u32::<BigEndian>(docs_data.len() as u32));
-       self.written_bytes_postings += 4;
+       self.written_bytes_postings += try!((docs_data.len() as u32).serialize(&mut self.postings_write));
        for num in docs_data {
-           try!(self.postings_write.write_u32::<BigEndian>(num.clone() as u32));
+           self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
        }
        Ok(())
    }
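
The hunk above swaps byteorder's write_u32::<BigEndian> plus a hard-coded `+= 4` for a single call to serialize, which itself reports how many bytes it wrote. Below is a minimal sketch of that pattern, not tantivy's actual trait definition; the trait name, signature, and the small main() are assumptions made only for illustration.

use std::io::{self, Write};

// Sketch: a serialize() that returns the number of bytes written, so the
// caller can accumulate an offset instead of hard-coding sizes like `+= 4`.
trait BinarySerializable {
    fn serialize(&self, writer: &mut dyn Write) -> io::Result<usize>;
}

impl BinarySerializable for u32 {
    fn serialize(&self, writer: &mut dyn Write) -> io::Result<usize> {
        writer.write_all(&self.to_be_bytes())?;
        Ok(4)
    }
}

fn main() -> io::Result<()> {
    let mut buf: Vec<u8> = Vec::new();
    let mut written_bytes = 0usize;
    written_bytes += 42u32.serialize(&mut buf)?; // replaces write_u32 + `+= 4`
    assert_eq!(written_bytes, 4);
    Ok(())
}
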
@@ -6,7 +6,6 @@ pub mod analyzer;
pub mod serial;
pub mod reader;
pub mod codec;
pub mod error;
pub mod searcher;
pub mod collector;
pub mod skip;
@@ -5,6 +5,7 @@ use core::schema::Document;
use fst;
use core::postings::IntersectionPostings;
use byteorder::{BigEndian, ReadBytesExt};
+use core::serialize::BinarySerializable;
use std::io::Cursor;
use core::schema::DocId;
use core::directory::SegmentComponent;
@@ -39,14 +40,14 @@ impl SegmentPostings {

    pub fn from_data(data: &[u8]) -> SegmentPostings {
        let mut cursor = Cursor::new(data);
-       let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
+       let doc_freq: u32 = u32::deserialize(&mut cursor).unwrap();
        let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize;
        // TODO remove allocs
        let mut data = Vec::with_capacity(data_size);
        for _ in 0..data_size {
            data.push(cursor.read_u32::<BigEndian>().unwrap());
        }
-       let mut doc_ids: Vec<u32> = (0..doc_freq as u32 ).collect();
+       let mut doc_ids: Vec<u32> = (0..doc_freq as u32).collect();
        let decoder = Decoder::new();
        decoder.decode(&data, &mut doc_ids);
        SegmentPostings {
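
On the read side, from_data now pulls doc_freq back with u32::deserialize(&mut cursor) instead of read_u32::<BigEndian>. The sketch below spells out the framing this function appears to parse (a u32 doc_freq, a u32 block length, then that many big-endian u32 words); the helper name is made up for illustration and is not part of the commit.

use std::io::{Cursor, Read};

// Sketch of the posting-data layout from_data() appears to expect:
// [doc_freq: u32][data_size: u32][data_size x u32 words], all big-endian.
fn parse_postings(data: &[u8]) -> (u32, Vec<u32>) {
    let mut cursor = Cursor::new(data);
    let mut word = [0u8; 4];
    cursor.read_exact(&mut word).unwrap();
    let doc_freq = u32::from_be_bytes(word);
    cursor.read_exact(&mut word).unwrap();
    let data_size = u32::from_be_bytes(word) as usize;
    let mut block = Vec::with_capacity(data_size);
    for _ in 0..data_size {
        cursor.read_exact(&mut word).unwrap();
        block.push(u32::from_be_bytes(word));
    }
    (doc_freq, block)
}
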
@@ -135,7 +136,6 @@ impl SegmentReader {
                    segment_postings.push(segment_posting);
                }
                None => {
-                   println!("not found {:?}", term);
                    segment_postings.clear();
                    segment_postings.push(SegmentPostings::empty());
                    break;
@@ -70,13 +70,15 @@ impl Decoder {
#[test]
fn test_encode_big() {
    let mut encoder = Encoder::new();
-   let input: Vec<u32> = (0..10000).into_iter().collect();
-   let data = encoder.encode(&input);
-   assert_eq!(data.len(), 962);
+   let num_ints = 10000 as usize;
+   let expected_length = 1274;
+   let input: Vec<u32> = (0..num_ints as u32)
+       .map(|i| i * 7 / 2)
+       .into_iter().collect();
+   let encoded_data = encoder.encode(&input);
+   assert_eq!(encoded_data.len(), expected_length);
    let decoder = Decoder::new();
-   let mut data_output: Vec<u32> = (0..10000).collect();
-   assert_eq!(10000, decoder.decode(&data[0..962], &mut data_output));
-   for i in 0..10000 {
-       assert_eq!(data_output[i], input[i]) ;
-   }
+   let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
+   assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
+   assert_eq!(decoded_data, input);
}
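
The expected compressed size grows from 962 to 1274 bytes because the new input i * 7 / 2 climbs in larger steps than the old 0..10000. Assuming the encoder delta-encodes the sorted ids before bit-packing (an assumption about simdcompression, not something stated in this diff), the deltas go from a constant 1 to an alternating 3/4, so each id needs more bits. A tiny sketch of that arithmetic:

// Compare the deltas of the old and new test inputs; with delta encoding +
// bit packing (assumed behaviour), larger deltas mean more output bytes.
fn max_delta(values: &[u32]) -> u32 {
    values.windows(2).map(|w| w[1] - w[0]).max().unwrap_or(0)
}

fn main() {
    let old_input: Vec<u32> = (0..10_000).collect();                       // deltas: all 1
    let new_input: Vec<u32> = (0..10_000u32).map(|i| i * 7 / 2).collect(); // deltas: 3, 4, 3, 4, ...
    println!("old max delta = {}", max_delta(&old_input)); // 1 -> roughly 1 bit per doc id
    println!("new max delta = {}", max_delta(&new_input)); // 4 -> roughly 3 bits per doc id
}
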
@@ -4,7 +4,6 @@ use std::cell::RefCell;
use core::schema::DocId;
use core::schema::Document;
use core::schema::FieldValue;
use core::error;
use core::serialize::BinarySerializable;
use std::io::Write;
use std::io::Read;
@@ -193,7 +192,7 @@ mod tests {
    let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
    {
        let mut store_writer = StoreWriter::new(store_file.reopen().unwrap());
-       for i in 0..10000 {
+       for i in 0..1000 {
            let mut fields: Vec<FieldValue> = Vec::new();
            {
                let field_value = FieldValue {

@@ -219,7 +218,7 @@ mod tests {
    let store_mmap = MmapReadOnly::open(&store_file).unwrap();
    let store = StoreReader::new(store_mmap);
    assert_eq!(offsets, store.offsets);
-   for i in 0..1000 {
+   for i in (0..10).map(|i| i * 3 / 2) {
        assert_eq!(*store.get(&i).get_one(&field_title).unwrap(), format!("Doc {}", i));
    }
}
@@ -12,36 +12,35 @@ use regex::Regex;


-pub struct TestCollector {
-    docs: Vec<DocAddress>,
-    current_segment: Option<SegmentId>,
+// only make sense for a single segment
+struct TestCollector {
+    docs: Vec<DocId>,
}

impl TestCollector {
    pub fn new() -> TestCollector {
        TestCollector {
            docs: Vec::new(),
-           current_segment: None,
        }
    }

-   pub fn docs(self,) -> Vec<DocAddress> {
+   pub fn docs(self,) -> Vec<DocId> {
        self.docs
    }
}

impl Collector for TestCollector {

-   fn set_segment(&mut self, segment: &SegmentReader) {
-       self.current_segment = Some(segment.id());
-   }
+   fn set_segment(&mut self, segment: &SegmentReader) {}

    fn collect(&mut self, doc_id: DocId) {
-       self.docs.push(DocAddress(self.current_segment.clone().unwrap(), doc_id));
+       self.docs.push(doc_id);
    }
}


#[test]
fn test_indexing() {
    let mut schema = Schema::new();
@@ -111,16 +110,43 @@ fn test_searcher() {
        let segment = commit_result.unwrap();
    }
    {

        let searcher = Searcher::for_directory(directory);
-       let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), );
-       let mut collector = TestCollector::new();
-       searcher.search(&terms, &mut collector);
-       let vals: Vec<DocId> = collector
-           .docs()
-           .iter()
-           .map(|doc| doc.1)
-           .collect::<Vec<DocId>>();
-       assert_eq!(vals, [1, 2]);
+       let get_doc_ids = |terms: Vec<Term>| {
+           let mut collector = TestCollector::new();
+           searcher.search(&terms, &mut collector);
+           collector.docs()
+       };
+       {
+           assert_eq!(
+               get_doc_ids(vec!(Term::from_field_text(&text_field, "a"))),
+               vec!(1, 2));
+       }
+       {
+           assert_eq!(
+               get_doc_ids(vec!(Term::from_field_text(&text_field, "af"))),
+               vec!(0));
+       }
+       {
+           assert_eq!(
+               get_doc_ids(vec!(Term::from_field_text(&text_field, "b"))),
+               vec!(0, 1, 2));
+       }
+       {
+           assert_eq!(
+               get_doc_ids(vec!(Term::from_field_text(&text_field, "c"))),
+               vec!(1, 2));
+       }
+       {
+           assert_eq!(
+               get_doc_ids(vec!(Term::from_field_text(&text_field, "d"))),
+               vec!(2));
+       }
+       {
+           assert_eq!(
+               get_doc_ids(vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), )),
+               vec!(1, 2));
+       }
    }
}