Compare commits

...

2 Commits

Author SHA1 Message Date
Paul Masurel
8644d68023 Rename mlt to more_like_this 2021-05-21 16:29:07 +09:00
Paul Masurel
913850b590 Replaced RawDocument by OwnedBytes 2021-05-18 10:47:00 +09:00
8 changed files with 33 additions and 56 deletions

View File

@@ -1,4 +1,5 @@
use super::doc_id_mapping::DocIdMapping;
use crate::error::DataCorruption;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldSerializer;
@@ -927,19 +928,25 @@ impl IndexMerger {
.collect();
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
let store_reader = &mut document_iterators[reader_with_ordinal.ordinal as usize];
let raw_doc = store_reader.next().expect(&format!(
"unexpected missing document in docstore on merge, doc id {:?}",
old_doc_id
))?;
store_writer.store_bytes(raw_doc.get_bytes())?;
let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(&format!(
"unexpected missing document in docstore on merge, doc id {:?}",
old_doc_id
))
.into());
}
}
} else {
for reader in &self.readers {
let store_reader = reader.get_store_reader()?;
if reader.num_deleted_docs() > 0 {
for raw_doc in store_reader.iter_raw(reader.delete_bitset()) {
store_writer.store_bytes(raw_doc?.get_bytes())?;
for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
store_writer.stack(&store_reader)?;

View File

@@ -354,10 +354,8 @@ fn write(
.open_read(SegmentComponent::TempStore)?,
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let raw_doc = store_read.get_raw(*old_doc_id)?;
serializer
.get_store_writer()
.store_bytes(raw_doc.get_bytes())?;
let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
// TODO delete temp store
}

View File

@@ -11,7 +11,7 @@ mod exclude;
mod explanation;
mod fuzzy_query;
mod intersection;
mod mlt;
mod more_like_this;
mod phrase_query;
mod query;
mod query_parser;
@@ -46,7 +46,7 @@ pub use self::explanation::Explanation;
pub(crate) use self::fuzzy_query::DfaWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::mlt::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{Query, QueryClone};
pub use self::query_parser::QueryParser;

View File

@@ -1,5 +1,5 @@
mod mlt;
mod more_like_this;
mod query;
pub use self::mlt::MoreLikeThis;
pub use self::more_like_this::MoreLikeThis;
pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};

View File

@@ -36,7 +36,6 @@ and should rely on either
mod index;
mod reader;
mod writer;
pub use self::reader::RawDocument;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;

View File

@@ -15,7 +15,7 @@ use std::sync::{Arc, Mutex};
const LRU_CACHE_CAPACITY: usize = 100;
type Block = Arc<Vec<u8>>;
type Block = OwnedBytes;
type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;
@@ -74,7 +74,7 @@ impl StoreReader {
let mut decompressed_block = vec![];
decompress(compressed_block.as_slice(), &mut decompressed_block)?;
let block = Arc::new(decompressed_block);
let block = OwnedBytes::new(decompressed_block);
self.cache
.lock()
.unwrap()
@@ -93,9 +93,8 @@ impl StoreReader {
/// It should not be called to score documents
/// for instance.
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
let raw_doc = self.get_raw(doc_id)?;
let mut cursor = raw_doc.get_bytes();
Ok(Document::deserialize(&mut cursor)?)
let mut doc_bytes = self.get_document_bytes(doc_id)?;
Ok(Document::deserialize(&mut doc_bytes)?)
}
/// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end
@@ -106,7 +105,7 @@ impl StoreReader {
/// so accessing docs from the same compressed block should be faster.
/// For that reason a store reader should be kept and reused.
///
pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> {
pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
})?;
@@ -121,11 +120,7 @@ impl StoreReader {
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
let start_pos = cursor_len_before - cursor.len();
let end_pos = cursor_len_before - cursor.len() + doc_length;
Ok(RawDocument {
block,
start_pos,
end_pos,
})
Ok(block.slice(start_pos..end_pos))
}
/// Iterator over all Documents in their order as they are stored in the doc store.
@@ -135,10 +130,9 @@ impl StoreReader {
&'b self,
delete_bitset: Option<&'a DeleteBitSet>,
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
self.iter_raw(delete_bitset).map(|raw_doc| {
let raw_doc = raw_doc?;
let mut cursor = raw_doc.get_bytes();
Ok(Document::deserialize(&mut cursor)?)
self.iter_raw(delete_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
Ok(Document::deserialize(&mut doc_bytes)?)
})
}
@@ -148,7 +142,7 @@ impl StoreReader {
pub(crate) fn iter_raw<'a: 'b, 'b>(
&'b self,
delete_bitset: Option<&'a DeleteBitSet>,
) -> impl Iterator<Item = crate::Result<RawDocument>> + 'b {
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
let last_docid = self
.block_checkpoints()
.last()
@@ -214,13 +208,9 @@ impl StoreReader {
}
};
let end_pos = block_start_pos + doc_length;
let raw_doc = RawDocument {
block,
start_pos: block_start_pos,
end_pos,
};
let doc_bytes = block.slice(block_start_pos..end_pos);
block_start_pos = end_pos;
Ok(raw_doc)
Ok(doc_bytes)
})
}
@@ -230,23 +220,6 @@ impl StoreReader {
}
}
/// Get the bytes of a serialized `Document` in a decompressed block.
pub struct RawDocument {
/// the block of data containing multiple documents
block: Arc<Vec<u8>>,
/// start position of the document in the block
start_pos: usize,
/// end position of the document in the block
end_pos: usize,
}
impl RawDocument {
/// Get the bytes of a serialized `Document` in a decompressed block.
pub fn get_bytes(&self) -> &[u8] {
&self.block[self.start_pos..self.end_pos]
}
}
fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> {
let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>());
let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;