mirror of https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00

Compare commits: sparse_cod...owned-byte

2 commits:

- 8644d68023
- 913850b590
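The compared commits make two changes. First, the doc store's `RawDocument` wrapper (a shared decompressed block plus start/end offsets) is replaced by `OwnedBytes` slices: `StoreReader::get_raw` becomes `get_document_bytes` and returns the document's bytes directly, `iter_raw` yields `OwnedBytes` instead of `RawDocument`, and the `RawDocument` struct is deleted. Second, the `mlt` query module is renamed to `more_like_this`.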
@@ -1,4 +1,5 @@
 use super::doc_id_mapping::DocIdMapping;
+use crate::error::DataCorruption;
 use crate::fastfield::DeleteBitSet;
 use crate::fastfield::FastFieldReader;
 use crate::fastfield::FastFieldSerializer;
@@ -927,19 +928,25 @@ impl IndexMerger {
             .collect();
         if let Some(doc_id_mapping) = doc_id_mapping {
             for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
-                let store_reader = &mut document_iterators[reader_with_ordinal.ordinal as usize];
-                let raw_doc = store_reader.next().expect(&format!(
-                    "unexpected missing document in docstore on merge, doc id {:?}",
-                    old_doc_id
-                ))?;
-                store_writer.store_bytes(raw_doc.get_bytes())?;
+                let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
+                if let Some(doc_bytes_res) = doc_bytes_it.next() {
+                    let doc_bytes = doc_bytes_res?;
+                    store_writer.store_bytes(&doc_bytes)?;
+                } else {
+                    return Err(DataCorruption::comment_only(&format!(
+                        "unexpected missing document in docstore on merge, doc id {:?}",
+                        old_doc_id
+                    ))
+                    .into());
+                }
             }
         } else {
             for reader in &self.readers {
                 let store_reader = reader.get_store_reader()?;
                 if reader.num_deleted_docs() > 0 {
-                    for raw_doc in store_reader.iter_raw(reader.delete_bitset()) {
-                        store_writer.store_bytes(raw_doc?.get_bytes())?;
+                    for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
+                        let doc_bytes = doc_bytes_res?;
+                        store_writer.store_bytes(&doc_bytes)?;
                     }
                 } else {
                     store_writer.stack(&store_reader)?;
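Besides switching the merge loop to `OwnedBytes`, this hunk changes error handling: a document missing from the doc store during a merge now surfaces as a `DataCorruption` error (hence the new `use crate::error::DataCorruption;` import above) instead of panicking through `expect`.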
@@ -354,10 +354,8 @@ fn write(
             .open_read(SegmentComponent::TempStore)?,
     )?;
     for old_doc_id in doc_id_map.iter_old_doc_ids() {
-        let raw_doc = store_read.get_raw(*old_doc_id)?;
-        serializer
-            .get_store_writer()
-            .store_bytes(raw_doc.get_bytes())?;
+        let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
+        serializer.get_store_writer().store_bytes(&doc_bytes)?;
     }
     // TODO delete temp store
 }
@@ -11,7 +11,7 @@ mod exclude;
 mod explanation;
 mod fuzzy_query;
 mod intersection;
-mod mlt;
+mod more_like_this;
 mod phrase_query;
 mod query;
 mod query_parser;
@@ -46,7 +46,7 @@ pub use self::explanation::Explanation;
 pub(crate) use self::fuzzy_query::DfaWrapper;
 pub use self::fuzzy_query::FuzzyTermQuery;
 pub use self::intersection::intersect_scorers;
-pub use self::mlt::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
+pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
 pub use self::phrase_query::PhraseQuery;
 pub use self::query::{Query, QueryClone};
 pub use self::query_parser::QueryParser;
@@ -1,5 +1,5 @@
-mod mlt;
+mod more_like_this;
 mod query;

-pub use self::mlt::MoreLikeThis;
+pub use self::more_like_this::MoreLikeThis;
 pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
@@ -36,7 +36,6 @@ and should rely on either
 mod index;
 mod reader;
 mod writer;
-pub use self::reader::RawDocument;
 pub use self::reader::StoreReader;
 pub use self::writer::StoreWriter;

@@ -15,7 +15,7 @@ use std::sync::{Arc, Mutex};

 const LRU_CACHE_CAPACITY: usize = 100;

-type Block = Arc<Vec<u8>>;
+type Block = OwnedBytes;

 type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;
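With `Block = OwnedBytes`, the LRU cache (`BlockCache`) hands out what appears to be a cheaply cloneable, reference-counted handle over the decompressed buffer, instead of an `Arc<Vec<u8>>` that callers then had to pair with start/end offsets.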
@@ -74,7 +74,7 @@ impl StoreReader {
         let mut decompressed_block = vec![];
         decompress(compressed_block.as_slice(), &mut decompressed_block)?;

-        let block = Arc::new(decompressed_block);
+        let block = OwnedBytes::new(decompressed_block);
         self.cache
             .lock()
             .unwrap()
@@ -93,9 +93,8 @@ impl StoreReader {
     /// It should not be called to score documents
     /// for instance.
     pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
-        let raw_doc = self.get_raw(doc_id)?;
-        let mut cursor = raw_doc.get_bytes();
-        Ok(Document::deserialize(&mut cursor)?)
+        let mut doc_bytes = self.get_document_bytes(doc_id)?;
+        Ok(Document::deserialize(&mut doc_bytes)?)
     }

     /// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end
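The old code had to materialize a `&[u8]` cursor via `raw_doc.get_bytes()`; the new code passes `&mut doc_bytes` straight to `Document::deserialize`, which relies on `OwnedBytes` implementing `io::Read` and advancing through its bytes as they are consumed.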
@@ -106,7 +105,7 @@ impl StoreReader {
     /// so accessing docs from the same compressed block should be faster.
     /// For that reason a store reader should be kept and reused.
     ///
-    pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> {
+    pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
         let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
             crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
         })?;
@@ -121,11 +120,7 @@ impl StoreReader {
         let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
         let start_pos = cursor_len_before - cursor.len();
         let end_pos = cursor_len_before - cursor.len() + doc_length;
-        Ok(RawDocument {
-            block,
-            start_pos,
-            end_pos,
-        })
+        Ok(block.slice(start_pos..end_pos))
     }

     /// Iterator over all Documents in their order as they are stored in the doc store.
@@ -135,10 +130,9 @@ impl StoreReader {
         &'b self,
         delete_bitset: Option<&'a DeleteBitSet>,
     ) -> impl Iterator<Item = crate::Result<Document>> + 'b {
-        self.iter_raw(delete_bitset).map(|raw_doc| {
-            let raw_doc = raw_doc?;
-            let mut cursor = raw_doc.get_bytes();
-            Ok(Document::deserialize(&mut cursor)?)
+        self.iter_raw(delete_bitset).map(|doc_bytes_res| {
+            let mut doc_bytes = doc_bytes_res?;
+            Ok(Document::deserialize(&mut doc_bytes)?)
         })
     }

@@ -148,7 +142,7 @@ impl StoreReader {
     pub(crate) fn iter_raw<'a: 'b, 'b>(
         &'b self,
         delete_bitset: Option<&'a DeleteBitSet>,
-    ) -> impl Iterator<Item = crate::Result<RawDocument>> + 'b {
+    ) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
         let last_docid = self
             .block_checkpoints()
             .last()
@@ -214,13 +208,9 @@ impl StoreReader {
                 }
             };
             let end_pos = block_start_pos + doc_length;
-            let raw_doc = RawDocument {
-                block,
-                start_pos: block_start_pos,
-                end_pos,
-            };
+            let doc_bytes = block.slice(block_start_pos..end_pos);
             block_start_pos = end_pos;
-            Ok(raw_doc)
+            Ok(doc_bytes)
         })
     }

@@ -230,23 +220,6 @@ impl StoreReader {
     }
 }

-/// Get the bytes of a serialized `Document` in a decompressed block.
-pub struct RawDocument {
-    /// the block of data containing multiple documents
-    block: Arc<Vec<u8>>,
-    /// start position of the document in the block
-    start_pos: usize,
-    /// end position of the document in the block
-    end_pos: usize,
-}
-
-impl RawDocument {
-    /// Get the bytes of a serialized `Document` in a decompressed block.
-    pub fn get_bytes(&self) -> &[u8] {
-        &self.block[self.start_pos..self.end_pos]
-    }
-}
-
 fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> {
     let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>());
     let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;
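To make the `RawDocument` removal concrete, here is a minimal, self-contained sketch of an `OwnedBytes`-like type with the two capabilities the diff relies on: `slice` to carve a document out of a shared decompressed block, and `io::Read` so `Document::deserialize(&mut doc_bytes)` can consume it directly. The name (`OwnedBytesSketch`) and the representation (`Arc<Vec<u8>>` plus a range) are illustrative assumptions, not tantivy's actual implementation.

```rust
use std::io::{self, Read};
use std::ops::Range;
use std::sync::Arc;

/// Illustrative stand-in for `OwnedBytes`: a shared buffer plus a window.
/// Cloning is cheap (one refcount bump), which is why a single type can
/// serve both as a cached block and as a per-document slice of it.
#[derive(Clone)]
struct OwnedBytesSketch {
    data: Arc<Vec<u8>>,
    range: Range<usize>,
}

impl OwnedBytesSketch {
    fn new(data: Vec<u8>) -> Self {
        let len = data.len();
        OwnedBytesSketch { data: Arc::new(data), range: 0..len }
    }

    /// Narrow the window to `range` (relative to the current window),
    /// sharing the same backing buffer. This replaces the
    /// `RawDocument { block, start_pos, end_pos }` bookkeeping.
    fn slice(&self, range: Range<usize>) -> Self {
        OwnedBytesSketch {
            data: Arc::clone(&self.data),
            range: self.range.start + range.start..self.range.start + range.end,
        }
    }

    fn as_slice(&self) -> &[u8] {
        &self.data[self.range.clone()]
    }
}

/// Reading advances the window, so a deserializer can consume the
/// document bytes in place, as `Document::deserialize(&mut doc_bytes)`
/// does in the diff above.
impl Read for OwnedBytesSketch {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let n = buf.len().min(self.range.len());
        buf[..n].copy_from_slice(&self.data[self.range.start..self.range.start + n]);
        self.range.start += n;
        Ok(n)
    }
}

fn main() -> io::Result<()> {
    // One decompressed block holding two "documents": [1, 2, 3] and [4, 5].
    let block = OwnedBytesSketch::new(vec![1, 2, 3, 4, 5]);
    let mut doc = block.slice(3..5); // second document, no copy
    assert_eq!(doc.as_slice(), &[4, 5]);
    let mut out = Vec::new();
    doc.read_to_end(&mut out)?; // consume it like a cursor
    assert_eq!(out, vec![4, 5]);
    Ok(())
}
```

With slicing and `Read` available on one reference-counted type, the dedicated `RawDocument` struct and its `get_bytes` accessor become redundant, which is exactly what the last hunk deletes.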