From 083aec51258e8b90a238d5352d6a0632573b763f Mon Sep 17 00:00:00 2001
From: Adam Reichold
Date: Mon, 15 Apr 2024 18:25:24 +0200
Subject: [PATCH] Provide StoreReader::enumerate to simplify creation of
 secondary indexes

For secondary indexes, it is often necessary to read all documents,
compute some function on them and associate the result with a document
ID. Currently, this requires something like

```rust
let reader = segment.get_store_reader(1)?;

for doc_id in segment.doc_ids_alive() {
    let doc = reader.get(doc_id)?;

    // Use doc and doc_id here
    ...
}
```

which can be simplified to

```rust
let reader = segment.get_store_reader(1)?;

for res in reader.enumerate(segment.alive_bitset()) {
    let (doc_id, doc) = res?;

    // Use doc and doc_id here
    ...
}
```

using the method proposed here. (I added a new method instead of
modifying `StoreReader::iter` to make the change backwards compatible,
i.e. possible to include in a point release.)
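To make the intended usage concrete, here is a minimal sketch of building
a secondary index over all segments with the new method. It is
illustrative only and not part of this patch: `build_secondary_index` and
the `compute` callback are made up for this message, it assumes the
current `TantivyDocument` type, and the store cache size of one block is
an arbitrary choice for a sequential scan.

```rust
use tantivy::{DocAddress, Index, TantivyDocument};

/// Illustrative sketch (not part of this patch): pair the result of a
/// caller-supplied function with the address of every live document.
fn build_secondary_index<T>(
    index: &Index,
    mut compute: impl FnMut(&TantivyDocument) -> T,
) -> tantivy::Result<Vec<(DocAddress, T)>> {
    let searcher = index.reader()?.searcher();
    let mut entries = Vec::new();

    for (segment_ord, segment) in searcher.segment_readers().iter().enumerate() {
        // A cache of a single block suffices for a sequential scan.
        let store_reader = segment.get_store_reader(1)?;

        // Forwarding the alive bitset skips deleted documents.
        for res in store_reader.enumerate::<TantivyDocument>(segment.alive_bitset()) {
            let (doc_id, doc) = res?;
            entries.push((DocAddress::new(segment_ord as u32, doc_id), compute(&doc)));
        }
    }

    Ok(entries)
}
```

Note that `enumerate` yields segment-local document IDs, which is why the
sketch pairs them with the segment ordinal to form a `DocAddress`.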
---
 src/indexer/merger.rs |  4 ++--
 src/store/reader.rs   | 23 +++++++++++++++++------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 88494e8df..dd747977b 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -696,7 +696,7 @@ impl IndexMerger {
         for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
             let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
             if let Some(doc_bytes_res) = doc_bytes_it.next() {
-                let doc_bytes = doc_bytes_res?;
+                let (_, doc_bytes) = doc_bytes_res?;
                 store_writer.store_bytes(&doc_bytes)?;
             } else {
                 return Err(DataCorruption::comment_only(format!(
@@ -728,7 +728,7 @@ impl IndexMerger {
             || store_reader.decompressor() != store_writer.compressor().into()
         {
             for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
-                let doc_bytes = doc_bytes_res?;
+                let (_, doc_bytes) = doc_bytes_res?;
                 store_writer.store_bytes(&doc_bytes)?;
             }
         } else {
diff --git a/src/store/reader.rs b/src/store/reader.rs
index 1e4432e5f..faf9ca93c 100644
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -241,12 +241,23 @@ impl StoreReader {
         &'b self,
         alive_bitset: Option<&'a AliveBitSet>,
     ) -> impl Iterator<Item = crate::Result<D>> + 'b {
+        self.enumerate(alive_bitset)
+            .map(|res| res.map(|(_, doc)| doc))
+    }
+
+    /// A variant of [`iter`][Self::iter] which also yields the document ID.
+    pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
+        &'b self,
+        alive_bitset: Option<&'a AliveBitSet>,
+    ) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
         self.iter_raw(alive_bitset).map(|doc_bytes_res| {
-            let mut doc_bytes = doc_bytes_res?;
+            let (doc_id, mut doc_bytes) = doc_bytes_res?;
             let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
                 .map_err(crate::TantivyError::from)?;
-            D::deserialize(deserializer).map_err(crate::TantivyError::from)
+            let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;
+
+            Ok((doc_id, doc))
         })
     }

@@ -256,7 +267,7 @@ impl StoreReader {
     pub(crate) fn iter_raw<'a: 'b, 'b>(
         &'b self,
         alive_bitset: Option<&'a AliveBitSet>,
-    ) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
+    ) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
         let last_doc_id = self
             .block_checkpoints()
             .last()
@@ -284,14 +295,14 @@

                 let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
                 let res = if alive {
-                    Some((curr_block.clone(), doc_pos))
+                    Some((doc_id, curr_block.clone(), doc_pos))
                 } else {
                     None
                 };
                 doc_pos += 1;
                 res
             })
-            .map(move |(block, doc_pos)| {
+            .map(move |(doc_id, block, doc_pos)| {
                 let block = block
                     .ok_or_else(|| {
                         DataCorruption::comment_only(
@@ -304,7 +315,7 @@
                     })?;

                 let range = block_read_index(&block, doc_pos)?;
-                Ok(block.slice(range))
+                Ok((doc_id, block.slice(range)))
             })
     }