Provide StoreReader::enumerate to simplify creation of secondary indexes

For secondary indexes, it is often necessary to read all documents, compute
some function on them and associated the result with a document ID.

Currently, this requires something like

```rust
let reader = segment.get_store_reader(1)?;

for doc_id in segment.doc_ids_alive() {
    let doc = reader.get(doc_id)?;

    // Use doc and doc_id here ...
}
```

which can be simplified to

```rust
let reader = segment.get_store_reader(1)?;

for res in reader.enumerate() {
    let (doc_id, doc) = res?;

    // Use doc and doc_id here ...
}
```

using the method proposed here.

(I added a new method instead of modifying `StoreReader::iter` to make the
change backwards compatible, i.e. possible to include in a point release.)
This commit is contained in:
Adam Reichold
2024-04-15 18:25:24 +02:00
parent b806122c81
commit 083aec5125
2 changed files with 19 additions and 8 deletions

View File

@@ -696,7 +696,7 @@ impl IndexMerger {
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
let (_, doc_bytes) = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(format!(
@@ -728,7 +728,7 @@ impl IndexMerger {
|| store_reader.decompressor() != store_writer.compressor().into()
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
let (_, doc_bytes) = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {

View File

@@ -241,12 +241,23 @@ impl StoreReader {
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.enumerate(alive_bitset)
.map(|res| res.map(|(_, doc)| doc))
}
/// A variant of [`iter`][Self::iter] which also yields document ID.
pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
let (doc_id, mut doc_bytes) = doc_bytes_res?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;
Ok((doc_id, doc))
})
}
@@ -256,7 +267,7 @@ impl StoreReader {
pub(crate) fn iter_raw<'a: 'b, 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
let last_doc_id = self
.block_checkpoints()
.last()
@@ -284,14 +295,14 @@ impl StoreReader {
let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
let res = if alive {
Some((curr_block.clone(), doc_pos))
Some((doc_id, curr_block.clone(), doc_pos))
} else {
None
};
doc_pos += 1;
res
})
.map(move |(block, doc_pos)| {
.map(move |(doc_id, block, doc_pos)| {
let block = block
.ok_or_else(|| {
DataCorruption::comment_only(
@@ -304,7 +315,7 @@ impl StoreReader {
})?;
let range = block_read_index(&block, doc_pos)?;
Ok(block.slice(range))
Ok((doc_id, block.slice(range)))
})
}