mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Provide StoreReader::enumerate to simplify creation of secondary indexes
For secondary indexes, it is often necessary to read all documents, compute
some function on them and associate the result with a document ID.
Currently, this requires something like
```rust
let reader = segment.get_store_reader(1)?;
for doc_id in segment.doc_ids_alive() {
    let doc = reader.get(doc_id)?;
    // Use doc and doc_id here ...
}
```
which can be simplified to
```rust
let reader = segment.get_store_reader(1)?;
for res in reader.enumerate() {
    let (doc_id, doc) = res?;
    // Use doc and doc_id here ...
}
```
using the method proposed here.
(I added a new method instead of modifying `StoreReader::iter` to make the
change backwards compatible, i.e. possible to include in a point release.)
This commit is contained in:
@@ -696,7 +696,7 @@ impl IndexMerger {
|
||||
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
|
||||
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
|
||||
if let Some(doc_bytes_res) = doc_bytes_it.next() {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
let (_, doc_bytes) = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
} else {
|
||||
return Err(DataCorruption::comment_only(format!(
|
||||
@@ -728,7 +728,7 @@ impl IndexMerger {
|
||||
|| store_reader.decompressor() != store_writer.compressor().into()
|
||||
{
|
||||
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
let (_, doc_bytes) = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -241,12 +241,23 @@ impl StoreReader {
|
||||
&'b self,
|
||||
alive_bitset: Option<&'a AliveBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<D>> + 'b {
|
||||
self.enumerate(alive_bitset)
|
||||
.map(|res| res.map(|(_, doc)| doc))
|
||||
}
|
||||
|
||||
/// A variant of [`iter`][Self::iter] which also yields the document ID.
|
||||
pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
|
||||
&'b self,
|
||||
alive_bitset: Option<&'a AliveBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
|
||||
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
|
||||
let mut doc_bytes = doc_bytes_res?;
|
||||
let (doc_id, mut doc_bytes) = doc_bytes_res?;
|
||||
|
||||
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
|
||||
.map_err(crate::TantivyError::from)?;
|
||||
D::deserialize(deserializer).map_err(crate::TantivyError::from)
|
||||
let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;
|
||||
|
||||
Ok((doc_id, doc))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -256,7 +267,7 @@ impl StoreReader {
|
||||
pub(crate) fn iter_raw<'a: 'b, 'b>(
|
||||
&'b self,
|
||||
alive_bitset: Option<&'a AliveBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
|
||||
) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
|
||||
let last_doc_id = self
|
||||
.block_checkpoints()
|
||||
.last()
|
||||
@@ -284,14 +295,14 @@ impl StoreReader {
|
||||
|
||||
let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
|
||||
let res = if alive {
|
||||
Some((curr_block.clone(), doc_pos))
|
||||
Some((doc_id, curr_block.clone(), doc_pos))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
doc_pos += 1;
|
||||
res
|
||||
})
|
||||
.map(move |(block, doc_pos)| {
|
||||
.map(move |(doc_id, block, doc_pos)| {
|
||||
let block = block
|
||||
.ok_or_else(|| {
|
||||
DataCorruption::comment_only(
|
||||
@@ -304,7 +315,7 @@ impl StoreReader {
|
||||
})?;
|
||||
|
||||
let range = block_read_index(&block, doc_pos)?;
|
||||
Ok(block.slice(range))
|
||||
Ok((doc_id, block.slice(range)))
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user