Extend DocumentDeserialize with a stateful variant DocumentDeserializeSeed

This is modelled on Serde's `DeserializeSeed`, including how the relevant API entry
points gain a `_seed` variant. It can be used, for example, to obtain runtime
field ID values when deserializing a struct field by field without relying on
the order of the fields as written to/read from the document store.
This commit is contained in:
Adam Reichold
2024-04-15 18:56:42 +02:00
parent b806122c81
commit e9c16a4fb1
5 changed files with 92 additions and 12 deletions

View File

@@ -1,4 +1,5 @@
use std::collections::BTreeMap;
use std::marker::PhantomData;
use std::sync::Arc;
use std::{fmt, io};
@@ -6,7 +7,7 @@ use crate::collector::Collector;
use crate::core::Executor;
use crate::index::{SegmentId, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize;
use crate::schema::document::{DocumentDeserialize, DocumentDeserializeSeed};
use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
@@ -86,8 +87,17 @@ impl Searcher {
/// The searcher uses the segment ordinal to route the
/// request to the right `Segment`.
pub fn doc<D: DocumentDeserialize>(&self, doc_address: DocAddress) -> crate::Result<D> {
self.doc_seed(doc_address, PhantomData)
}
/// A stateful variant of [`doc`][Self::doc].
pub fn doc_seed<T: DocumentDeserializeSeed>(
&self,
doc_address: DocAddress,
seed: T,
) -> crate::Result<T::Value> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
store_reader.get_seed(doc_address.doc_id, seed)
}
/// The cache stats for the underlying store reader.
@@ -109,9 +119,21 @@ impl Searcher {
&self,
doc_address: DocAddress,
) -> crate::Result<D> {
self.doc_async_seed(doc_address, PhantomData).await
}
#[cfg(feature = "quickwit")]
/// A stateful variant of [`doc_async`][Self::doc_async].
pub async fn doc_async_seed<T: DocumentDeserializeSeed>(
&self,
doc_address: DocAddress,
seed: T,
) -> crate::Result<T::Value> {
let executor = self.inner.index.search_executor();
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id, executor).await
store_reader
.get_async_seed(doc_address.doc_id, executor, seed)
.await
}
/// Access the schema associated with the index of this searcher.

View File

@@ -69,6 +69,28 @@ pub trait DocumentDeserialize: Sized {
where D: DocumentDeserializer<'de>;
}
/// A stateful extension of [`DocumentDeserialize`].
///
/// Where [`DocumentDeserialize`] builds a value from the deserializer alone,
/// a seed is a value that is consumed (`self` is taken by value) together with
/// the deserializer to produce a `Self::Value`.
pub trait DocumentDeserializeSeed: Sized {
/// The type produced by using this seed.
type Value;
/// Attempts to deserialize `Self::Value` from the seed (`self`) and the given
/// `deserializer`.
///
/// # Errors
///
/// Returns a [`DeserializeError`] if the value cannot be produced.
fn deserialize<'de, D>(self, deserializer: D) -> Result<Self::Value, DeserializeError>
where D: DocumentDeserializer<'de>;
}
/// Stateless seed: `PhantomData<T>` carries no data and forwards to `T`'s
/// [`DocumentDeserialize`] implementation, so any `T: DocumentDeserialize`
/// can be used wherever a [`DocumentDeserializeSeed`] is expected.
impl<T> DocumentDeserializeSeed for PhantomData<T>
where T: DocumentDeserialize
{
/// The type produced by using this seed.
type Value = T;
/// Ignores the seed itself and delegates to `<T as DocumentDeserialize>::deserialize`.
fn deserialize<'de, D>(self, deserializer: D) -> Result<Self::Value, DeserializeError>
where D: DocumentDeserializer<'de> {
<T as DocumentDeserialize>::deserialize(deserializer)
}
}
/// A deserializer that can walk through each entry in the document.
pub trait DocumentDeserializer<'de> {
/// An indicator as to how many values are in the document.

View File

@@ -603,7 +603,7 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
container: self.container,
value,
};
return Some((key, value));
Some((key, value))
}
}
@@ -637,7 +637,7 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
container: self.container,
value,
};
return Some(value);
Some(value)
}
}

View File

@@ -169,8 +169,9 @@ use std::mem;
pub(crate) use self::de::BinaryDocumentDeserializer;
pub use self::de::{
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializeSeed,
DocumentDeserializer, ObjectAccess, ValueDeserialize, ValueDeserializer, ValueType,
ValueVisitor,
};
pub use self::default_document::{
CompactDocArrayIter, CompactDocObjectIter, CompactDocValue, DocParsingError, TantivyDocument,

View File

@@ -1,5 +1,6 @@
use std::io;
use std::iter::Sum;
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use std::ops::{AddAssign, Range};
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -14,7 +15,9 @@ use super::Decompressor;
use crate::directory::FileSlice;
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize};
use crate::schema::document::{
BinaryDocumentDeserializer, DocumentDeserialize, DocumentDeserializeSeed,
};
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
@@ -201,11 +204,21 @@ impl StoreReader {
/// It should not be called to score documents
/// for instance.
pub fn get<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
self.get_seed(doc_id, PhantomData)
}
/// A stateful version of [`get`][Self::get].
pub fn get_seed<T: DocumentDeserializeSeed>(
&self,
doc_id: DocId,
seed: T,
) -> crate::Result<T::Value> {
let mut doc_bytes = self.get_document_bytes(doc_id)?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
seed.deserialize(deserializer)
.map_err(crate::TantivyError::from)
}
/// Returns raw bytes of a given document.
@@ -237,16 +250,27 @@ impl StoreReader {
/// Iterator over all Documents in their order as they are stored in the doc store.
/// Use this, if you want to extract all Documents from the doc store.
/// The `alive_bitset` has to be forwarded from the `SegmentReader` or the results may be wrong.
pub fn iter<'a: 'b, 'b, D: DocumentDeserialize>(
pub fn iter<'a: 'b, 'b, D: DocumentDeserialize + 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.iter_seed(alive_bitset, &PhantomData)
}
/// A stateful variant of [`iter`][Self::iter].
pub fn iter_seed<'a: 'b, 'b, T: DocumentDeserializeSeed + Clone + 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
seed: &'b T,
) -> impl Iterator<Item = crate::Result<T::Value>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
seed.clone()
.deserialize(deserializer)
.map_err(crate::TantivyError::from)
})
}
@@ -389,11 +413,22 @@ impl StoreReader {
doc_id: DocId,
executor: &Executor,
) -> crate::Result<D> {
self.get_async_seed(doc_id, executor, PhantomData).await
}
/// A stateful variant of [`get_async`][Self::get_async].
pub async fn get_async_seed<T: DocumentDeserializeSeed>(
&self,
doc_id: DocId,
executor: &Executor,
seed: T,
) -> crate::Result<T::Value> {
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;
let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
seed.deserialize(deserializer)
.map_err(crate::TantivyError::from)
}
}