Files
tantivy/src/core/searcher.rs
2022-01-28 15:55:55 +09:00

221 lines
7.6 KiB
Rust

use std::collections::BTreeMap;
use std::{fmt, io};
use crate::collector::Collector;
use crate::core::{Executor, SegmentReader};
use crate::query::Query;
use crate::schema::{Document, Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
/// Identifies the searcher generation accessed by a [Searcher].
///
/// While this might seem redundant, a [SearcherGeneration] contains
/// both a `generation_id` AND a list of `(SegmentId, DeleteOpstamp)`.
///
/// This is on purpose. This object is used by the `Warmer` API.
/// Having both information makes it possible to identify which
/// artifact should be refreshed or garbage collected.
///
/// Depending on the use case, `Warmer`'s implementers can decide to
/// produce artifacts per:
/// - `generation_id` (e.g. some searcher level aggregates)
/// - `(segment_id, delete_opstamp)` (e.g. segment level aggregates)
/// - `segment_id` (e.g. for immutable document level information)
/// - `(generation_id, segment_id)` (e.g. for consistent dynamic column)
/// - ...
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SearcherGeneration {
segments: BTreeMap<SegmentId, Option<Opstamp>>,
generation_id: u64,
}
impl SearcherGeneration {
pub(crate) fn from_segment_readers(
segment_readers: &[SegmentReader],
generation_id: u64,
) -> Self {
let mut segment_id_to_del_opstamp = BTreeMap::new();
for segment_reader in segment_readers {
segment_id_to_del_opstamp
.insert(segment_reader.segment_id(), segment_reader.delete_opstamp());
}
Self {
segments: segment_id_to_del_opstamp,
generation_id,
}
}
/// Returns the searcher generation id.
pub fn generation_id(&self) -> u64 {
self.generation_id
}
/// Return a `(SegmentId -> DeleteOpstamp)` mapping.
pub fn segments(&self) -> &BTreeMap<SegmentId, Option<Opstamp>> {
&self.segments
}
}
/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
pub struct Searcher {
schema: Schema,
index: Index,
segment_readers: Vec<SegmentReader>,
store_readers: Vec<StoreReader>,
generation: TrackedObject<SearcherGeneration>,
}
impl Searcher {
/// Creates a new `Searcher`
pub(crate) fn new(
schema: Schema,
index: Index,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers
.iter()
.map(SegmentReader::get_store_reader)
.collect::<io::Result<Vec<_>>>()?;
Ok(Searcher {
schema,
index,
segment_readers,
store_readers,
generation,
})
}
/// Returns the `Index` associated to the `Searcher`
pub fn index(&self) -> &Index {
&self.index
}
/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
pub fn generation(&self) -> &SearcherGeneration {
self.generation.as_ref()
}
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the
/// the request to the right `Segment`.
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
let store_reader = &self.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
}
/// Access the schema associated to the index of this searcher.
pub fn schema(&self) -> &Schema {
&self.schema
}
/// Returns the overall number of documents in the index.
pub fn num_docs(&self) -> u64 {
self.segment_readers
.iter()
.map(|segment_reader| u64::from(segment_reader.num_docs()))
.sum::<u64>()
}
/// Return the overall number of documents containing
/// the given term.
pub fn doc_freq(&self, term: &Term) -> crate::Result<u64> {
let mut total_doc_freq = 0;
for segment_reader in &self.segment_readers {
let inverted_index = segment_reader.inverted_index(term.field())?;
let doc_freq = inverted_index.doc_freq(term)?;
total_doc_freq += u64::from(doc_freq);
}
Ok(total_doc_freq)
}
/// Return the list of segment readers
pub fn segment_readers(&self) -> &[SegmentReader] {
&self.segment_readers
}
/// Returns the segment_reader associated with the given segment_ord
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize]
}
/// Runs a query on the segment readers wrapped by the searcher.
///
/// Search works as follows :
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment :
/// - setup the collector and informs it that the segment being processed has changed.
/// - creates a SegmentCollector for collecting documents associated to the segment
/// - creates a `Scorer` object associated for this segment
/// - iterate through the matched documents and push them to the segment collector.
///
/// Finally, the Collector merges each of the child collectors into itself for result usability
/// by the caller.
pub fn search<C: Collector>(
&self,
query: &dyn Query,
collector: &C,
) -> crate::Result<C::Fruit> {
let executor = self.index.search_executor();
self.search_with_executor(query, collector, executor)
}
/// Same as [`search(...)`](#method.search) but multithreaded.
///
/// The current implementation is rather naive :
/// multithreading is by splitting search into as many task
/// as there are segments.
///
/// It is powerless at making search faster if your index consists in
/// one large segment.
///
/// Also, keep in my multithreading a single query on several
/// threads will not improve your throughput. It can actually
/// hurt it. It will however, decrease the average response time.
pub fn search_with_executor<C: Collector>(
&self,
query: &dyn Query,
collector: &C,
executor: &Executor,
) -> crate::Result<C::Fruit> {
let scoring_enabled = collector.requires_scoring();
let weight = query.weight(self, scoring_enabled)?;
let segment_readers = self.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {
collector.collect_segment(weight.as_ref(), segment_ord as u32, segment_reader)
},
segment_readers.iter().enumerate(),
)?;
collector.merge_fruits(fruits)
}
/// Summarize total space usage of this searcher.
pub fn space_usage(&self) -> io::Result<SearcherSpaceUsage> {
let mut space_usage = SearcherSpaceUsage::new();
for segment_reader in &self.segment_readers {
space_usage.add_segment(segment_reader.space_usage()?);
}
Ok(space_usage)
}
}
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let segment_ids = self
.segment_readers
.iter()
.map(SegmentReader::segment_id)
.collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids)
}
}