improve docs, rework exports (#2220)

* rework exports

move snippet and advice
make indexer pub, remove indexer reexports

* add deprecation warning

* add architecture overview
PSeitz
2023-10-18 09:22:24 +02:00
committed by GitHub
parent 7e1980b218
commit c2b0469180
10 changed files with 148 additions and 34 deletions

View File

@@ -10,7 +10,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator};
use tantivy::snippet::{Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {

View File

@@ -18,11 +18,11 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L
use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::IndexWriter;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
fn load_metas(
directory: &dyn Directory,

View File

@@ -8,6 +8,8 @@ use std::sync::{Arc, RwLock, Weak};
use common::StableDeref;
use fs4::FileExt;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
@@ -21,8 +23,6 @@ use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr,
};
#[cfg(unix)]
use crate::Advice;
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;

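For context on the moved export: `Advice` is the `memmap2` madvise enum, now reachable through the directory module instead of the crate root. A minimal sketch of its use, assuming `Advice` ends up re-exported at `tantivy::directory::Advice` after this change and that `MmapDirectory::open_with_madvice` is available, as in recent tantivy releases:

```rust
use std::path::Path;

// Assumption: after this commit, `Advice` is re-exported from the
// directory module rather than from the crate root.
use tantivy::directory::{Advice, MmapDirectory};
use tantivy::schema::Schema;
use tantivy::Index;

fn open_with_hint(path: &Path, schema: Schema) -> tantivy::Result<Index> {
    // Hint the OS that index files will be accessed at random offsets.
    let dir = MmapDirectory::open_with_madvice(path, Advice::Random)?;
    Index::open_or_create(dir, schema)
}
```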
View File

@@ -63,10 +63,13 @@ impl MergeOperation {
}
}
/// Returns the opstamp up to which we want to consume the delete queue and reflect its
/// deletes.
pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp
}
/// Returns the list of segments to be merged.
pub fn segment_ids(&self) -> &[SegmentId] {
&self.inner.segment_ids[..]
}

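Since `MergePolicy` and `MergeCandidate` stay re-exported from the indexer module (see the `pub use` list in the next file), here is a hedged sketch of where `segment_ids` ultimately comes from: a merge policy proposes candidates, and each scheduled `MergeOperation` carries those segment ids. `TinySegmentsMergePolicy` and its 10k-doc threshold are made up for illustration.

```rust
use tantivy::indexer::{MergeCandidate, MergePolicy};
use tantivy::SegmentMeta;

/// Hypothetical policy: merge all small segments into a single candidate.
#[derive(Debug)]
struct TinySegmentsMergePolicy;

impl MergePolicy for TinySegmentsMergePolicy {
    fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
        let tiny: Vec<_> = segments
            .iter()
            .filter(|meta| meta.num_docs() < 10_000) // illustrative threshold
            .map(|meta| meta.id())
            .collect();
        if tiny.len() >= 2 {
            // These ids are what `MergeOperation::segment_ids` reports once
            // the merge is scheduled.
            vec![MergeCandidate(tiny)]
        } else {
            Vec::new()
        }
    }
}
```

A policy like this would be installed with `index_writer.set_merge_policy(Box::new(TinySegmentsMergePolicy))`.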
View File

@@ -1,23 +1,29 @@
pub mod delete_queue;
//! Indexing and merging data.
//!
//! Contains code to create and merge segments.
//! `IndexWriter` is the main entry point for that; it is created via
//! [`Index::writer`](crate::Index::writer).
pub mod doc_id_mapping;
pub(crate) mod delete_queue;
pub(crate) mod doc_id_mapping;
mod doc_opstamp_mapping;
mod flat_map_with_buffer;
pub mod index_writer;
mod index_writer_status;
pub(crate) mod index_writer;
pub(crate) mod index_writer_status;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
pub(crate) mod merge_policy;
pub(crate) mod merger;
mod merger_sorted_index_test;
pub mod operation;
pub mod prepared_commit;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
pub(crate) mod segment_serializer;
pub(crate) mod segment_updater;
pub(crate) mod segment_writer;
mod stamper;
use crossbeam_channel as channel;
@@ -27,10 +33,10 @@ pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::operation::UserOperation;
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub(crate) use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::{merge_filtered_segments, merge_indices};
pub use self::segment_writer::SegmentWriter;
use crate::indexer::operation::AddOperation;

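To make the migration concrete, a small sketch of the reworked paths, assuming writer-adjacent types such as `UserOperation` are now imported from the public `tantivy::indexer` module while `IndexWriter` stays re-exported at the crate root:

```rust
use tantivy::indexer::UserOperation;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // `IndexWriter` is still available at the crate root.
    let mut writer: IndexWriter = index.writer(50_000_000)?;

    // Batches of operations go through the now-public indexer module.
    writer.run(vec![
        UserOperation::Add(doc!(title => "of mice and men")),
        UserOperation::Delete(Term::from_field_text(title, "mice")),
    ])?;
    writer.commit()?;
    Ok(())
}
```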
View File

@@ -155,6 +155,8 @@ impl SegmentWriter {
Ok(doc_opstamps)
}
/// Returns an estimation of the current memory usage of the segment writer.
/// If the mem usage exceeds the `memory_budget`, the segment will be serialized.
pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage()

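For context, the `memory_budget` this check is compared against derives from the budget passed at writer creation; a sketch with illustrative numbers:

```rust
use tantivy::{Index, IndexWriter};

fn writer_with_budget(index: &Index) -> tantivy::Result<IndexWriter> {
    // Four indexing threads share a 240 MB budget, i.e. ~60 MB per segment
    // writer; a segment is serialized once `mem_usage` crosses that share.
    index.writer_with_num_threads(4, 240_000_000)
}
```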
View File

@@ -103,7 +103,48 @@
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/quickwit-oss/tantivy/blob/main/examples/basic_search.rs))
//!
//! # Tantivy Architecture Overview
//!
//! Tantivy is inspired by Lucene; its architecture is very similar.
//!
//! ## Core Concepts
//!
//! - **[Index]**: A collection of segments. The top-level entry point for tantivy users to search
//! and index data.
//!
//! - **[Segment]**: At the heart of Tantivy's indexing structure is the [Segment]. It contains
//! documents and indices and is the atomic unit of indexing and search.
//!
//! - **[Schema](schema)**: A schema is a set of fields in an index. Each field has a specific data
//! type and set of attributes.
//!
//! - **[IndexWriter]**: Responsible for creating and merging segments. It executes the indexing
//! pipeline including tokenization, creating indices, and storing the index in the
//! [Directory](directory).
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. See the list of [supported
//! queries](query::Query#implementors). Custom queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
//! - **[Tokenizer](tokenizer)**: Breaks down text into individual tokens. Users can implement or
//! use provided tokenizers.
//!
//! ## Architecture Flow
//!
//! 1. **Document Addition**: Users create documents according to the defined schema. The
//! documents' fields are tokenized, processed, and added to the current segment. See
//! [Document](schema::document) for the structure and usage.
//!
//! 2. **Segment Creation**: Once the memory limit threshold is reached or `commit()` is called,
//! the segment is written to the Directory. Documents are searchable after `commit()`.
//!
//! 3. **Merging**: To optimize space and search speed, segments might be merged. This operation is
//! performed in the background. Customize the merge behaviour via
//! [`IndexWriter::set_merge_policy`].
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
@@ -137,7 +178,7 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>;
mod core;
mod indexer;
pub mod indexer;
#[allow(unused_doc_comments)]
pub mod error;
@@ -161,8 +202,7 @@ pub mod termdict;
mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
pub mod snippet;
mod docset;
use std::fmt;
@@ -173,6 +213,11 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
pub use self::docset::{DocSet, TERMINATED};
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use the export from the `snippet` module instead"
)]
pub use self::snippet::{Snippet, SnippetGenerator};
#[doc(hidden)]
pub use crate::core::json_utils;
pub use crate::core::{
@@ -181,8 +226,12 @@ pub use crate::core::{
SegmentReader, SingleSegmentIndexWriter,
};
pub use crate::directory::Directory;
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit};
pub use crate::indexer::IndexWriter;
#[deprecated(
since = "0.22.0",
note = "Will be removed in tantivy 0.23. Use the export from the `indexer` module instead"
)]
pub use crate::indexer::{merge_filtered_segments, merge_indices, PreparedCommit};
pub use crate::postings::Postings;
#[allow(deprecated)]
pub use crate::schema::DatePrecision;
@@ -191,9 +240,6 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocumen
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 5;
#[cfg(all(feature = "mmap", unix))]
pub use memmap2::Advice;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Version {

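The flow described in the new overview, condensed into a runnable sketch (a minimal variant of the basic_search example; the field name and query are illustrative):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
    // Schema + Index: the top-level entry point.
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 1. Document addition: fields are tokenized into the current segment.
    let mut writer: IndexWriter = index.writer(50_000_000)?;
    writer.add_document(doc!(body => "tantivy is inspired by lucene"))?;

    // 2. Segment creation: documents become searchable after commit.
    writer.commit()?;

    // 3./4. Searching (merging happens in the background as segments pile up).
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![body]).parse_query("lucene")?;
    for (_score, address) in searcher.search(&query, &TopDocs::with_limit(10))? {
        let doc: TantivyDocument = searcher.doc(address)?;
        println!("{}", doc.to_json(&index.schema()));
    }
    Ok(())
}
```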
View File

@@ -1,3 +1,59 @@
//! [`SnippetGenerator`] generates a text snippet for a given document, with the matching
//! terms highlighted inside it.
//! Imagine you are doing a text search in a document
//! and want to show a preview of where in the document the search terms occur,
//! along with some surrounding text to give context, and the search terms highlighted.
//!
//! [`SnippetGenerator`] serves this purpose.
//! It scans a document and constructs a snippet, which consists of sections where the search terms
//! have been found, stitched together with "..." in between sections if necessary.
//!
//! ## Example
//!
//! ```rust
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::{Schema, TEXT};
//! # use tantivy::{doc, Index};
//! use tantivy::snippet::SnippetGenerator;
//!
//! # fn main() -> tantivy::Result<()> {
//! # let mut schema_builder = Schema::builder();
//! # let text_field = schema_builder.add_text_field("text", TEXT);
//! # let schema = schema_builder.build();
//! # let index = Index::create_in_ram(schema);
//! # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
//! # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles,
//! # Je ne me sentis plus guidé par les haleurs :
//! # Des Peaux-Rouges criards les avaient pris pour cibles,
//! # Les ayant cloués nus aux poteaux de couleurs.
//! #
//! # J'étais insoucieux de tous les équipages,
//! # Porteur de blés flamands ou de cotons anglais.
//! # Quand avec mes haleurs ont fini ces tapages,
//! # Les Fleuves m'ont laissé descendre où je voulais.
//! # "#);
//! # index_writer.add_document(doc.clone())?;
//! # index_writer.commit()?;
//! # let query_parser = QueryParser::for_index(&index, vec![text_field]);
//! // ...
//! let query = query_parser.parse_query("haleurs flamands").unwrap();
//! # let reader = index.reader()?;
//! # let searcher = reader.searcher();
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
//! snippet_generator.set_max_num_chars(100);
//! let snippet = snippet_generator.snippet_from_doc(&doc);
//! let snippet_html: String = snippet.to_html();
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des");
//! # Ok(())
//! # }
//! ```
//!
//! You can also specify the maximum number of characters for the generated snippets with the
//! `set_max_num_chars` method. By default, this limit is set to 150.
//!
//! A `SnippetGenerator` is created from the `Searcher`, the query, and the field on which
//! it should generate the snippets.
use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet};
use std::ops::Range;
@@ -16,7 +72,7 @@ const DEFAULT_SNIPPET_PREFIX: &str = "<b>";
const DEFAULT_SNIPPET_POSTFIX: &str = "</b>";
#[derive(Debug)]
pub struct FragmentCandidate {
pub(crate) struct FragmentCandidate {
score: Score,
start_offset: usize,
stop_offset: usize,
@@ -256,7 +312,7 @@ fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
/// # use tantivy::query::QueryParser;
/// # use tantivy::schema::{Schema, TEXT};
/// # use tantivy::{doc, Index};
/// use tantivy::SnippetGenerator;
/// use tantivy::snippet::SnippetGenerator;
///
/// # fn main() -> tantivy::Result<()> {
/// # let mut schema_builder = Schema::builder();
@@ -346,7 +402,7 @@ impl SnippetGenerator {
})
}
/// Sets a maximum number of chars.
/// Sets a maximum number of chars. Default is 150.
pub fn set_max_num_chars(&mut self, max_num_chars: usize) {
self.max_num_chars = max_num_chars;
}
@@ -398,8 +454,9 @@ mod tests {
use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination};
use crate::query::QueryParser;
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
use crate::snippet::SnippetGenerator;
use crate::tokenizer::{NgramTokenizer, SimpleTokenizer};
use crate::{Index, SnippetGenerator};
use crate::Index;
const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and

View File

@@ -1,5 +1,5 @@
//! The term dictionary main role is to associate the sorted [`Term`s](crate::Term) to
//! a [`TermInfo`](crate::postings::TermInfo) struct that contains some meta-information
//! a [`TermInfo`] struct that contains some meta-information
//! about the term.
//!
//! Internally, the term dictionary relies on the `fst` crate to store
@@ -16,8 +16,7 @@
//! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
//! as `u64`.
//!
//! A second datastructure makes it possible to access a
//! [`TermInfo`](crate::postings::TermInfo).
//! A second data structure makes it possible to access a [`TermInfo`].
#[cfg(not(feature = "quickwit"))]
mod fst_termdict;

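The order-preserving `f64` to `u64` mapping mentioned above is the classic sign-bit trick. Tantivy's `common` crate ships its own `f64_to_u64`; this standalone sketch only illustrates the idea:

```rust
fn f64_to_u64(value: f64) -> u64 {
    let bits = value.to_bits();
    if value.is_sign_positive() {
        // Positive floats: set the sign bit so they sort above all negatives.
        bits ^ (1 << 63)
    } else {
        // Negative floats: flip every bit to reverse their descending order.
        !bits
    }
}

fn main() {
    let mut vals = [-10.0f64, -2.5, -0.0, 1.0, 3.5];
    vals.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mapped: Vec<u64> = vals.iter().copied().map(f64_to_u64).collect();
    // The u64 images are sorted exactly when the f64 inputs are sorted.
    assert!(mapped.windows(2).all(|w| w[0] <= w[1]));
}
```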
View File

@@ -2,7 +2,7 @@
//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
//! for each new tantivy version.
//!
//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
//! To add support for a tokenizer, implement the [`Tokenizer`] trait.
//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
use std::borrow::{Borrow, BorrowMut};
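As a companion to that doc line, a sketch of implementing the trait under the current GAT-based API; `CommaTokenizer` is a made-up example, not part of the crate:

```rust
use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

/// Hypothetical tokenizer splitting on commas.
#[derive(Clone)]
pub struct CommaTokenizer;

pub struct CommaTokenStream<'a> {
    text: &'a str,
    offset: usize, // byte offset of the next unread byte
    token: Token,
}

impl Tokenizer for CommaTokenizer {
    type TokenStream<'a> = CommaTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        // `Token::default()` starts `position` at `usize::MAX`, so the first
        // `wrapping_add(1)` below yields position 0.
        CommaTokenStream { text, offset: 0, token: Token::default() }
    }
}

impl TokenStream for CommaTokenStream<'_> {
    fn advance(&mut self) -> bool {
        // Skip separators (',' is ASCII, so byte indexing stays on UTF-8
        // character boundaries).
        while self.text.as_bytes().get(self.offset) == Some(&b',') {
            self.offset += 1;
        }
        if self.offset >= self.text.len() {
            return false;
        }
        let start = self.offset;
        let end = self.text[start..]
            .find(',')
            .map(|i| start + i)
            .unwrap_or(self.text.len());
        self.offset = end;
        self.token.position = self.token.position.wrapping_add(1);
        self.token.offset_from = start;
        self.token.offset_to = end;
        self.token.text.clear();
        self.token.text.push_str(&self.text[start..end]);
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
```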