re-enable examples (#1860)
129 examples/aggregation.rs Normal file
@@ -0,0 +1,129 @@
// # Aggregation example
//
// This example shows how you can use built-in aggregations.
// We will use range buckets and compute the average in each bucket.
//

use serde_json::Value;
use tantivy::aggregation::agg_req::{
    Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
    RangeAggregation,
};
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::TermQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_fieldtype = schema::TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
        )
        .set_stored();
    let text_field = schema_builder.add_text_field("text", text_fieldtype);
    let score_fieldtype = schema::NumericOptions::default().set_fast();
    let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
    let price_field = schema_builder.add_f64_field("price", score_fieldtype);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's index a bunch of documents for this example.
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(50_000_000)?;
    // writing the segment
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 1f64,
        price_field => 0f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 3f64,
        price_field => 1f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 5f64,
        price_field => 1f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "nohit",
        highscore_field => 6f64,
        price_field => 2f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 7f64,
        price_field => 2f64,
    ))?;
    index_writer.commit()?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 11f64,
        price_field => 10f64,
    ))?;
    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 14f64,
        price_field => 15f64,
    ))?;

    index_writer.add_document(doc!(
        text_field => "cool",
        highscore_field => 15f64,
        price_field => 20f64,
    ))?;

    index_writer.commit()?;

    let reader = index.reader()?;
    let text_field = reader.searcher().schema().get_field("text").unwrap();

    let term_query = TermQuery::new(
        Term::from_field_text(text_field, "cool"),
        IndexRecordOption::Basic,
    );

    let sub_agg_req_1: Aggregations = vec![(
        "average_price".to_string(),
        Aggregation::Metric(MetricAggregation::Average(
            AverageAggregation::from_field_name("price".to_string()),
        )),
    )]
    .into_iter()
    .collect();

    let agg_req_1: Aggregations = vec![(
        "score_ranges".to_string(),
        Aggregation::Bucket(BucketAggregation {
            bucket_agg: BucketAggregationType::Range(RangeAggregation {
                field: "highscore".to_string(),
                ranges: vec![
                    (-1f64..9f64).into(),
                    (9f64..14f64).into(),
                    (14f64..20f64).into(),
                ],
                ..Default::default()
            }),
            sub_aggregation: sub_agg_req_1,
        }),
    )]
    .into_iter()
    .collect();

    let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());

    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector)?;

    let res: Value = serde_json::to_value(agg_res)?;
    println!("{}", serde_json::to_string_pretty(&res)?);

    Ok(())
}
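Note: the aggregation request types above derive serde's `Serialize`/`Deserialize`, so the same request can also be built from Elasticsearch-style JSON rather than assembled struct by struct. A minimal sketch; the exact JSON shape is an assumption based on tantivy's serde representation, so check the `aggregation` module docs for your version:

    use serde_json::json;

    // Same request as `agg_req_1`, parsed from JSON.
    let agg_req: Aggregations = serde_json::from_value(json!({
        "score_ranges": {
            "range": {
                "field": "highscore",
                "ranges": [
                    { "from": -1.0, "to": 9.0 },
                    { "from": 9.0, "to": 14.0 },
                    { "from": 14.0, "to": 20.0 }
                ]
            },
            "aggs": {
                "average_price": { "avg": { "field": "price" } }
            }
        }
    }))
    .expect("valid aggregation request");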
225 examples/basic_search.rs Normal file
@@ -0,0 +1,225 @@
// # Basic Example
//
// This example covers the basic functionalities of
// tantivy.
//
// We will:
// - define our schema
// - create an index in a directory
// - index a few documents into our index
// - search for the best document matching a basic query
// - retrieve the best document's original content.

// ---
// Importing tantivy...
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example
    let index_path = TempDir::new()?;

    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // First we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // Our first field is title.
    // We want full-text search for it, and we also want
    // to be able to retrieve the document after the search.
    //
    // `TEXT | STORED` is some syntactic sugar to describe
    // that.
    //
    // `TEXT` means the field should be tokenized and indexed,
    // along with its term frequency and term positions.
    //
    // `STORED` means that the field will also be saved
    // in a compressed, row-oriented key-value store.
    // This store is useful for reconstructing the
    // documents that were selected during the search phase.
    schema_builder.add_text_field("title", TEXT | STORED);

    // Our second field is body.
    // We want full-text search for it, but we do not
    // need to be able to retrieve it
    // for our application.
    //
    // We can make our index lighter by omitting the `STORED` flag.
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    //
    // This will actually just save a meta.json
    // with our schema in the directory.
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    // To insert a document we will need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we give tantivy a budget of `50MB`.
    // Using a bigger memory arena for the indexer may increase
    // throughput, but 50 MB is already plenty.
    let mut index_writer = index.writer(50_000_000)?;

    // Let's index our documents!
    // We first need a handle on the title and the body field.

    // ### Adding documents
    //
    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let mut old_man_doc = Document::default();
    old_man_doc.add_text(title, "The Old Man and the Sea");
    old_man_doc.add_text(
        body,
        "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
         eighty-four days now without taking a fish.",
    );

    // ... and add it to the `IndexWriter`.
    index_writer.add_document(old_man_doc)?;

    // For convenience, tantivy also comes with a macro to
    // reduce the boilerplate above.
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ))?;

    // Multivalued fields just need to be repeated.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        title => "The Modern Prometheus",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                 enterprise which you have regarded with such evil forebodings. I arrived here \
                 yesterday, and my first task is to assure my dear sister of my welfare and \
                 increasing confidence in the success of my undertaking."
    ))?;

    // This is an example, so we will only index 3 documents
    // here. You can check out tantivy's tutorial to index
    // the English wikipedia. Tantivy's indexing is rather fast.
    // Indexing 5 million articles of the English wikipedia takes
    // around 3 minutes on my computer!

    // ### Committing
    //
    // At this point our documents are not searchable.
    //
    // We need to call `.commit()` explicitly to force the
    // `index_writer` to finish processing the documents in the queue,
    // flush the current index to the disk, and advertise
    // the existence of new documents.
    //
    // This call is blocking.
    index_writer.commit()?;

    // If `.commit()` returns correctly, then all of the
    // documents that have been added are guaranteed to be
    // persistently indexed.
    //
    // In the scenario of a crash or a power failure,
    // tantivy behaves as if it has rolled back to its last
    // commit.

    // # Searching
    //
    // ### Searcher
    //
    // A reader is required first in order to search an index.
    // It acts as a `Searcher` pool that reloads itself,
    // depending on a `ReloadPolicy`.
    //
    // For a search server you will typically create one reader for the entire lifetime of your
    // program, and acquire a new searcher for every single request.
    //
    // In the code below, we rely on the `OnCommit` policy: the reader
    // will reload the index automatically after each commit.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    // We now need to acquire a searcher.
    //
    // A searcher points to a snapshotted, immutable version of the index.
    //
    // Some search experiences might require more than
    // one query. Using the same searcher ensures that all of these queries will run on the
    // same version of the index.
    //
    // Acquiring a `searcher` is very cheap.
    //
    // You should acquire a searcher every time you start processing a request
    // and release it right after your query is finished.
    let searcher = reader.searcher();

    // ### Query

    // The query parser can interpret human queries.
    // Here, if the user does not specify which
    // field they want to search, tantivy will search
    // in both title and body.
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // `QueryParser` may fail if the query is not in the right
    // format. For user-facing applications, this can be a problem.
    // A ticket has been opened regarding this problem.
    let query = query_parser.parse_query("sea whale")?;

    // A query defines a set of documents, as
    // well as the way they should be scored.
    //
    // A query created by the query parser is scored according
    // to a metric called Tf-Idf, and will consider
    // any document matching at least one of our terms.

    // ### Collectors
    //
    // We are not interested in all of the documents but
    // only in the top 10. Keeping track of our top 10 best documents
    // is the role of the `TopDocs` collector.

    // We can now perform our query.
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

    // The actual documents still need to be
    // retrieved from Tantivy's store.
    //
    // Since the body field was not configured as stored,
    // the document returned will only contain
    // a title.
    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }

    Ok(())
}
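If you prefer to control exactly when new segments become visible, for instance in tests, `ReloadPolicy::Manual` plus an explicit `reload()` is a common alternative. A minimal sketch, using the same reader API as above:

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual)
        .try_into()?;
    // ... add documents and commit ...
    reader.reload()?; // make the new commit visible to fresh searchers
    let searcher = reader.searcher();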
183 examples/custom_collector.rs Normal file
@@ -0,0 +1,183 @@
// # Custom collector example
//
// This example shows how you can implement your own
// collector. As an example, we will implement a collector
// that computes the standard deviation of a given fast field.
//
// Of course, you can have a look at tantivy's built-in collectors
// such as the `CountCollector` for more examples.

use std::sync::Arc;

use columnar::column_values::ColumnValues;
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Score, SegmentReader};

#[derive(Default)]
struct Stats {
    count: usize,
    sum: f64,
    squared_sum: f64,
}

impl Stats {
    pub fn count(&self) -> usize {
        self.count
    }

    pub fn mean(&self) -> f64 {
        self.sum / (self.count as f64)
    }

    fn square_mean(&self) -> f64 {
        self.squared_sum / (self.count as f64)
    }

    pub fn standard_deviation(&self) -> f64 {
        let mean = self.mean();
        (self.square_mean() - mean * mean).sqrt()
    }

    fn non_zero_count(self) -> Option<Stats> {
        if self.count == 0 {
            None
        } else {
            Some(self)
        }
    }
}

struct StatsCollector {
    field: String,
}

impl StatsCollector {
    fn with_field(field: String) -> StatsCollector {
        StatsCollector { field }
    }
}

impl Collector for StatsCollector {
    // That's the type of our result:
    // the aggregated stats, if at least one document matched.
    type Fruit = Option<Stats>;

    type Child = StatsSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment_reader: &SegmentReader,
    ) -> tantivy::Result<StatsSegmentCollector> {
        let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
        Ok(StatsSegmentCollector {
            fast_field_reader,
            stats: Stats::default(),
        })
    }

    fn requires_scoring(&self) -> bool {
        // this collector does not care about scores.
        false
    }

    fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
        let mut stats = Stats::default();
        for segment_stats in segment_stats.into_iter().flatten() {
            stats.count += segment_stats.count;
            stats.sum += segment_stats.sum;
            stats.squared_sum += segment_stats.squared_sum;
        }
        Ok(stats.non_zero_count())
    }
}

struct StatsSegmentCollector {
    fast_field_reader: Arc<dyn ColumnValues>,
    stats: Stats,
}

impl SegmentCollector for StatsSegmentCollector {
    type Fruit = Option<Stats>;

    fn collect(&mut self, doc: u32, _score: Score) {
        let value = self.fast_field_reader.get_val(doc) as f64;
        self.stats.count += 1;
        self.stats.sum += value;
        self.stats.squared_sum += value * value;
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.stats.non_zero_count()
    }
}

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // first we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // We'll assume a fictional index containing
    // products, each with a name, a description, and a price.
    let product_name = schema_builder.add_text_field("name", TEXT);
    let product_description = schema_builder.add_text_field("description", TEXT);
    let price = schema_builder.add_u64_field("price", INDEXED | FAST);
    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's index a bunch of fake documents for the sake of
    // this example.
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        product_name => "Super Broom 2000",
        product_description => "While it is ok for short distance travel, this broom \
            was designed for quidditch. It will up your game.",
        price => 30_200u64
    ))?;
    index_writer.add_document(doc!(
        product_name => "Turbulobroom",
        product_description => "You might have heard of this broom before: it is the sponsor of the Wales team. \
            You'll enjoy its sharp turns, and rapid acceleration",
        price => 29_240u64
    ))?;
    index_writer.add_document(doc!(
        product_name => "Broomio",
        product_description => "Great value for the price. This broom is a market favorite",
        price => 21_240u64
    ))?;
    index_writer.add_document(doc!(
        product_name => "Whack a Mole",
        product_description => "Prime quality bat.",
        price => 5_200u64
    ))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);

    // here we want a hit on every product whose description mentions 'broom'
    let query = query_parser.parse_query("broom")?;
    if let Some(stats) =
        searcher.search(&query, &StatsCollector::with_field("price".to_string()))?
    {
        println!("count: {}", stats.count());
        println!("mean: {}", stats.mean());
        println!("standard deviation: {}", stats.standard_deviation());
    }

    Ok(())
}
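The reason `Stats` keeps running sums rather than a finished value is that sums merge trivially across segments: `merge_fruits` just adds up the per-segment `count`, `sum`, and `squared_sum`, and `standard_deviation()` then evaluates the one-pass identity Var(X) = E[X²] − (E[X])², i.e. `squared_sum / count - (sum / count)^2`, before taking the square root. Worth knowing: this one-pass formula can lose precision when values are large relative to their spread; Welford's online algorithm is the usual numerically stable alternative if that matters for your data.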
111 examples/custom_tokenizer.rs Normal file
@@ -0,0 +1,111 @@
// # Defining a tokenizer pipeline
//
// In this example, we'll see how to define a tokenizer pipeline
// by chaining a bunch of `TokenFilter`s.
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // first we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // Our first field is title.
    // In this example we want to use ngram search:
    // we will set the ngram size to 3 characters, so any three
    // consecutive characters in the title should be findable.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("ngram3")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();
    let title = schema_builder.add_text_field("title", text_options);

    // Our second field is body.
    // We want full-text search for it, but we do not
    // need to be able to retrieve it
    // for our application.
    //
    // We can make our index lighter by omitting the `STORED` flag.
    let body = schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    // To simplify, we will work entirely in RAM.
    // This is not what you want in reality, but it is very useful
    // for your unit tests... or this example.
    let index = Index::create_in_ram(schema.clone());

    // here we are registering our custom tokenizer:
    // it will emit tokens of 3 characters each
    index
        .tokenizers()
        .register("ngram3", NgramTokenizer::new(3, 3, false));

    // To insert documents we need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we use a buffer of 50MB per thread. Using a bigger
    // memory arena for the indexer can increase its throughput.
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
                 he had gone eighty-four days now without taking a fish."
    ))?;
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
                bank and runs deep and green. The water is warm too, for it has slipped twinkling
                over the yellow sands in the sunlight before reaching the narrow pool. On one
                side of the river the golden foothill slopes curve up to the strong and rocky
                Gabilan Mountains, but on the valley side the water is lined with trees—willows
                fresh and green with every spring, carrying in their lower leaf junctures the
                debris of the winter’s flooding; and sycamores with mottled, white, recumbent
                limbs and branches that arch over the pool"#
    ))?;
    index_writer.add_document(doc!(
        title => "Frankenstein",
        body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
                enterprise which you have regarded with such evil forebodings. I arrived here
                yesterday, and my first task is to assure my dear sister of my welfare and
                increasing confidence in the success of my undertaking."#
    ))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // The query parser can interpret human queries.
    // Here, if the user does not specify which
    // field they want to search, tantivy will search
    // in both title and body.
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // here we want to get a hit on the 'ken' in Frankenstein
    let query = query_parser.parse_query("ken")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

    for (_, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }

    Ok(())
}
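The example registers a bare `NgramTokenizer`, but the "pipeline" of the title comes from chaining token filters onto a tokenizer. A minimal sketch of a lowercasing ngram pipeline, assuming the `TextAnalyzer::from(...).filter(...)` chaining available around tantivy 0.19 (newer versions moved to a builder API, so adjust for your version):

    use tantivy::tokenizer::{LowerCaser, NgramTokenizer, TextAnalyzer};

    // Lowercase every ngram so queries become case-insensitive.
    let analyzer = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser);
    index.tokenizers().register("ngram3_lowercase", analyzer);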
69 examples/date_time_field.rs Normal file
@@ -0,0 +1,69 @@
// # DateTime field example
//
// This example shows how the DateTime field can be used.

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    let mut schema_builder = Schema::builder();
    let opts = DateOptions::from(INDEXED)
        .set_stored()
        .set_fast()
        .set_precision(tantivy::DatePrecision::Seconds);
    let occurred_at = schema_builder.add_date_field("occurred_at", opts);
    let event_type = schema_builder.add_text_field("event", STRING | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    let doc = schema.parse_document(
        r#"{
            "occurred_at": "2022-06-22T12:53:50.53Z",
            "event": "pull-request"
        }"#,
    )?;
    index_writer.add_document(doc)?;
    let doc = schema.parse_document(
        r#"{
            "occurred_at": "2022-06-22T13:00:00.22Z",
            "event": "comment"
        }"#,
    )?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // # Default fields: event_type
    let query_parser = QueryParser::for_index(&index, vec![event_type]);
    {
        let query = query_parser.parse_query("event:comment")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
        assert_eq!(count_docs.len(), 1);
    }
    {
        let query = query_parser
            .parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
        assert_eq!(count_docs.len(), 1);
        for (_score, doc_address) in count_docs {
            let retrieved_doc = searcher.doc(doc_address)?;
            assert!(matches!(
                retrieved_doc.get_first(occurred_at),
                Some(Value::Date(_))
            ));
            assert_eq!(
                schema.to_json(&retrieved_doc),
                r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
            );
        }
    }
    Ok(())
}
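A note on the range query above: the Lucene-style bracket syntax controls bound inclusivity, with `[` / `]` inclusive and `{` / `}` exclusive. So `occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}` includes the lower bound but excludes midnight; writing `occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z]` would make the upper bound inclusive as well.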
144 examples/deleting_updating_documents.rs Normal file
@@ -0,0 +1,144 @@
// # Deleting and Updating (?) documents
//
// This example explains how to delete and update documents.
// In fact, there is no such thing as an update in tantivy.
//
// To update a document, you need to delete it and then reinsert
// its new version.
//
// ---
// Importing tantivy...
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader};

// A simple helper function to fetch a single document
// given its id from our index.
// It will be helpful to check our work.
fn extract_doc_given_isbn(
    reader: &IndexReader,
    isbn_term: &Term,
) -> tantivy::Result<Option<Document>> {
    let searcher = reader.searcher();

    // This is the simplest query you can think of.
    // It matches all of the documents containing a specific term.
    //
    // The second argument tells tantivy that we don't care about decoding positions
    // or term frequencies.
    let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
    let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;

    if let Some((_score, doc_address)) = top_docs.first() {
        let doc = searcher.doc(*doc_address)?;
        Ok(Some(doc))
    } else {
        // no doc matching this ID.
        Ok(None)
    }
}

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // Check out the *basic_search* example if this makes
    // little sense to you.
    let mut schema_builder = Schema::builder();

    // Tantivy does not really have a notion of primary id.
    // This may change in the future.
    //
    // Still, we can create an `isbn` field and use it as an id. This
    // field can be a `u64` or a `text` field, depending on your use case.
    // It just needs to be indexed.
    //
    // If it is `text`, let's make sure to keep it `raw` and let's avoid
    // running any text processing on it.
    // This is done by associating this field to the tokenizer named `raw`.
    // Rather than building our
    // [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually, we
    // use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
    // and untokenized.
    //
    // Because we also want to be able to see this `id` in our returned documents,
    // we also mark the field as stored.
    let isbn = schema_builder.add_text_field("isbn", STRING | STORED);
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;

    // Let's add a couple of documents, for the sake of the example.
    let mut old_man_doc = Document::default();
    old_man_doc.add_text(title, "The Old Man and the Sea");
    index_writer.add_document(doc!(
        isbn => "978-0099908401",
        title => "The old Man and the see"
    ))?;
    index_writer.add_document(doc!(
        isbn => "978-0140177398",
        title => "Of Mice and Men",
    ))?;
    index_writer.add_document(doc!(
        title => "Frankentein", //< Oops there is a typo here.
        isbn => "978-9176370711",
    ))?;
    index_writer.commit()?;
    let reader = index.reader()?;

    let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");

    // Oops, our Frankenstein doc seems misspelled.
    let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
    assert_eq!(
        schema.to_json(&frankenstein_doc_misspelled),
        r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
    );

    // # Update = Delete + Insert
    //
    // Here we will want to fix the typo in the `Frankenstein` book.
    //
    // Tantivy does not handle updates directly: we need to delete
    // and reinsert the document.
    //
    // This can be complicated as it means you need to have access
    // to the entire document. It is good practice to integrate tantivy
    // with a key-value store for this reason.
    //
    // To remove one of the documents, we just call `delete_term`
    // on its id.
    //
    // Note that `tantivy` does nothing to enforce the idea that
    // there is only one document associated with this id.
    //
    // Also, you might have noticed that we apply the delete before
    // having committed. This does not really matter...
    index_writer.delete_term(frankenstein_isbn.clone());

    // We now need to reinsert our document without the typo.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        isbn => "978-9176370711",
    ))?;

    // You are guaranteed that your clients will only observe your index in
    // the state it was in after a commit.
    // In this example, your search engine will at no point be missing the *Frankenstein* document.
    // Everything happens as if the document was updated.
    index_writer.commit()?;
    // We reload our searcher to make our change available to clients.
    reader.reload()?;

    // No more typo!
    let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
    assert_eq!(
        schema.to_json(&frankenstein_new_doc),
        r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
    );

    Ok(())
}
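Wrapping the delete-then-reinsert dance in a small helper makes the "update" pattern explicit. A sketch built only from the APIs used above; `update_book` and its parameters are illustrative names, and the change only becomes visible after a commit:

    // "Update" a book: delete every document carrying this isbn,
    // then insert the corrected version.
    fn update_book(
        index_writer: &tantivy::IndexWriter,
        isbn: tantivy::schema::Field,
        title: tantivy::schema::Field,
        isbn_value: &str,
        new_title: &str,
    ) -> tantivy::Result<()> {
        index_writer.delete_term(Term::from_field_text(isbn, isbn_value));
        index_writer.add_document(doc!(
            isbn => isbn_value,
            title => new_title,
        ))?;
        Ok(())
    }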
114 examples/faceted_search.rs Normal file
@@ -0,0 +1,114 @@
// # Faceted Search
//
// This example covers the faceted search functionalities of
// tantivy.
//
// We will:
// - define a text field "name" in our schema
// - define a facet field "classification" in our schema
// - create an index in memory
// - index a few documents with their respective facets
// - search and count the number of documents whose classification starts with the facet
//   "/Felidae"
// - search the facet "/Felidae/Pantherinae" and count the number of documents whose
//   classification includes it
//
// ---
// Importing tantivy...
use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*;
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    let mut schema_builder = Schema::builder();

    let name = schema_builder.add_text_field("name", TEXT | STORED);
    // this is our faceted field: its scientific classification
    let classification = schema_builder.add_facet_field("classification", FacetOptions::default());

    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(30_000_000)?;

    // Let's add our documents, with their facets, using the `doc!` macro.
    index_writer.add_document(doc!(
        name => "Cat",
        classification => Facet::from("/Felidae/Felinae/Felis")
    ))?;
    index_writer.add_document(doc!(
        name => "Canada lynx",
        classification => Facet::from("/Felidae/Felinae/Lynx")
    ))?;
    index_writer.add_document(doc!(
        name => "Cheetah",
        classification => Facet::from("/Felidae/Felinae/Acinonyx")
    ))?;
    index_writer.add_document(doc!(
        name => "Tiger",
        classification => Facet::from("/Felidae/Pantherinae/Panthera")
    ))?;
    index_writer.add_document(doc!(
        name => "Lion",
        classification => Facet::from("/Felidae/Pantherinae/Panthera")
    ))?;
    index_writer.add_document(doc!(
        name => "Jaguar",
        classification => Facet::from("/Felidae/Pantherinae/Panthera")
    ))?;
    index_writer.add_document(doc!(
        name => "Sunda clouded leopard",
        classification => Facet::from("/Felidae/Pantherinae/Neofelis")
    ))?;
    index_writer.add_document(doc!(
        name => "Fossa",
        classification => Facet::from("/Eupleridae/Cryptoprocta")
    ))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    {
        let mut facet_collector = FacetCollector::for_field("classification");
        facet_collector.add_facet("/Felidae");
        let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
        // This lists all of the facet counts, right below "/Felidae".
        let facets: Vec<(&Facet, u64)> = facet_counts.get("/Felidae").collect();
        assert_eq!(
            facets,
            vec![
                (&Facet::from("/Felidae/Felinae"), 3),
                (&Facet::from("/Felidae/Pantherinae"), 4),
            ]
        );
    }

    // Facets are also searchable.
    //
    // For instance, a common UI pattern is to allow the user to click on a facet link
    // (e.g: `Pantherinae`) to drill down and filter the current result set with this subfacet.
    //
    // The search would then look as follows.

    // Check the reference doc for different ways to create a `Facet` object.
    {
        let facet = Facet::from("/Felidae/Pantherinae");
        let facet_term = Term::from_facet(classification, &facet);
        let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
        let mut facet_collector = FacetCollector::for_field("classification");
        facet_collector.add_facet("/Felidae/Pantherinae");
        let facet_counts = searcher.search(&facet_term_query, &facet_collector)?;
        let facets: Vec<(&Facet, u64)> = facet_counts.get("/Felidae/Pantherinae").collect();
        assert_eq!(
            facets,
            vec![
                (&Facet::from("/Felidae/Pantherinae/Neofelis"), 1),
                (&Facet::from("/Felidae/Pantherinae/Panthera"), 3),
            ]
        );
    }

    Ok(())
}
95 examples/faceted_search_with_tweaked_score.rs Normal file
@@ -0,0 +1,95 @@
use std::collections::HashSet;

use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, Score, SegmentReader};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();

    let title = schema_builder.add_text_field("title", STORED);
    let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default());

    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(30_000_000)?;

    index_writer.add_document(doc!(
        title => "Fried egg",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/oil"),
    ))?;
    index_writer.add_document(doc!(
        title => "Scrambled egg",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/butter"),
        ingredient => Facet::from("/ingredient/milk"),
        ingredient => Facet::from("/ingredient/salt"),
    ))?;
    index_writer.add_document(doc!(
        title => "Egg rolls",
        ingredient => Facet::from("/ingredient/egg"),
        ingredient => Facet::from("/ingredient/garlic"),
        ingredient => Facet::from("/ingredient/salt"),
        ingredient => Facet::from("/ingredient/oil"),
        ingredient => Facet::from("/ingredient/tortilla-wrap"),
        ingredient => Facet::from("/ingredient/mushroom"),
    ))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    {
        // The query matches any recipe containing at least one of these ingredients.
        let facets = vec![
            Facet::from("/ingredient/egg"),
            Facet::from("/ingredient/oil"),
            Facet::from("/ingredient/garlic"),
            Facet::from("/ingredient/mushroom"),
        ];
        let query = BooleanQuery::new_multiterms_query(
            facets
                .iter()
                .map(|key| Term::from_facet(ingredient, key))
                .collect(),
        );
        let top_docs_by_custom_score =
            TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
                let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
                let facet_dict = ingredient_reader.facet_dict();

                // Resolve the query's facets to their term ordinals in this
                // segment's facet dictionary.
                let query_ords: HashSet<u64> = facets
                    .iter()
                    .filter_map(|key| facet_dict.term_ord(key.encoded_str()).unwrap())
                    .collect();

                move |doc: DocId, original_score: Score| {
                    // Count the ingredients of this recipe that were NOT part of
                    // the query, and damp the score by a factor of 4 for each one.
                    let missing_ingredients = ingredient_reader
                        .facet_ords(doc)
                        .filter(|ord| !query_ords.contains(ord))
                        .count();
                    let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32);

                    original_score * tweak
                }
            });
        let top_docs = searcher.search(&query, &top_docs_by_custom_score)?;

        let titles: Vec<String> = top_docs
            .iter()
            .map(|(_, doc_id)| {
                searcher
                    .doc(*doc_id)
                    .unwrap()
                    .get_first(title)
                    .unwrap()
                    .as_text()
                    .unwrap()
                    .to_owned()
            })
            .collect();
        assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);
    }
    Ok(())
}
35 examples/integer_range_search.rs Normal file
@@ -0,0 +1,35 @@
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema.
// You can use RangeQuery to get a Count of all occurrences in a given range.
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, Result};

fn main() -> Result<()> {
    // For the sake of simplicity, this schema will only have 1 field
    let mut schema_builder = Schema::builder();

    // `INDEXED` is a short-hand to indicate that our field should be "searchable".
    let year_field = schema_builder.add_u64_field("year", INDEXED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let reader = index.reader()?;
    {
        let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
        for year in 1950u64..2019u64 {
            index_writer.add_document(doc!(year_field => year))?;
        }
        index_writer.commit()?;
        // The index will be a range of years
    }
    reader.reload()?;
    let searcher = reader.searcher();
    // The end is excluded, i.e. here we are searching up to 1969
    let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
    // Uses a Count collector to count the total number of docs in the range
    let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
    assert_eq!(num_60s_books, 10);
    Ok(())
}
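If a half-open `Range` is awkward for your use case, explicit bounds express the same search with both endpoints inclusive. A sketch, assuming the `new_u64_bounds` constructor that accompanies `new_u64` in this version of tantivy:

    use std::ops::Bound;

    // Same search as above, with 1960 and 1969 both included.
    let docs_in_the_sixties = RangeQuery::new_u64_bounds(
        "year".to_string(),
        Bound::Included(1960),
        Bound::Included(1969),
    );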
73 examples/ip_field.rs Normal file
@@ -0,0 +1,73 @@
// # IP address example
//
// This example shows how the ip field can be used
// with both IPv4 and IPv6 addresses.

use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    let mut schema_builder = Schema::builder();
    let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
    let ip = schema_builder.add_ip_addr_field("ip", STORED | INDEXED | FAST);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    let doc = schema.parse_document(
        r#"{
            "ip": "192.168.0.33",
            "event_type": "login"
        }"#,
    )?;
    index_writer.add_document(doc)?;
    let doc = schema.parse_document(
        r#"{
            "ip": "192.168.0.80",
            "event_type": "checkout"
        }"#,
    )?;
    index_writer.add_document(doc)?;
    let doc = schema.parse_document(
        r#"{
            "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
            "event_type": "checkout"
        }"#,
    )?;

    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![event_type, ip]);
    {
        let query = query_parser.parse_query("ip:[192.168.0.0 TO 192.168.0.100]")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
        assert_eq!(count_docs.len(), 2);
    }
    {
        let query = query_parser.parse_query("ip:[192.168.1.0 TO 192.168.1.100]")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(count_docs.len(), 0);
    }
    {
        let query = query_parser.parse_query("ip:192.168.0.80")?;
        let count_docs = searcher.search(&*query, &Count)?;
        assert_eq!(count_docs, 1);
    }
    {
        // The IPv6 address needs to be escaped because it contains `:`
        let query = query_parser.parse_query("ip:\"2001:0db8:85a3:0000:0000:8a2e:0370:7334\"")?;
        let count_docs = searcher.search(&*query, &Count)?;
        assert_eq!(count_docs, 1);
    }

    Ok(())
}
135 examples/iterating_docs_and_positions.rs Normal file
@@ -0,0 +1,135 @@
// # Iterating docs and positions.
//
// At its core, tantivy relies on a data structure
// called an inverted index.
//
// This example shows how to manually iterate through
// the list of documents containing a term, getting
// its term frequency, and accessing its positions.

// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};

fn main() -> tantivy::Result<()> {
    // We first create a schema for the sake of the
    // example. Check the `basic_search` example for more information.
    let mut schema_builder = Schema::builder();

    // For this example, we need to make sure to index positions for our title
    // field. `TEXT` precisely does this.
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
    index_writer.add_document(doc!(title => "The modern Promotheus"))?;
    index_writer.commit()?;

    let reader = index.reader()?;

    let searcher = reader.searcher();

    // A tantivy index is actually a collection of segments.
    // Similarly, a searcher just wraps a list of `SegmentReader`s.
    //
    // (Because we indexed a very small number of documents over one thread
    // there is actually only one segment here, but let's iterate through the list
    // anyway.)
    for segment_reader in searcher.segment_readers() {
        // A segment contains different data structures.
        // The inverted index is the combination of
        // - the term dictionary
        // - the inverted lists associated with each term, and their positions
        let inverted_index = segment_reader.inverted_index(title)?;

        // A `Term` is a text token associated with a field.
        // Let's go through all docs containing the term `title:the` and access their positions.
        let term_the = Term::from_field_text(title, "the");

        // This segment postings object is like a cursor over the documents matching the term.
        // The `IndexRecordOption` argument tells tantivy we will be interested in both term
        // frequencies and positions.
        //
        // If you don't need all this information, you may get better performance by decompressing
        // less information.
        if let Some(mut segment_postings) =
            inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)?
        {
            // this buffer will be used to request positions
            let mut positions: Vec<u32> = Vec::with_capacity(100);
            let mut doc_id = segment_postings.doc();
            while doc_id != TERMINATED {
                // This MAY contain deleted documents as well.
                if segment_reader.is_deleted(doc_id) {
                    doc_id = segment_postings.advance();
                    continue;
                }

                // the number of times the term appears in the document.
                let term_freq: u32 = segment_postings.term_freq();
                // Accessing positions is slightly expensive and lazy; do not request
                // them if you don't need them for some documents.
                segment_postings.positions(&mut positions);

                // By definition we should have `term_freq` positions.
                assert_eq!(positions.len(), term_freq as usize);

                // This prints:
                // ```
                // Doc 0: TermFreq 2: [0, 4]
                // Doc 2: TermFreq 1: [0]
                // ```
                println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
                doc_id = segment_postings.advance();
            }
        }
    }

    // A `Term` is a text token associated with a field.
    // Let's go through all docs containing the term `title:the` and access their positions.
    let term_the = Term::from_field_text(title, "the");

    // Some other powerful operations (especially `.skip_to`) may be useful to consume these
    // posting lists rapidly.
    // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
    // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait.

    // Also, for some VERY specific high-performance use cases, like an OLAP analysis of logs,
    // you can get better performance by directly accessing the blocks of doc ids.
    for segment_reader in searcher.segment_readers() {
        // A segment contains different data structures.
        // The inverted index is the combination of
        // - the term dictionary
        // - the inverted lists associated with each term, and their positions
        let inverted_index = segment_reader.inverted_index(title)?;

        // This block postings object is like a cursor over blocks of documents matching the term.
        // Here, `IndexRecordOption::Basic` tells tantivy we are not interested in term
        // frequencies or positions.
        if let Some(mut block_segment_postings) =
            inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
        {
            loop {
                let docs = block_segment_postings.docs();
                if docs.is_empty() {
                    break;
                }
                // Once again these docs MAY contain deleted documents as well.
                // Prints `Docs [0, 2].`
                println!("Docs {:?}", docs);
                block_segment_postings.advance();
            }
        }
    }

    Ok(())
}
105 examples/json_field.rs Normal file
@@ -0,0 +1,105 @@
// # JSON field example
//
// This example shows how the json field can be used
// to make tantivy partially schemaless, by setting it as a
// default query parser field.

use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    let mut schema_builder = Schema::builder();
    schema_builder.add_date_field("timestamp", FAST | STORED);
    let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
    let attributes = schema_builder.add_json_field("attributes", STORED | TEXT);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    let doc = schema.parse_document(
        r#"{
            "timestamp": "2022-02-22T23:20:50.53Z",
            "event_type": "click",
            "attributes": {
                "target": "submit-button",
                "cart": {"product_id": 103},
                "description": "the best vacuum cleaner ever"
            }
        }"#,
    )?;
    index_writer.add_document(doc)?;
    let doc = schema.parse_document(
        r#"{
            "timestamp": "2022-02-22T23:20:51.53Z",
            "event_type": "click",
            "attributes": {
                "target": "submit-button",
                "cart": {"product_id": 133},
                "description": "das keyboard",
                "event_type": "holiday-sale"
            }
        }"#,
    )?;
    index_writer.add_document(doc)?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // # Default fields: event_type and attributes
    // Setting attributes as a default field allows omitting the "attributes" prefix,
    // e.g. "target" instead of "attributes.target".
    let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
    {
        let query = query_parser.parse_query("target:submit-button")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(count_docs.len(), 2);
    }
    {
        let query = query_parser.parse_query("target:submit")?;
        let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(count_docs.len(), 2);
    }
    {
        let query = query_parser.parse_query("cart.product_id:103")?;
        let count_docs = searcher.search(&*query, &Count)?;
        assert_eq!(count_docs, 1);
    }
    {
        let query = query_parser.parse_query("click AND cart.product_id:133")?;
        let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(hits.len(), 1);
    }
    {
        // The sub-fields in the json field marked as default field still need to be explicitly
        // addressed
        let query = query_parser.parse_query("click AND 133")?;
        let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(hits.len(), 0);
    }
    {
        // Default json fields are ignored if they collide with the schema
        let query = query_parser.parse_query("event_type:holiday-sale")?;
        let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(hits.len(), 0);
    }
    // # Query via full attribute path
    {
        // This only searches in our schema's `event_type` field
        let query = query_parser.parse_query("event_type:click")?;
        let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(hits.len(), 2);
    }
    {
        // Default json fields can still be accessed by full path
        let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
        let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
        assert_eq!(hits.len(), 1);
    }
    Ok(())
}
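Instead of parsing a JSON string with `schema.parse_document`, a json field can also be filled programmatically via `Document::add_json_object`, which takes a `serde_json::Map`. A minimal sketch with hypothetical attribute values:

    use serde_json::json;

    let mut doc = tantivy::Document::default();
    doc.add_text(event_type, "click");
    // json!(...) builds a serde_json::Value; we only need its object variant.
    if let serde_json::Value::Object(attrs) = json!({
        "target": "checkout-button",      // hypothetical value
        "cart": { "product_id": 999 }     // hypothetical value
    }) {
        doc.add_json_object(attributes, attrs);
    }
    index_writer.add_document(doc)?;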
104
examples/multiple_producer.rs
Normal file
104
examples/multiple_producer.rs
Normal file
@@ -0,0 +1,104 @@
// # Indexing from different threads.
//
// It is fairly common to have to index from different threads.
// Tantivy forbids creating more than one `IndexWriter` at a time.
//
// The `IndexWriter` itself has its own multithreaded layer, so managing your own
// indexing threads will not speed indexing up. It can still be useful for some
// applications, however.
//
// For instance, if preparing documents to send to tantivy is the bottleneck of
// your application, it is reasonable to prepare them on multiple threads.
//
// Another very common reason to index from multiple threads is implementing a
// webserver with CRUD capabilities. The server framework will most likely handle
// requests from different threads.
//
// The recommended way to address both of these use cases is to wrap your
// `IndexWriter` in an `Arc<RwLock<IndexWriter>>`.
//
// While this may be counterintuitive, adding and deleting documents do not
// require mutable access to the `IndexWriter`, so several threads can perform
// these operations concurrently.
//
// The example below does not represent an actual real-life use case (who would
// spawn a thread to index a single document?), but aims at demonstrating the
// mechanism that makes indexing from several threads possible.

// ---
// Importing tantivy...
use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;

use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));

    // # First indexing thread.
    let index_writer_clone_1 = index_writer.clone();
    thread::spawn(move || {
        // We index the same document 100 times... for the sake of the example.
        for i in 0..100 {
            let opstamp = index_writer_clone_1
                .read().unwrap() //< A read lock is sufficient here.
                .add_document(doc!(
                    title => "Of Mice and Men",
                    body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                        bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                        over the yellow sands in the sunlight before reaching the narrow pool. On one \
                        side of the river the golden foothill slopes curve up to the strong and rocky \
                        Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                        fresh and green with every spring, carrying in their lower leaf junctures the \
                        debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                        limbs and branches that arch over the pool"
                ))?;
            println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
            thread::sleep(Duration::from_millis(20));
        }
        Result::<(), TantivyError>::Ok(())
    });

    // # Second indexing thread.
    let index_writer_clone_2 = index_writer.clone();
    // This time, we scope the read lock explicitly in a block,
    // so that it is released before sleeping.
    thread::spawn(move || {
        // We index the same document 100 times... for the sake of the example.
        for i in 0..100 {
            // A read lock is sufficient here.
            let opstamp = {
                let index_writer_rlock = index_writer_clone_2.read().unwrap();
                index_writer_rlock.add_document(doc!(
                    title => "Manufacturing consent",
                    body => "Some great book description..."
                ))?
            };
            println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
            thread::sleep(Duration::from_millis(10));
        }
        Result::<(), TantivyError>::Ok(())
    });

    // # In the main thread, we commit 10 times, once every 500ms.
    for _ in 0..10 {
        let opstamp: Opstamp = {
            // Committing or rolling back, on the other hand, requires a write
            // lock, which will block the other threads.
            let mut index_writer_wlock = index_writer.write().unwrap();
            index_writer_wlock.commit()?
        };
        println!("committed with opstamp {}", opstamp);
        thread::sleep(Duration::from_millis(500));
    }
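
    // Note (not part of the original example): the two indexing threads were not
    // joined, so documents added after the last commit above are not yet
    // searchable. A real application would keep the `JoinHandle`s returned by
    // `thread::spawn`, join them, and issue one final commit, e.g.:
    //
    //     handle_1.join().unwrap()?;
    //     handle_2.join().unwrap()?;
    //     index_writer.write().unwrap().commit()?;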

    Ok(())
}
135
examples/pre_tokenized_text.rs
Normal file
@@ -0,0 +1,135 @@
// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search text that has already been split into
// tokens by some external tool.
//
// In this example we will:
// - use a tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from json,
// - perform a search on documents with pre-tokenized text.

use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, Tokenizer};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();

    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document.
    // We create a `PreTokenizedString`, which contains the original text and a
    // vector of tokens.
    let title_tok = PreTokenizedString {
        text: String::from(title_text),
        tokens: pre_tokenize_text(title_text),
    };

    println!(
        "Original text: \"{}\" and tokens: {:?}",
        title_tok.text, title_tok.tokens
    );

    let body_tok = PreTokenizedString {
        text: String::from(body_text),
        tokens: pre_tokenize_text(body_text),
    };

    // Now let's create a document and add our `PreTokenizedString`s ...
    let old_man_doc = doc!(title => title_tok, body => body_tok);

    // ... and add it to the IndexWriter.
    index_writer.add_document(old_man_doc)?;

    // Pre-tokenized text can also be fed in as JSON.
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = schema.parse_document(short_man_json)?;

    index_writer.add_document(short_man_doc)?;

    // Let's commit our changes.
    index_writer.commit()?;

    // ... and now it is time to query our index.

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want the documents containing the token "Man"; we will use a TermQuery
    // for that. Using `PreTokenizedString` means the tokens are stored as-is,
    // avoiding stemming and lowercasing, which preserves full words in their
    // original form.
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

    assert_eq!(count, 2);

    // Now let's print out the results.
    // Note that the tokens are not stored along with the original text
    // in the document store.
    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("Document: {}", schema.to_json(&retrieved_doc));
    }

    // In contrast to the previous query, searching for the term "man" should
    // yield no results, as it is not one of the indexed tokens. SimpleTokenizer
    // only splits text on whitespace and punctuation; it does not lowercase.

    let query = TermQuery::new(
        Term::from_field_text(title, "man"),
        IndexRecordOption::Basic,
    );

    let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

    assert_eq!(count, 0);

    Ok(())
}
85
examples/snippet.rs
Normal file
@@ -0,0 +1,85 @@
// # Snippet example
//
// This example shows how to return a representative snippet of
// your hit results.
// A snippet is an extract of the target document, returned in HTML format.
// The keywords searched by the user are highlighted with a `<b>` tag.

// ---
// Importing tantivy...
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example.
    let index_path = TempDir::new()?;

    // # Defining the schema
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_dir(&index_path, schema)?;

    let mut index_writer = index.writer(50_000_000)?;

    // We'll only need one doc for this example.
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
            bank and runs deep and green. The water is warm too, for it has slipped twinkling \
            over the yellow sands in the sunlight before reaching the narrow pool. On one \
            side of the river the golden foothill slopes curve up to the strong and rocky \
            Gabilan Mountains, but on the valley side the water is lined with trees—willows \
            fresh and green with every spring, carrying in their lower leaf junctures the \
            debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
            limbs and branches that arch over the pool"
    ))?;
    // ...
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("sycamore spring")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
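
    // Not in the original example: the snippet length can be tuned. Binding the
    // generator as `mut` and calling `snippet_generator.set_max_num_chars(100)`
    // caps each snippet at roughly 100 characters.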

    for (score, doc_address) in top_docs {
        let doc = searcher.doc(doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("Document score {}:", score);
        println!(
            "title: {}",
            doc.get_first(title).unwrap().as_text().unwrap()
        );
        println!("snippet: {}", snippet.to_html());
        println!("custom highlighting: {}", highlight(snippet));
    }

    Ok(())
}

// A custom renderer for the snippet: instead of emitting HTML, it wraps each
// highlighted range in ` --> ` / ` <-- ` markers.
fn highlight(snippet: Snippet) -> String {
    let mut result = String::new();
    let mut start_from = 0;

    for fragment_range in snippet.highlighted() {
        result.push_str(&snippet.fragment()[start_from..fragment_range.start]);
        result.push_str(" --> ");
        result.push_str(&snippet.fragment()[fragment_range.clone()]);
        result.push_str(" <-- ");
        start_from = fragment_range.end;
    }

    result.push_str(&snippet.fragment()[start_from..]);
    result
}
113
examples/stop_words.rs
Normal file
@@ -0,0 +1,113 @@
// # Stop Words Example
//
// This example covers the basic usage of stop words
// with tantivy.
//
// We will:
// - define our schema
// - create an index in a directory
// - add a few stop words
// - index a few documents in our index

// ---
// Importing tantivy...
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // This example assumes you understand the content in `basic_search`.
    let mut schema_builder = Schema::builder();

    // This configures how tantivy will store and process your
    // content in the index. The key thing to note is that we are
    // setting the tokenizer to `stoppy`, which will be defined
    // and registered below.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("stoppy")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();

    // Our first field is title.
    schema_builder.add_text_field("title", text_options);

    // Our second field is body.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("stoppy")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();
    schema_builder.add_text_field("body", text_options);

    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());

    // This tokenizer lowercases all of the text (to help with stop word matching),
    // then removes all instances of `the` and `and` from the corpus.
    let tokenizer = TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .filter(StopWordFilter::remove(vec![
            "the".to_string(),
            "and".to_string(),
        ]));

    index.tokenizers().register("stoppy", tokenizer);
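
    // The tokenizer must be registered under the same name the schema refers to
    // (`stoppy` here), and before any document is indexed; otherwise indexing
    // cannot resolve the tokenizer and will fail.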

    let mut index_writer = index.writer(50_000_000)?;

    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    index_writer.add_document(doc!(
        title => "The Old Man and the Sea",
        body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
            he had gone eighty-four days now without taking a fish."
    ))?;

    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
            bank and runs deep and green. The water is warm too, for it has slipped twinkling \
            over the yellow sands in the sunlight before reaching the narrow pool. On one \
            side of the river the golden foothill slopes curve up to the strong and rocky \
            Gabilan Mountains, but on the valley side the water is lined with trees—willows \
            fresh and green with every spring, carrying in their lower leaf junctures the \
            debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
            limbs and branches that arch over the pool"
    ))?;

    index_writer.add_document(doc!(
        title => "Frankenstein",
        body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
            enterprise which you have regarded with such evil forebodings. I arrived here \
            yesterday, and my first task is to assure my dear sister of my welfare and \
            increasing confidence in the success of my undertaking."
    ))?;

    index_writer.commit()?;

    let reader = index.reader()?;

    let searcher = reader.searcher();

    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // Stop words are applied to the query as well:
    // the following is equivalent to `title:frankenstein`.
    let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

    for (score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("\n==\nDocument score {}:", score);
        println!("{}", schema.to_json(&retrieved_doc));
    }

    Ok(())
}
219
examples/warmer.rs
Normal file
@@ -0,0 +1,219 @@
use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
    doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
    SegmentReader, Warmer,
};

// This example shows how warmers can be used to
// load values from an external source using the Warmer API.
//
// In this example, we assume an e-commerce search engine.

type ProductId = u64;

/// Price
type Price = u32;

pub trait PriceFetcher: Send + Sync + 'static {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}

struct DynamicPriceColumn {
    field: String,
    price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
    price_fetcher: Box<dyn PriceFetcher>,
}
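
// The cache is keyed by (SegmentId, Option<Opstamp>): the delete opstamp is part
// of the key because deletes change which doc ids are alive, and therefore the
// contents of the per-segment price column built in `warm` below.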

impl DynamicPriceColumn {
    pub fn with_product_id_field<T: PriceFetcher>(field: String, price_fetcher: T) -> Self {
        DynamicPriceColumn {
            field,
            price_cache: Default::default(),
            price_fetcher: Box::new(price_fetcher),
        }
    }

    pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
        let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
        self.price_cache.read().unwrap().get(&segment_key).cloned()
    }
}

impl Warmer for DynamicPriceColumn {
    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
        for segment in searcher.segment_readers() {
            let key = (segment.segment_id(), segment.delete_opstamp());
            let product_id_reader = segment.fast_fields().u64(&self.field)?;
            let product_ids: Vec<ProductId> = segment
                .doc_ids_alive()
                .map(|doc| product_id_reader.get_val(doc))
                .collect();
            let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
            let mut price_vals: Vec<Price> = Vec::new();
            for doc in 0..segment.max_doc() {
                if segment.is_deleted(doc) {
                    price_vals.push(0);
                } else {
                    price_vals.push(prices_it.next().unwrap())
                }
            }
            self.price_cache
                .write()
                .unwrap()
                .insert(key, Arc::new(price_vals));
        }
        Ok(())
    }

    fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
        let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
            live_generations
                .iter()
                .flat_map(|gen| gen.segments())
                .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
                .collect();
        let mut price_cache_wrt = self.price_cache.write().unwrap();
        // We only keep the cache entries whose segments are still referenced by a
        // live searcher generation. A drain would be nicer here.
        *price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
            .into_iter()
            .filter(|(seg_id_and_op, _)| live_segment_id_and_delete_ops.contains(seg_id_and_op))
            .collect();
    }
}

/// For the sake of this example, the table is just an editable HashMap behind a
/// RwLock, mapping ProductId -> Price.
///
/// In practice, it could fetch prices from an external service, like a SQL table.
#[derive(Default, Clone)]
pub struct ExternalPriceTable {
    prices: Arc<RwLock<HashMap<ProductId, Price>>>,
}

impl ExternalPriceTable {
    pub fn update_price(&self, product_id: ProductId, price: Price) {
        let mut prices_wrt = self.prices.write().unwrap();
        prices_wrt.insert(product_id, price);
    }
}

impl PriceFetcher for ExternalPriceTable {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
        let prices_read = self.prices.read().unwrap();
        product_ids
            .iter()
            .map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
            .collect()
    }
}

fn main() -> tantivy::Result<()> {
    // Declaring our schema.
    let mut schema_builder = Schema::builder();
    // The product id is assumed to be a primary id for our external price source.
    let product_id = schema_builder.add_u64_field("product_id", FAST);
    let text = schema_builder.add_text_field("text", TEXT);
    let schema: Schema = schema_builder.build();

    const OLIVE_OIL: ProductId = 323423;
    const GLOVES: ProductId = 3966623;
    const SNEAKERS: ProductId = 23222;

    let price_table = ExternalPriceTable::default();
    let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
        "product_id".to_string(),
        price_table.clone(),
    ));
    price_table.update_price(OLIVE_OIL, 12);
    price_table.update_price(GLOVES, 13);
    price_table.update_price(SNEAKERS, 80);

    let index = Index::create_in_ram(schema);
    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
    writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
    writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
    writer.commit()?;

    // The reader only holds weak references to its warmers.
    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
        &(price_dynamic_column.clone() as Arc<dyn Warmer>),
    )];
    let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
    reader.reload()?;

    let query_parser = QueryParser::for_index(&index, vec![text]);
    let query = query_parser.parse_query("cooking")?;

    let searcher = reader.searcher();
    let score_by_price = move |segment_reader: &SegmentReader| {
        let price = price_dynamic_column
            .price_for_segment(segment_reader)
            .unwrap();
        move |doc_id: DocId| Reverse(price[doc_id as usize])
    };

    // Since the custom score is wrapped in `Reverse`, the "top" documents are
    // the cheapest ones.
    let cheapest_first = TopDocs::with_limit(10).custom_score(score_by_price);

    let hits = searcher.search(&query, &cheapest_first)?;
    assert_eq!(
        &hits,
        &[
            (
                Reverse(12u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 0u32
                }
            ),
            (
                Reverse(13u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 1u32
                }
            ),
        ]
    );

    // Olive oil just got more expensive!
    price_table.update_price(OLIVE_OIL, 15);

    // The price update is directly reflected on `reload`.
    //
    // Be careful here though!...
    // You may have spotted that we are still using the same `Searcher`. The new
    // prices are visible to it anyway, because re-warming overwrote the cache
    // entry for the same (SegmentId, Option<Opstamp>) key.
    //
    // It is up to the `Warmer` implementer to decide how
    // to control this behavior.

    reader.reload()?;

    let hits_with_new_prices = searcher.search(&query, &cheapest_first)?;
    assert_eq!(
        &hits_with_new_prices,
        &[
            (
                Reverse(13u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 1u32
                }
            ),
            (
                Reverse(15u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 0u32
                }
            ),
        ]
    );

    Ok(())
}
40
examples/working_with_json.rs
Normal file
@@ -0,0 +1,40 @@
use tantivy::schema::*;

// # Document from json
//
// For convenience, `Document` can be parsed directly from json.
fn main() -> tantivy::Result<()> {
    // Let's first define a schema and an index.
    // Check out the basic example if this is confusing to you.
    //
    // First we need to define a schema ...
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    schema_builder.add_u64_field("year", INDEXED);
    let schema = schema_builder.build();

    // Let's assume we have a json-serialized document.
    let mice_and_men_doc_json = r#"{
        "title": "Of Mice and Men",
        "year": 1937
    }"#;

    // We can parse our document.
    let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;

    // Multi-valued fields are allowed; they are
    // expressed in JSON by an array.
    // The following document has two titles.
    let frankenstein_json = r#"{
        "title": ["Frankenstein", "The Modern Prometheus"],
        "year": 1818
    }"#;
    let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
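
    // `parse_document` returns a `Result`: malformed JSON, or values that do
    // not match the schema's field types, surface as errors rather than panics.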

    // Note that the schema is saved in your index directory.
    //
    // As a result, indexes are aware of their schema, and you can use this
    // feature simply by opening an existing `Index` and calling
    // `index.schema().parse_document(json)`.
    Ok(())
}