mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
update examples for literate docs (#1880)
@@ -1,129 +1,319 @@
 // # Aggregation example
 //
 // This example shows how you can use built-in aggregations.
-// We will use range buckets and compute the average in each bucket.
+//
+// We will use nested aggregations with buckets and metrics:
+// - Range buckets and compute the average in each bucket.
+// - Term aggregation and compute the min price in each bucket.
+// ---

-use serde_json::Value;
+use serde_json::{Deserializer, Value};
 use tantivy::aggregation::agg_req::{
     Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
     RangeAggregation,
 };
 use tantivy::aggregation::agg_result::AggregationResults;
 use tantivy::aggregation::bucket::RangeAggregationRange;
 use tantivy::aggregation::metric::AverageAggregation;
 use tantivy::aggregation::AggregationCollector;
-use tantivy::query::TermQuery;
-use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
-use tantivy::{doc, Index, Term};
+use tantivy::query::AllQuery;
+use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
+use tantivy::Index;

 fn main() -> tantivy::Result<()> {
+    // # Create Schema
+    //
+    // Let's create a schema for a footwear shop with 4 fields: name, category, stock and price.
+    // category, stock and price will be fast fields, since fast fields are required for
+    // aggregation queries.
+    //
     let mut schema_builder = Schema::builder();
+    // In preparation for the `TermsAggregation`, the category field is configured with:
+    // - `set_fast`
+    // - the `raw` tokenizer
+    //
+    // The tokenizer is set to "raw" because the fast field uses the same dictionary as the
+    // inverted index. (This behaviour will change in tantivy 0.20, where the fast field will
+    // always be raw-tokenized, independently of the regular tokenizer.)
+    //
     let text_fieldtype = schema::TextOptions::default()
         .set_indexing_options(
-            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
+            TextFieldIndexing::default()
+                .set_index_option(IndexRecordOption::WithFreqs)
+                .set_tokenizer("raw"),
         )
+        .set_fast()
         .set_stored();
-    let text_field = schema_builder.add_text_field("text", text_fieldtype);
-    let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
-    let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
-    let price_field = schema_builder.add_f64_field("price", score_fieldtype);
+    schema_builder.add_text_field("category", text_fieldtype);
+    schema_builder.add_f64_field("stock", FAST);
+    schema_builder.add_f64_field("price", FAST);

     let schema = schema_builder.build();

+    // # Indexing documents
+    //
+    // Let's index a bunch of documents for this example.
-    let index = Index::create_in_ram(schema);
+    let index = Index::create_in_ram(schema.clone());

+    let data = r#"{
+        "name": "Almond Toe Court Shoes, Patent Black",
+        "category": "Womens Footwear",
+        "price": 99.00,
+        "stock": 5
+    }
+    {
+        "name": "Suede Shoes, Blue",
+        "category": "Womens Footwear",
+        "price": 42.00,
+        "stock": 4
+    }
+    {
+        "name": "Leather Driver Saddle Loafers, Tan",
+        "category": "Mens Footwear",
+        "price": 34.00,
+        "stock": 12
+    }
+    {
+        "name": "Flip Flops, Red",
+        "category": "Mens Footwear",
+        "price": 19.00,
+        "stock": 6
+    }
+    {
+        "name": "Flip Flops, Blue",
+        "category": "Mens Footwear",
+        "price": 19.00,
+        "stock": 0
+    }
+    {
+        "name": "Gold Button Cardigan, Black",
+        "category": "Womens Casualwear",
+        "price": 167.00,
+        "stock": 6
+    }
+    {
+        "name": "Cotton Shorts, Medium Red",
+        "category": "Womens Casualwear",
+        "price": 30.00,
+        "stock": 5
+    }
+    {
+        "name": "Fine Stripe Short SleeveShirt, Grey",
+        "category": "Mens Casualwear",
+        "price": 49.99,
+        "stock": 9
+    }
+    {
+        "name": "Fine Stripe Short SleeveShirt, Green",
+        "category": "Mens Casualwear",
+        "price": 49.99,
+        "offer": 39.99,
+        "stock": 9
+    }
+    {
+        "name": "Sharkskin Waistcoat, Charcoal",
+        "category": "Mens Formalwear",
+        "price": 75.00,
+        "stock": 2
+    }
+    {
+        "name": "Lightweight Patch PocketBlazer, Deer",
+        "category": "Mens Formalwear",
+        "price": 175.50,
+        "stock": 1
+    }
+    {
+        "name": "Bird Print Dress, Black",
+        "category": "Womens Formalwear",
+        "price": 270.00,
+        "stock": 10
+    }
+    {
+        "name": "Mid Twist Cut-Out Dress, Pink",
+        "category": "Womens Formalwear",
+        "price": 540.00,
+        "stock": 5
+    }"#;
+
+    let stream = Deserializer::from_str(data).into_iter::<Value>();

     let mut index_writer = index.writer(50_000_000)?;
-    // writing the segment
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 1f64,
-        price_field => 0f64,
-    ))?;
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 3f64,
-        price_field => 1f64,
-    ))?;
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 5f64,
-        price_field => 1f64,
-    ))?;
-    index_writer.add_document(doc!(
-        text_field => "nohit",
-        highscore_field => 6f64,
-        price_field => 2f64,
-    ))?;
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 7f64,
-        price_field => 2f64,
-    ))?;
-    index_writer.commit()?;
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 11f64,
-        price_field => 10f64,
-    ))?;
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 14f64,
-        price_field => 15f64,
-    ))?;
-    index_writer.add_document(doc!(
-        text_field => "cool",
-        highscore_field => 15f64,
-        price_field => 20f64,
-    ))?;
+    let mut num_indexed = 0;
+    for value in stream {
+        let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
+        index_writer.add_document(doc)?;
+        num_indexed += 1;
+        if num_indexed > 4 {
+            // Writing the first segment
+            index_writer.commit()?;
+        }
+    }

     // Writing the second segment
     index_writer.commit()?;

+    // We have two segments now. The `AggregationCollector` will run the aggregation on each
+    // segment and then merge the results into an `IntermediateAggregationResult`.
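The merge step above is what keeps per-segment work composable. Conceptually, the intermediate result for an `avg` metric is a mergeable (sum, count) pair rather than a final value; the quotient is taken only once, after all segments are merged. A minimal sketch of that idea in plain Rust, with illustrative numbers (not tantivy's actual types):

fn main() {
    // (sum, count) contributed by each segment for the same bucket
    let seg1 = (99.0_f64 + 42.0, 2_u64);
    let seg2 = (34.0_f64, 1_u64);
    // Merging intermediate results is element-wise addition...
    let merged = (seg1.0 + seg2.0, seg1.1 + seg2.1);
    // ...and the final average is computed once, after the merge.
    let avg = merged.0 / merged.1 as f64;
    assert!((avg - 58.333).abs() < 1e-3);
}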
     let reader = index.reader()?;
-    let text_field = reader.searcher().schema().get_field("text").unwrap();
+    let searcher = reader.searcher();
+    // ---
+    // # Aggregation Query
+    //
+    // We can construct the query by building the request structure or by deserializing from JSON.
+    // The JSON API is more stable and is therefore recommended.
+    //
+    // ## Request 1

-    let term_query = TermQuery::new(
-        Term::from_field_text(text_field, "cool"),
-        IndexRecordOption::Basic,
-    );
+    let agg_req_str = r#"
+    {
+      "group_by_stock": {
+        "aggs": {
+          "average_price": { "avg": { "field": "price" } }
+        },
+        "range": {
+          "field": "stock",
+          "ranges": [
+            { "key": "few", "to": 1.0 },
+            { "key": "some", "from": 1.0, "to": 10.0 },
+            { "key": "many", "from": 10.0 }
+          ]
+        }
+      }
+    } "#;

-    let sub_agg_req_1: Aggregations = vec![(
-        "average_price".to_string(),
-        Aggregation::Metric(MetricAggregation::Average(
-            AverageAggregation::from_field_name("price".to_string()),
-        )),
-    )]
-    .into_iter()
-    .collect();
+    // In this aggregation we want to get the average price for different groups, depending on
+    // how many items are in stock. We define custom ranges `few`, `some`, `many` via the
+    // range aggregation.
+    // For every bucket we want the average price, so we create a nested metric aggregation on
+    // the range bucket aggregation. Only buckets support nested aggregations.
+    // ### Request JSON API
+    //
-    let agg_req_1: Aggregations = vec![(
-        "score_ranges".to_string(),
+    let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
+    let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
+
+    let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
+    let res2: Value = serde_json::to_value(agg_res)?;
+
+    // ### Request Rust API
+    //
+    // This is exactly the same request as above, but via the Rust structures.
+    //
+    let agg_req: Aggregations = vec![(
+        "group_by_stock".to_string(),
         Aggregation::Bucket(BucketAggregation {
             bucket_agg: BucketAggregationType::Range(RangeAggregation {
-                field: "highscore".to_string(),
+                field: "stock".to_string(),
                 ranges: vec![
-                    (-1f64..9f64).into(),
-                    (9f64..14f64).into(),
-                    (14f64..20f64).into(),
+                    RangeAggregationRange {
+                        key: Some("few".into()),
+                        from: None,
+                        to: Some(1f64),
+                    },
+                    RangeAggregationRange {
+                        key: Some("some".into()),
+                        from: Some(1f64),
+                        to: Some(10f64),
+                    },
+                    RangeAggregationRange {
+                        key: Some("many".into()),
+                        from: Some(10f64),
+                        to: None,
+                    },
                 ],
+                ..Default::default()
             }),
-            sub_aggregation: sub_agg_req_1,
+            sub_aggregation: vec![(
+                "average_price".to_string(),
+                Aggregation::Metric(MetricAggregation::Average(
+                    AverageAggregation::from_field_name("price".to_string()),
+                )),
+            )]
+            .into_iter()
+            .collect(),
         }),
     )]
     .into_iter()
     .collect();

-    let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
+    let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
+    // We use the `AllQuery`, which passes all documents to the `AggregationCollector`.
+    let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();

-    let searcher = reader.searcher();
-    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
     let res1: Value = serde_json::to_value(agg_res)?;

     // ### Aggregation Result
     //
     // The resulting structure serializes to the same JSON format as Elasticsearch.
     //
     let expected_res = r#"
     {
       "group_by_stock": {
         "buckets": [
           { "average_price": { "value": 19.0 }, "doc_count": 1, "key": "few", "to": 1.0 },
           { "average_price": { "value": 124.748 }, "doc_count": 10, "from": 1.0, "key": "some", "to": 10.0 },
           { "average_price": { "value": 152.0 }, "doc_count": 2, "from": 10.0, "key": "many" }
         ]
       }
     }
     "#;
     let expected_json: Value = serde_json::from_str(expected_res)?;
     assert_eq!(expected_json, res1);
     assert_eq!(expected_json, res2);
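Since the results serialize to plain `serde_json::Value`s, individual metrics can be pulled out with serde_json's pointer API. A small self-contained sketch (the names mirror the expected result above):

use serde_json::json;

fn main() {
    let res = json!({
        "group_by_stock": {
            "buckets": [
                { "average_price": { "value": 19.0 }, "doc_count": 1, "key": "few", "to": 1.0 }
            ]
        }
    });
    // RFC 6901 JSON-Pointer paths: objects by key, arrays by index.
    let avg = res
        .pointer("/group_by_stock/buckets/0/average_price/value")
        .and_then(|v| v.as_f64());
    assert_eq!(avg, Some(19.0));
}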

     // ### Request 2
     //
     // Now we are interested in the minimum price per category, so we create a bucket per
     // category via `TermsAggregation`. We are interested in the highest minimum prices, so we
     // set the bucket order `"order": { "min_price": "desc" }` to sort by the metric of
     // the sub-aggregation. (awesome)
     //
     let agg_req_str = r#"
     {
       "min_price_per_category": {
         "aggs": {
           "min_price": { "min": { "field": "price" } }
         },
         "terms": {
           "field": "category",
           "min_doc_count": 1,
           "order": { "min_price": "desc" }
         }
       }
     } "#;

     let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;

     let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());

     let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
     let res: Value = serde_json::to_value(agg_res)?;
     println!("{}", serde_json::to_string_pretty(&res)?);

     // Minimum price per category, sorted by minimum price descending.
     //
     // As you can see, the starting prices for `Formalwear` are higher than for `Casualwear`.
     //
     let expected_res = r#"
     {
       "min_price_per_category": {
         "buckets": [
           { "doc_count": 2, "key": "Womens Formalwear", "min_price": { "value": 270.0 } },
           { "doc_count": 2, "key": "Mens Formalwear", "min_price": { "value": 75.0 } },
           { "doc_count": 2, "key": "Mens Casualwear", "min_price": { "value": 49.99 } },
           { "doc_count": 2, "key": "Womens Footwear", "min_price": { "value": 42.0 } },
           { "doc_count": 2, "key": "Womens Casualwear", "min_price": { "value": 30.0 } },
           { "doc_count": 3, "key": "Mens Footwear", "min_price": { "value": 19.0 } }
         ],
         "sum_other_doc_count": 0
       }
     }
     "#;
     let expected_json: Value = serde_json::from_str(expected_res)?;

     assert_eq!(expected_json, res);

     Ok(())
 }

@@ -171,7 +171,7 @@ fn main() -> tantivy::Result<()> {
     let searcher = reader.searcher();
     let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);

-    // here we want to get a hit on the 'ken' in Frankenstein
+    // here we want to search for `broom` and use `StatsCollector` on the hits.
     let query = query_parser.parse_query("broom")?;
     if let Some(stats) =
         searcher.search(&query, &StatsCollector::with_field("price".to_string()))?

@@ -1,7 +1,7 @@
 // # Defining a tokenizer pipeline
 //
-// In this example, we'll see how to define a tokenizer pipeline
-// by aligning a bunch of `TokenFilter`.
+// In this example, we'll see how to define a tokenizer
+// by creating a custom `NgramTokenizer`.
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
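The hunk above swaps the token-filter intro for an `NgramTokenizer` one. For intuition, here is a minimal character-ngram sketch in plain Rust; it is not tantivy's implementation, which is additionally configurable with min/max gram sizes and a prefix-only mode:

fn ngrams(text: &str, n: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    if chars.len() < n {
        return Vec::new();
    }
    // Slide a window of `n` chars over the input and collect each window.
    (0..=chars.len() - n)
        .map(|i| chars[i..i + n].iter().collect())
        .collect()
}

fn main() {
    assert_eq!(ngrams("deal", 2), vec!["de", "ea", "al"]);
}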

@@ -14,6 +14,7 @@ fn main() -> tantivy::Result<()> {
         .set_stored()
         .set_fast()
         .set_precision(tantivy::DatePrecision::Seconds);
+    // Add the `occurred_at` date field to the schema
     let occurred_at = schema_builder.add_date_field("occurred_at", opts);
     let event_type = schema_builder.add_text_field("event", STRING | STORED);
     let schema = schema_builder.build();

@@ -22,6 +23,7 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());

     let mut index_writer = index.writer(50_000_000)?;
+    // The dates are passed as strings in RFC 3339 format
     let doc = schema.parse_document(
         r#"{
             "occurred_at": "2022-06-22T12:53:50.53Z",

@@ -41,14 +43,16 @@ fn main() -> tantivy::Result<()> {
     let reader = index.reader()?;
     let searcher = reader.searcher();

-    // # Default fields: event_type
+    // # Search
     let query_parser = QueryParser::for_index(&index, vec![event_type]);
     {
-        let query = query_parser.parse_query("event:comment")?;
+        // Simple exact search on the date
+        let query = query_parser.parse_query("occurred_at:\"2022-06-22T12:53:50.53Z\"")?;
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
         assert_eq!(count_docs.len(), 1);
     }
     {
+        // Range query on the date field
         let query = query_parser
             .parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
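As the added comment notes, dates arrive as RFC 3339 strings. A sketch of the same parse using the `time` crate (which tantivy uses for its date type); the conversion into `tantivy::DateTime` itself is elided here:

use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

fn main() {
    // Fractional seconds are accepted; with `DatePrecision::Seconds`,
    // tantivy truncates them at indexing time.
    let ts = OffsetDateTime::parse("2022-06-22T12:53:50.53Z", &Rfc3339).unwrap();
    assert_eq!(ts.unix_timestamp(), 1655902430);
}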

@@ -1,3 +1,12 @@
+// # Faceted Search With Tweak Score
+//
 // This example covers the faceted search functionalities of
 // tantivy.
+//
+// We will:
+// - define a text field "name" in our schema
+// - define a facet field "classification" in our schema
+
+use std::collections::HashSet;

 use tantivy::collector::TopDocs;

@@ -55,6 +64,7 @@ fn main() -> tantivy::Result<()> {
             .collect(),
     );
     let top_docs_by_custom_score =
+        // Call `TopDocs` with a custom tweaked score
         TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| {
             let ingredient_reader = segment_reader.facet_reader("ingredient").unwrap();
             let facet_dict = ingredient_reader.facet_dict();

@@ -65,6 +75,7 @@ fn main() -> tantivy::Result<()> {
             .collect();

         move |doc: DocId, original_score: Score| {
+            // Update the original score with a tweaked score
             let missing_ingredients = ingredient_reader
                 .facet_ords(doc)
                 .filter(|ord| !query_ords.contains(ord))

examples/fuzzy_search.rs (new file, 170 lines)
@@ -0,0 +1,170 @@
// # Basic Example
//
// This example covers the basic functionalities of
// tantivy.
//
// We will:
// - define our schema
// - create an index in a directory
// - index a few documents into our index
// - search for the best document matching a basic query
// - retrieve the best document's original content.

use std::collections::HashSet;

// ---
// Importing tantivy...
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{FuzzyTermQuery, QueryParser};
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, ReloadPolicy, Score, SegmentReader};
use tempfile::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example
    let index_path = TempDir::new()?;

    // # Defining the schema
    //
    // The Tantivy index requires a very strict schema.
    // The schema declares which fields are in the index,
    // and for each field, its type and "the way it should
    // be indexed".

    // First we need to define a schema ...
    let mut schema_builder = Schema::builder();

    // Our first field is title.
    // We want full-text search for it, and we also want
    // to be able to retrieve the document after the search.
    //
    // `TEXT | STORED` is some syntactic sugar to describe
    // that.
    //
    // `TEXT` means the field should be tokenized and indexed,
    // along with its term frequency and term positions.
    //
    // `STORED` means that the field will also be saved
    // in a compressed, row-oriented key-value store.
    // This store is useful for reconstructing the
    // documents that were selected during the search phase.
    let title = schema_builder.add_text_field("title", TEXT | STORED);

    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's create a brand new index.
    //
    // This will actually just save a meta.json
    // with our schema in the directory.
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    // To insert a document we will need an index writer.
    // There must be only one writer at a time.
    // This single `IndexWriter` is already
    // multithreaded.
    //
    // Here we give tantivy a budget of `50MB`.
    // Using a bigger memory_arena for the indexer may increase
    // throughput, but 50 MB is already plenty.
    let mut index_writer = index.writer(50_000_000)?;

    // Let's index our documents!
    // We first need a handle on the title field.

    // ### Adding documents
    //
    index_writer.add_document(doc!(
        title => "The Name of the Wind",
    ))?;
    index_writer.add_document(doc!(
        title => "The Diary of Muadib",
    ))?;
    index_writer.add_document(doc!(
        title => "A Dairy Cow",
    ))?;
    index_writer.add_document(doc!(
        title => "The Diary of a Young Girl",
    ))?;
    index_writer.commit()?;

    // ### Committing
    //
    // At this point our documents are not searchable.
    //
    // We need to call `.commit()` explicitly to force the
    // `index_writer` to finish processing the documents in the queue,
    // flush the current index to the disk, and advertise
    // the existence of new documents.
    //
    // This call is blocking.
    index_writer.commit()?;

    // If `.commit()` returns correctly, then all of the
    // documents that have been added are guaranteed to be
    // persistently indexed.
    //
    // In the scenario of a crash or a power failure,
    // tantivy behaves as if it has rolled back to its last
    // commit.

    // # Searching
    //
    // ### Searcher
    //
    // A reader is required first in order to search an index.
    // It acts as a `Searcher` pool that reloads itself,
    // depending on a `ReloadPolicy`.
    //
    // For a search server you will typically create one reader for the entire lifetime of your
    // program, and acquire a new searcher for every single request.
    //
    // In the code below, we rely on the `OnCommit` policy: the reader
    // will reload the index automatically after each commit.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    // We now need to acquire a searcher.
    //
    // A searcher points to a snapshotted, immutable version of the index.
    //
    // Some search experiences might require more than
    // one query. Using the same searcher ensures that all of these queries run on the
    // same version of the index.
    //
    // Acquiring a `searcher` is very cheap.
    //
    // You should acquire a searcher every time you start processing a request
    // and release it right after your query is finished.
    let searcher = reader.searcher();

    // ### FuzzyTermQuery
    {
        let term = Term::from_field_text(title, "Diary");
        let query = FuzzyTermQuery::new(term, 2, true);

        let (top_docs, count) = searcher
            .search(&query, &(TopDocs::with_limit(5), Count))
            .unwrap();
        assert_eq!(count, 3);
        assert_eq!(top_docs.len(), 3);
        for (score, doc_address) in top_docs {
            let retrieved_doc = searcher.doc(doc_address)?;
            // Note that the score is not lower for the fuzzy hit.
            // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
            println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
            // score 1.0 doc {"title":["The Diary of Muadib"]}
            //
            // score 1.0 doc {"title":["The Diary of a Young Girl"]}
            //
            // score 1.0 doc {"title":["A Dairy Cow"]}
        }
    }

    Ok(())
}
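`FuzzyTermQuery::new(term, 2, true)` matches every term within Levenshtein distance 2 of "Diary", with transpositions enabled. To see why "A Dairy Cow" is a hit, here is a plain edit-distance sketch in std-only Rust (tantivy itself evaluates this with a precompiled Levenshtein automaton, not this DP):

fn levenshtein(a: &str, b: &str) -> usize {
    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    let mut prev: Vec<usize> = (0..=b.len()).collect();
    for (i, ca) in a.iter().enumerate() {
        let mut cur = vec![i + 1];
        for (j, cb) in b.iter().enumerate() {
            let cost = if ca == cb { 0 } else { 1 };
            // Substitution, deletion, insertion — take the cheapest.
            cur.push((prev[j] + cost).min(prev[j + 1] + 1).min(cur[j] + 1));
        }
        prev = cur;
    }
    prev[b.len()]
}

fn main() {
    // Two substitutions without transpositions; a single edit with them.
    // Either way the distance is <= 2, so "Dairy" matches the query term.
    assert_eq!(levenshtein("Diary", "Dairy"), 2);
}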

@@ -10,6 +10,10 @@ use tantivy::Index;

 fn main() -> tantivy::Result<()> {
     // # Defining the schema
+    // We set the IP field as `INDEXED`, so it can be searched.
+    // `FAST` will create a fast field. The fast field will be used to execute search queries.
+    // `FAST` is not a requirement for range queries; they can also be executed on the inverted
+    // index, which is created by `INDEXED`.
     let mut schema_builder = Schema::builder();
     let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
     let ip = schema_builder.add_ip_addr_field("ip", STORED | INDEXED | FAST);

@@ -19,51 +23,81 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema.clone());

     let mut index_writer = index.writer(50_000_000)?;

+    // ### IPv4
+    // Adding documents that contain an IPv4 address. Notice that the IP addresses are passed
+    // as `String`. Since the field is of type ip, we parse the IP address from the string and
+    // store it internally as IPv6 (see the sketch after this hunk).
     let doc = schema.parse_document(
         r#"{
             "ip": "192.168.0.33",
             "event_type": "login"
         }"#,
     )?;
     index_writer.add_document(doc)?;
     let doc = schema.parse_document(
         r#"{
             "ip": "192.168.0.80",
             "event_type": "checkout"
         }"#,
     )?;
     index_writer.add_document(doc)?;
+    // ### IPv6
+    // Adding a document that contains an IPv6 address.
     let doc = schema.parse_document(
         r#"{
             "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
             "event_type": "checkout"
         }"#,
     )?;
     index_writer.add_document(doc)?;
+    // Commit will create a segment containing our documents.
     index_writer.commit()?;

     let reader = index.reader()?;
     let searcher = reader.searcher();

+    // # Search
+    // Range queries on IPv4. Since we created a fast field, the fast field will be used to
+    // execute the search.
+    // ### Range Queries
     let query_parser = QueryParser::for_index(&index, vec![event_type, ip]);
     {
-        let query = query_parser.parse_query("ip:[192.168.0.0 TO 192.168.0.100]")?;
+        // Inclusive range queries
+        let query = query_parser.parse_query("ip:[192.168.0.80 TO 192.168.0.100]")?;
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
-        assert_eq!(count_docs.len(), 2);
+        assert_eq!(count_docs.len(), 1);
     }
     {
-        let query = query_parser.parse_query("ip:[192.168.1.0 TO 192.168.1.100]")?;
+        // Exclusive range queries
+        let query = query_parser.parse_query("ip:{192.168.0.80 TO 192.168.1.100]")?;
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
         assert_eq!(count_docs.len(), 0);
     }
     {
+        // Find docs with IP addresses smaller than or equal to 192.168.1.100
         let query = query_parser.parse_query("ip:[* TO 192.168.1.100]")?;
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
         assert_eq!(count_docs.len(), 2);
     }
     {
+        // Find docs with IP addresses smaller than 192.168.1.100
         let query = query_parser.parse_query("ip:[* TO 192.168.1.100}")?;
         let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
         assert_eq!(count_docs.len(), 2);
     }

     // ### Exact Queries
     // Exact search on IPv4.
     {
         let query = query_parser.parse_query("ip:192.168.0.80")?;
         let count_docs = searcher.search(&*query, &Count)?;
         assert_eq!(count_docs, 1);
     }
+    // Exact search on IPv6.
+    // IPv6 addresses need to be quoted because they contain `:`.
     {
-        // IpV6 needs to be escaped because it contains `:`
         let query = query_parser.parse_query("ip:\"2001:0db8:85a3:0000:0000:8a2e:0370:7334\"")?;
         let count_docs = searcher.search(&*query, &Count)?;
         assert_eq!(count_docs, 1);
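As noted in the IPv4 hunk above, tantivy stores every IP address as IPv6 internally; IPv4 addresses land in the IPv4-mapped block. A quick illustration using only the standard library:

use std::net::Ipv4Addr;

fn main() {
    let v4: Ipv4Addr = "192.168.0.80".parse().unwrap();
    // IPv4 addresses map into the `::ffff:0:0/96` block.
    let v6 = v4.to_ipv6_mapped();
    assert_eq!(v6.to_string(), "::ffff:192.168.0.80");
}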

@@ -17,7 +17,6 @@ use tantivy::{

 type ProductId = u64;

-/// Price
 type Price = u32;

 pub trait PriceFetcher: Send + Sync + 'static {

@@ -90,10 +89,10 @@ impl Warmer for DynamicPriceColumn {
     }
 }

-/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
-/// This map represents a map (ProductId -> Price)
-///
-/// In practise, it could be fetching things from an external service, like a SQL table.
+// For the sake of this example, the table is just an editable HashMap behind a RwLock.
+// This map represents a map (ProductId -> Price).
+//
+// In practice, it could be fetching things from an external service, like a SQL table.
 #[derive(Default, Clone)]
 pub struct ExternalPriceTable {
     prices: Arc<RwLock<HashMap<ProductId, Price>>>,
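To make the `RwLock` pattern concrete, here is a self-contained sketch of how such a table could expose an update; the `set_price` helper is hypothetical and not part of the example:

use std::collections::HashMap;
use std::sync::{Arc, RwLock};

type ProductId = u64;
type Price = u32;

#[derive(Default, Clone)]
pub struct ExternalPriceTable {
    prices: Arc<RwLock<HashMap<ProductId, Price>>>,
}

impl ExternalPriceTable {
    // Take the write lock briefly; readers (e.g. the Warmer) see the new
    // price the next time they look the product up.
    pub fn set_price(&self, product: ProductId, price: Price) {
        self.prices.write().unwrap().insert(product, price);
    }
}

fn main() {
    let table = ExternalPriceTable::default();
    table.set_price(42, 1999);
    assert_eq!(table.prices.read().unwrap().get(&42), Some(&1999));
}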