mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 00:20:42 +00:00
Add an example of using the stopwords filter (#377)
This commit is contained in:
committed by
Paul Masurel
parent
60a9a7f837
commit
674524ba91
129
examples/stop_words.rs
Normal file
129
examples/stop_words.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
// # Stop Words Example
|
||||
//
|
||||
// This example covers the basic usage of stop words
|
||||
// with tantivy
|
||||
//
|
||||
// We will :
|
||||
// - define our schema
|
||||
// - create an index in a directory
|
||||
// - add a few stop words
|
||||
// - index few documents in our index
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopCollector;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::*;
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// This configures your custom options for how tantivy will
|
||||
// store and process your content in the index; The key
|
||||
// to note is that we are setting the tokenizer to `stoppy`
|
||||
// which will be defined and registered below.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
|
||||
// Our first field is title.
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("body", text_options);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
|
||||
// This tokenizer lowers all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
|
||||
index.tokenizers().register("stoppy", tokenizer);
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// this will have NO hits because it was filtered out
|
||||
// because the query is run through the analyzer you
|
||||
// actually will get an error here because the query becomes
|
||||
// empty
|
||||
assert!(query_parser.parse_query("the").is_err());
|
||||
|
||||
// this will have hits
|
||||
let query = query_parser.parse_query("is")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use tempdir::TempDir;
|
||||
Reference in New Issue
Block a user