diff --git a/examples/stop_words.rs b/examples/stop_words.rs new file mode 100644 index 000000000..950a42afd --- /dev/null +++ b/examples/stop_words.rs @@ -0,0 +1,129 @@ +// # Stop Words Example +// +// This example covers the basic usage of stop words +// with tantivy +// +// We will : +// - define our schema +// - create an index in a directory +// - add a few stop words +// - index few documents in our index + +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::TopCollector; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::tokenizer::*; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // this example assumes you understand the content in `basic_search` + let index_path = TempDir::new("tantivy_stopwords_example_dir")?; + let mut schema_builder = SchemaBuilder::default(); + + // This configures your custom options for how tantivy will + // store and process your content in the index; The key + // to note is that we are setting the tokenizer to `stoppy` + // which will be defined and registered below. + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); + + // Our first field is title. + schema_builder.add_text_field("title", text_options); + + // Our second field is body. + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); + schema_builder.add_text_field("body", text_options); + + let schema = schema_builder.build(); + + let index = Index::create_in_dir(&index_path, schema.clone())?; + + // This tokenizer lowers all of the text (to help with stop word matching) + // then removes all instances of `the` and `and` from the corpus + let tokenizer = SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); + + index.tokenizers().register("stoppy", tokenizer); + + let mut index_writer = index.writer(50_000_000)?; + + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); + + index_writer.add_document(doc!( + title => "The Old Man and the Sea", + body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ + he had gone eighty-four days now without taking a fish." + )); + + index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); + + index_writer.add_document(doc!( + title => "Frankenstein", + body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ + enterprise which you have regarded with such evil forebodings. I arrived here \ + yesterday, and my first task is to assure my dear sister of my welfare and \ + increasing confidence in the success of my undertaking." + )); + + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + // this will have NO hits because it was filtered out + // because the query is run through the analyzer you + // actually will get an error here because the query becomes + // empty + assert!(query_parser.parse_query("the").is_err()); + + // this will have hits + let query = query_parser.parse_query("is")?; + + let mut top_collector = TopCollector::with_limit(10); + + searcher.search(&*query, &mut top_collector)?; + + let doc_addresses = top_collector.docs(); + + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } + + Ok(()) +} + +use tempdir::TempDir;