From eea70030bf5296a05c47bd51092bdda1ff1aa85f Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 7 May 2024 09:59:41 +0200 Subject: [PATCH 1/5] cleanup top level exports (#2382) remove some top level exports --- examples/custom_collector.rs | 3 ++- examples/date_time_field.rs | 2 +- examples/iterating_docs_and_positions.rs | 3 ++- examples/warmer.rs | 3 ++- src/aggregation/agg_req_with_accessor.rs | 3 ++- src/aggregation/collector.rs | 3 ++- src/collector/top_collector.rs | 3 ++- src/core/searcher.rs | 4 ++-- src/core/tests.rs | 6 ++++-- src/fastfield/mod.rs | 9 +++++---- src/fastfield/writer.rs | 4 ++-- src/index/mod.rs | 2 -- src/indexer/delete_queue.rs | 3 ++- src/indexer/log_merge_policy.rs | 4 ++-- src/indexer/merge_operation.rs | 3 ++- src/indexer/merger.rs | 9 ++++----- src/indexer/merger_sorted_index_test.rs | 5 +++-- src/indexer/segment_writer.rs | 10 +++++----- src/lib.rs | 20 +++++--------------- src/postings/serializer.rs | 2 +- src/query/empty_query.rs | 3 ++- src/query/term_query/term_scorer.rs | 4 ++-- src/reader/warming.rs | 3 ++- src/schema/flags.rs | 3 +-- src/space_usage/mod.rs | 4 ++-- 25 files changed, 59 insertions(+), 59 deletions(-) diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs index d7d4cd10c..29b606930 100644 --- a/examples/custom_collector.rs +++ b/examples/custom_collector.rs @@ -11,9 +11,10 @@ use columnar::Column; // --- // Importing tantivy... use tantivy::collector::{Collector, SegmentCollector}; +use tantivy::index::SegmentReader; use tantivy::query::QueryParser; use tantivy::schema::{Schema, FAST, INDEXED, TEXT}; -use tantivy::{doc, Index, IndexWriter, Score, SegmentReader}; +use tantivy::{doc, Index, IndexWriter, Score}; #[derive(Default)] struct Stats { diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs index d508bc80f..aba9411d3 100644 --- a/examples/date_time_field.rs +++ b/examples/date_time_field.rs @@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> { let opts = DateOptions::from(INDEXED) .set_stored() .set_fast() - .set_precision(tantivy::DateTimePrecision::Seconds); + .set_precision(tantivy::schema::DateTimePrecision::Seconds); // Add `occurred_at` date field type let occurred_at = schema_builder.add_date_field("occurred_at", opts); let event_type = schema_builder.add_text_field("event", STRING | STORED); diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs index d3709a2b2..b58276614 100644 --- a/examples/iterating_docs_and_positions.rs +++ b/examples/iterating_docs_and_positions.rs @@ -7,10 +7,11 @@ // the list of documents containing a term, getting // its term frequency, and accessing its positions. +use tantivy::postings::Postings; // --- // Importing tantivy... use tantivy::schema::*; -use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED}; +use tantivy::{doc, DocSet, Index, IndexWriter, TERMINATED}; fn main() -> tantivy::Result<()> { // We first create a schema for the sake of the diff --git a/examples/warmer.rs b/examples/warmer.rs index 67bf157c5..1cae9d349 100644 --- a/examples/warmer.rs +++ b/examples/warmer.rs @@ -3,10 +3,11 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, RwLock, Weak}; use tantivy::collector::TopDocs; +use tantivy::index::SegmentId; use tantivy::query::QueryParser; use tantivy::schema::{Schema, FAST, TEXT}; use tantivy::{ - doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId, + doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentReader, Warmer, }; diff --git a/src/aggregation/agg_req_with_accessor.rs b/src/aggregation/agg_req_with_accessor.rs index 4f8a7c6f0..64d9b8943 100644 --- a/src/aggregation/agg_req_with_accessor.rs +++ b/src/aggregation/agg_req_with_accessor.rs @@ -17,7 +17,8 @@ use super::metric::{ use super::segment_agg_result::AggregationLimits; use super::VecWithNames; use crate::aggregation::{f64_to_fastfield_u64, Key}; -use crate::{SegmentOrdinal, SegmentReader}; +use crate::index::SegmentReader; +use crate::SegmentOrdinal; #[derive(Default)] pub(crate) struct AggregationsWithAccessor { diff --git a/src/aggregation/collector.rs b/src/aggregation/collector.rs index d0e9ec5b8..2b9ee6f61 100644 --- a/src/aggregation/collector.rs +++ b/src/aggregation/collector.rs @@ -8,7 +8,8 @@ use super::segment_agg_result::{ }; use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate; use crate::collector::{Collector, SegmentCollector}; -use crate::{DocId, SegmentOrdinal, SegmentReader, TantivyError}; +use crate::index::SegmentReader; +use crate::{DocId, SegmentOrdinal, TantivyError}; /// The default max bucket count, before the aggregation fails. pub const DEFAULT_BUCKET_LIMIT: u32 = 65000; diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 5a07e4218..4a399b27c 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -4,7 +4,8 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use super::top_score_collector::TopNComputer; -use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader}; +use crate::index::SegmentReader; +use crate::{DocAddress, DocId, SegmentOrdinal}; /// Contains a feature (field, score, etc.) of a document along with the document address. /// diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8c5dad3da..56816145e 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -4,13 +4,13 @@ use std::{fmt, io}; use crate::collector::Collector; use crate::core::Executor; -use crate::index::SegmentReader; +use crate::index::{SegmentId, SegmentReader}; use crate::query::{Bm25StatisticsProvider, EnableScoring, Query}; use crate::schema::document::DocumentDeserialize; use crate::schema::{Schema, Term}; use crate::space_usage::SearcherSpaceUsage; use crate::store::{CacheStats, StoreReader}; -use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject}; +use crate::{DocAddress, Index, Opstamp, TrackedObject}; /// Identifies the searcher generation accessed by a [`Searcher`]. /// diff --git a/src/core/tests.rs b/src/core/tests.rs index deb02ca4f..62baedf1d 100644 --- a/src/core/tests.rs +++ b/src/core/tests.rs @@ -1,12 +1,14 @@ use crate::collector::Count; use crate::directory::{RamDirectory, WatchCallback}; +use crate::index::SegmentId; use crate::indexer::{LogMergePolicy, NoMergePolicy}; +use crate::postings::Postings; use crate::query::TermQuery; use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT}; use crate::tokenizer::TokenizerManager; use crate::{ - Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, Postings, - ReloadPolicy, SegmentId, TantivyDocument, Term, + Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy, + TantivyDocument, Term, }; #[test] diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index e0689650b..e7ad2e413 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -80,7 +80,7 @@ mod tests { use std::path::Path; use columnar::StrColumn; - use common::{ByteCount, HasLen, TerminatingWrite}; + use common::{ByteCount, DateTimePrecision, HasLen, TerminatingWrite}; use once_cell::sync::Lazy; use rand::prelude::SliceRandom; use rand::rngs::StdRng; @@ -88,14 +88,15 @@ mod tests { use super::*; use crate::directory::{Directory, RamDirectory, WritePtr}; + use crate::index::SegmentId; use crate::merge_policy::NoMergePolicy; use crate::schema::{ - Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, TantivyDocument, - TextOptions, FAST, INDEXED, STORED, STRING, TEXT, + DateOptions, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, + TantivyDocument, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::time::OffsetDateTime; use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager}; - use crate::{DateOptions, DateTimePrecision, Index, IndexWriter, SegmentId, SegmentReader}; + use crate::{Index, IndexWriter, SegmentReader}; pub static SCHEMA: Lazy = Lazy::new(|| { let mut schema_builder = Schema::builder(); diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index ca0da8145..8212f2b2f 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,14 +1,14 @@ use std::io; use columnar::{ColumnarWriter, NumericalValue}; -use common::JsonPathWriter; +use common::{DateTimePrecision, JsonPathWriter}; use tokenizer_api::Token; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; -use crate::{DateTimePrecision, DocId, TantivyError}; +use crate::{DocId, TantivyError}; /// Only index JSON down to a depth of 20. /// This is mostly to guard us from a stack overflow triggered by malicious input. diff --git a/src/index/mod.rs b/src/index/mod.rs index 89e71d2c8..fcd9a1475 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1,5 +1,3 @@ -//! # Index Module -//! //! The `index` module in Tantivy contains core components to read and write indexes. //! //! It contains `Index` and `Segment`, where a `Index` consists of one or more `Segment`s. diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index cfac8e479..4f8a8a383 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -246,8 +246,9 @@ impl DeleteCursor { mod tests { use super::{DeleteOperation, DeleteQueue}; + use crate::index::SegmentReader; use crate::query::{Explanation, Scorer, Weight}; - use crate::{DocId, Score, SegmentReader}; + use crate::{DocId, Score}; struct DummyWeight; impl Weight for DummyWeight { diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 726deb578..70488285e 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -144,9 +144,9 @@ mod tests { use once_cell::sync::Lazy; use super::*; - use crate::index::SegmentMetaInventory; + use crate::index::{SegmentId, SegmentMetaInventory}; + use crate::schema; use crate::schema::INDEXED; - use crate::{schema, SegmentId}; static INVENTORY: Lazy = Lazy::new(SegmentMetaInventory::default); diff --git a/src/indexer/merge_operation.rs b/src/indexer/merge_operation.rs index 90e5ee86c..a4ce49c93 100644 --- a/src/indexer/merge_operation.rs +++ b/src/indexer/merge_operation.rs @@ -1,7 +1,8 @@ use std::collections::HashSet; use std::ops::Deref; -use crate::{Inventory, Opstamp, SegmentId, TrackedObject}; +use crate::index::SegmentId; +use crate::{Inventory, Opstamp, TrackedObject}; #[derive(Default)] pub(crate) struct MergeOperationInventory(Inventory); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 4cc455713..9dd027a59 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED}; use crate::error::DataCorruption; use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; -use crate::index::{Segment, SegmentReader}; +use crate::index::{Segment, SegmentComponent, SegmentReader}; use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping}; use crate::indexer::SegmentSerializer; use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings}; @@ -21,8 +21,7 @@ use crate::schema::{value_type_to_column_type, Field, FieldType, Schema}; use crate::store::StoreWriter; use crate::termdict::{TermMerger, TermOrdinal}; use crate::{ - DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, - SegmentComponent, SegmentOrdinal, + DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal, }; /// Segment's max doc must be `< MAX_DOC_LIMIT`. @@ -794,7 +793,7 @@ mod tests { BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE, }; use crate::collector::{Count, FacetCollector}; - use crate::index::Index; + use crate::index::{Index, SegmentId}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::schema::document::Value; use crate::schema::{ @@ -804,7 +803,7 @@ mod tests { use crate::time::OffsetDateTime; use crate::{ assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings, - IndexSortByField, IndexWriter, Order, Searcher, SegmentId, + IndexSortByField, IndexWriter, Order, Searcher, }; #[test] diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 3b256a634..d698b357f 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -3,6 +3,7 @@ mod tests { use crate::collector::TopDocs; use crate::fastfield::AliveBitSet; use crate::index::Index; + use crate::postings::Postings; use crate::query::QueryParser; use crate::schema::document::Value; use crate::schema::{ @@ -10,8 +11,8 @@ mod tests { TextFieldIndexing, TextOptions, }; use crate::{ - DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, Postings, - TantivyDocument, Term, + DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument, + Term, }; fn create_test_index_posting_list_issue(index_settings: Option) -> Index { diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index ca0b0993d..2ee2843c0 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -7,7 +7,7 @@ use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}; use super::operation::AddOperation; use crate::fastfield::FastFieldsWriter; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; -use crate::index::Segment; +use crate::index::{Segment, SegmentComponent}; use crate::indexer::segment_serializer::SegmentSerializer; use crate::json_utils::{index_json_value, IndexingPositionsPerPath}; use crate::postings::{ @@ -18,7 +18,7 @@ use crate::schema::document::{Document, Value}; use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED}; use crate::store::{StoreReader, StoreWriter}; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer}; -use crate::{DocId, Opstamp, SegmentComponent, TantivyError}; +use crate::{DocId, Opstamp, TantivyError}; /// Computes the initial size of the hash table. /// @@ -498,7 +498,7 @@ mod tests { use crate::collector::{Count, TopDocs}; use crate::directory::RamDirectory; use crate::fastfield::FastValue; - use crate::postings::TermInfo; + use crate::postings::{Postings, TermInfo}; use crate::query::{PhraseQuery, QueryParser}; use crate::schema::document::Value; use crate::schema::{ @@ -510,8 +510,8 @@ mod tests { use crate::time::OffsetDateTime; use crate::tokenizer::{PreTokenizedString, Token}; use crate::{ - DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, Postings, TantivyDocument, - Term, TERMINATED, + DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, TantivyDocument, Term, + TERMINATED, }; #[test] diff --git a/src/lib.rs b/src/lib.rs index 077725f57..2978f4178 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -216,11 +216,6 @@ use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED}; -#[deprecated( - since = "0.22.0", - note = "Will be removed in tantivy 0.23. Use export from snippet module instead" -)] -pub use self::snippet::{Snippet, SnippetGenerator}; #[doc(hidden)] pub use crate::core::json_utils; pub use crate::core::{Executor, Searcher, SearcherGeneration}; @@ -228,16 +223,10 @@ pub use crate::directory::Directory; #[allow(deprecated)] // Remove with index sorting pub use crate::index::{ Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order, - Segment, SegmentComponent, SegmentId, SegmentMeta, SegmentReader, + Segment, SegmentMeta, SegmentReader, }; -#[deprecated( - since = "0.22.0", - note = "Will be removed in tantivy 0.23. Use export from indexer module instead" -)] -pub use crate::indexer::PreparedCommit; pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter}; -pub use crate::postings::Postings; -pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocument, Term}; +pub use crate::schema::{Document, TantivyDocument, Term}; /// Index format version. const INDEX_FORMAT_VERSION: u32 = 6; @@ -392,9 +381,10 @@ pub mod tests { use crate::docset::{DocSet, TERMINATED}; use crate::index::SegmentReader; use crate::merge_policy::NoMergePolicy; + use crate::postings::Postings; use crate::query::BooleanQuery; use crate::schema::*; - use crate::{DateTime, DocAddress, Index, IndexWriter, Postings, ReloadPolicy}; + use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy}; pub fn fixed_size_test() { let mut buffer = Vec::new(); @@ -1109,9 +1099,9 @@ pub mod tests { #[test] fn test_update_via_delete_insert() -> crate::Result<()> { use crate::collector::Count; + use crate::index::SegmentId; use crate::indexer::NoMergePolicy; use crate::query::AllQuery; - use crate::SegmentId; const DOC_COUNT: u64 = 2u64; diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index c0757f8fd..91b8ec4ca 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -56,7 +56,7 @@ pub struct InvertedIndexSerializer { impl InvertedIndexSerializer { /// Open a new `InvertedIndexSerializer` for the given segment pub fn open(segment: &mut Segment) -> crate::Result { - use crate::SegmentComponent::{Positions, Postings, Terms}; + use crate::index::SegmentComponent::{Positions, Postings, Terms}; let inv_index_serializer = InvertedIndexSerializer { terms_write: CompositeWrite::wrap(segment.open_write(Terms)?), postings_write: CompositeWrite::wrap(segment.open_write(Postings)?), diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs index 76eadddb4..86ff84c08 100644 --- a/src/query/empty_query.rs +++ b/src/query/empty_query.rs @@ -1,8 +1,9 @@ use super::Scorer; use crate::docset::TERMINATED; +use crate::index::SegmentReader; use crate::query::explanation::does_not_match; use crate::query::{EnableScoring, Explanation, Query, Weight}; -use crate::{DocId, DocSet, Score, Searcher, SegmentReader}; +use crate::{DocId, DocSet, Score, Searcher}; /// `EmptyQuery` is a dummy `Query` in which no document matches. /// diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 2bbd0c630..594272019 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -127,6 +127,7 @@ impl Scorer for TermScorer { mod tests { use proptest::prelude::*; + use crate::index::SegmentId; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::merge_policy::NoMergePolicy; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; @@ -134,8 +135,7 @@ mod tests { use crate::query::{Bm25Weight, EnableScoring, Scorer, TermQuery}; use crate::schema::{IndexRecordOption, Schema, TEXT}; use crate::{ - assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, SegmentId, Term, - TERMINATED, + assert_nearly_equals, DocId, DocSet, Index, IndexWriter, Score, Searcher, Term, TERMINATED, }; #[test] diff --git a/src/reader/warming.rs b/src/reader/warming.rs index 936fc34da..63a274e91 100644 --- a/src/reader/warming.rs +++ b/src/reader/warming.rs @@ -179,9 +179,10 @@ mod tests { use super::Warmer; use crate::core::searcher::SearcherGeneration; use crate::directory::RamDirectory; + use crate::index::SegmentId; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::schema::{Schema, INDEXED}; - use crate::{Index, IndexSettings, ReloadPolicy, Searcher, SegmentId}; + use crate::{Index, IndexSettings, ReloadPolicy, Searcher}; #[derive(Default)] struct TestWarmer { diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 449e12a4c..4f8caa612 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -1,7 +1,6 @@ use std::ops::BitOr; -use crate::schema::{NumericOptions, TextOptions}; -use crate::DateOptions; +use crate::schema::{DateOptions, NumericOptions, TextOptions}; #[derive(Clone)] pub struct StoredFlag; diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 466d67aae..238ada60f 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -12,8 +12,8 @@ use std::collections::HashMap; use common::ByteCount; use serde::{Deserialize, Serialize}; +use crate::index::SegmentComponent; use crate::schema::Field; -use crate::SegmentComponent; /// Enum containing any of the possible space usage results for segment components. pub enum ComponentSpaceUsage { @@ -115,7 +115,7 @@ impl SegmentSpaceUsage { /// Use the components directly if this is somehow in performance critical code. pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage { use self::ComponentSpaceUsage::*; - use crate::SegmentComponent::*; + use crate::index::SegmentComponent::*; match component { Postings => PerField(self.postings().clone()), Positions => PerField(self.positions().clone()), From c6b213d8f059240369649c36e75ff0465daa155b Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 7 May 2024 11:29:49 +0200 Subject: [PATCH 2/5] use bingang for agg benchmark (#2378) * use bingang for agg benchmark use bingang for agg benchmark, which includes memory consumption Output: ``` full histogram Memory: 15.8 KB Avg: 10.9322ms (+5.44%) Median: 10.8790ms (+9.28%) Min: 10.7470ms Max: 11.3263ms histogram_hard_bounds Memory: 15.5 KB Avg: 5.1939ms (+6.61%) Median: 5.1722ms (+10.98%) Min: 5.0432ms Max: 5.3910ms histogram_with_avg_sub_agg Memory: 48.7 KB Avg: 23.8165ms (+4.57%) Median: 23.7264ms (+10.06%) Min: 23.4995ms Max: 24.8107ms dense histogram Memory: 17.3 KB Avg: 15.6810ms (-8.54%) Median: 15.6174ms (-8.89%) Min: 15.4953ms Max: 16.0702ms histogram_hard_bounds Memory: 15.4 KB Avg: 10.0720ms (-7.33%) Median: 10.0572ms (-7.06%) Min: 9.8500ms Max: 10.4819ms histogram_with_avg_sub_agg Memory: 50.1 KB Avg: 33.0993ms (-7.04%) Median: 32.9499ms (-6.86%) Min: 32.8284ms Max: 34.0529ms sparse histogram Memory: 16.3 KB Avg: 19.2325ms (-0.44%) Median: 19.1211ms (-1.26%) Min: 19.0348ms Max: 19.7902ms histogram_hard_bounds Memory: 16.1 KB Avg: 18.5179ms (-0.61%) Median: 18.4552ms (-0.90%) Min: 18.3799ms Max: 19.0535ms histogram_with_avg_sub_agg Memory: 34.7 KB Avg: 21.2589ms (-0.69%) Median: 21.1867ms (-1.05%) Min: 21.0342ms Max: 21.9900ms ``` * add more bench with term as sub agg --- Cargo.toml | 6 + benches/agg_bench.rs | 413 +++++++++++++++++++++++++ src/aggregation/agg_bench.rs | 585 ----------------------------------- src/aggregation/mod.rs | 2 - 4 files changed, 419 insertions(+), 587 deletions(-) create mode 100644 benches/agg_bench.rs delete mode 100644 src/aggregation/agg_bench.rs diff --git a/Cargo.toml b/Cargo.toml index 3580168e1..1862db26e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,6 +67,7 @@ fnv = "1.0.7" winapi = "0.3.9" [dev-dependencies] +binggan = "0.5.1" rand = "0.8.5" maplit = "1.0.2" matches = "0.1.9" @@ -143,3 +144,8 @@ harness = false [[bench]] name = "index-bench" harness = false + +[[bench]] +name = "agg_bench" +harness = false + diff --git a/benches/agg_bench.rs b/benches/agg_bench.rs new file mode 100644 index 000000000..d35124999 --- /dev/null +++ b/benches/agg_bench.rs @@ -0,0 +1,413 @@ +use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM}; +use rand::prelude::SliceRandom; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use rand_distr::Distribution; +use serde_json::json; +use tantivy::aggregation::agg_req::Aggregations; +use tantivy::aggregation::AggregationCollector; +use tantivy::query::{AllQuery, TermQuery}; +use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING}; +use tantivy::{doc, Index, Term}; + +#[global_allocator] +pub static GLOBAL: &PeakMemAlloc = &INSTRUMENTED_SYSTEM; + +/// Mini macro to register a function via its name +/// runner.register("average_u64", move |index| average_u64(index)); +macro_rules! register { + ($runner:expr, $func:ident) => { + $runner.register(stringify!($func), move |index| $func(index)) + }; +} + +fn main() { + let inputs = vec![ + ("full", get_test_index_bench(Cardinality::Full).unwrap()), + ( + "dense", + get_test_index_bench(Cardinality::OptionalDense).unwrap(), + ), + ( + "sparse", + get_test_index_bench(Cardinality::OptionalSparse).unwrap(), + ), + ( + "multivalue", + get_test_index_bench(Cardinality::Multivalued).unwrap(), + ), + ]; + + bench_agg(InputGroup::new_with_inputs(inputs)); +} + +fn bench_agg(mut group: InputGroup) { + group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting. + register!(group, average_u64); + register!(group, average_f64); + register!(group, average_f64_u64); + register!(group, stats_f64); + register!(group, percentiles_f64); + register!(group, terms_few); + register!(group, terms_many); + register!(group, terms_many_order_by_term); + register!(group, terms_many_with_top_hits); + register!(group, terms_many_with_avg_sub_agg); + register!(group, terms_many_json_mixed_type_with_sub_agg_card); + register!(group, range_agg); + register!(group, range_agg_with_avg_sub_agg); + register!(group, range_agg_with_term_agg_few); + register!(group, range_agg_with_term_agg_many); + register!(group, histogram); + register!(group, histogram_hard_bounds); + register!(group, histogram_with_avg_sub_agg); + register!(group, avg_and_range_with_avg_sub_agg); + + group.run(); +} + +fn exec_term_with_agg(index: &Index, agg_req: serde_json::Value) { + let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap(); + + let reader = index.reader().unwrap(); + let text_field = reader.searcher().schema().get_field("text").unwrap(); + let term_query = TermQuery::new( + Term::from_field_text(text_field, "cool"), + IndexRecordOption::Basic, + ); + let collector = get_collector(agg_req); + let searcher = reader.searcher(); + black_box(searcher.search(&term_query, &collector).unwrap()); +} + +fn average_u64(index: &Index) { + let agg_req = json!({ + "average": { "avg": { "field": "score", } } + }); + exec_term_with_agg(index, agg_req) +} +fn average_f64(index: &Index) { + let agg_req = json!({ + "average": { "avg": { "field": "score_f64", } } + }); + exec_term_with_agg(index, agg_req) +} +fn average_f64_u64(index: &Index) { + let agg_req = json!({ + "average_f64": { "avg": { "field": "score_f64" } }, + "average": { "avg": { "field": "score" } }, + }); + exec_term_with_agg(index, agg_req) +} +fn stats_f64(index: &Index) { + let agg_req = json!({ + "average_f64": { "stats": { "field": "score_f64", } } + }); + exec_term_with_agg(index, agg_req) +} + +fn percentiles_f64(index: &Index) { + let agg_req = json!({ + "mypercentiles": { + "percentiles": { + "field": "score_f64", + "percents": [ 95, 99, 99.9 ] + } + } + }); + execute_agg(index, agg_req); +} +fn terms_few(index: &Index) { + let agg_req = json!({ + "my_texts": { "terms": { "field": "text_few_terms" } }, + }); + execute_agg(index, agg_req); +} +fn terms_many(index: &Index) { + let agg_req = json!({ + "my_texts": { "terms": { "field": "text_many_terms" } }, + }); + execute_agg(index, agg_req); +} +fn terms_many_order_by_term(index: &Index) { + let agg_req = json!({ + "my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } }, + }); + execute_agg(index, agg_req); +} +fn terms_many_with_top_hits(index: &Index) { + let agg_req = json!({ + "my_texts": { + "terms": { "field": "text_many_terms" }, + "aggs": { + "top_hits": { "top_hits": + { + "sort": [ + { "score": "desc" } + ], + "size": 2, + "doc_value_fields": ["score_f64"] + } + } + } + }, + }); + execute_agg(index, agg_req); +} +fn terms_many_with_avg_sub_agg(index: &Index) { + let agg_req = json!({ + "my_texts": { + "terms": { "field": "text_many_terms" }, + "aggs": { + "average_f64": { "avg": { "field": "score_f64" } } + } + }, + }); + execute_agg(index, agg_req); +} +fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) { + let agg_req = json!({ + "my_texts": { + "terms": { "field": "json.mixed_type" }, + "aggs": { + "average_f64": { "avg": { "field": "score_f64" } } + } + }, + }); + execute_agg(index, agg_req); +} + +fn execute_agg(index: &Index, agg_req: serde_json::Value) { + let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap(); + let collector = get_collector(agg_req); + + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + black_box(searcher.search(&AllQuery, &collector).unwrap()); +} +fn range_agg(index: &Index) { + let agg_req = json!({ + "range_f64": { "range": { "field": "score_f64", "ranges": [ + { "from": 3, "to": 7000 }, + { "from": 7000, "to": 20000 }, + { "from": 20000, "to": 30000 }, + { "from": 30000, "to": 40000 }, + { "from": 40000, "to": 50000 }, + { "from": 50000, "to": 60000 } + ] } }, + }); + execute_agg(index, agg_req); +} +fn range_agg_with_avg_sub_agg(index: &Index) { + let agg_req = json!({ + "rangef64": { + "range": { + "field": "score_f64", + "ranges": [ + { "from": 3, "to": 7000 }, + { "from": 7000, "to": 20000 }, + { "from": 20000, "to": 30000 }, + { "from": 30000, "to": 40000 }, + { "from": 40000, "to": 50000 }, + { "from": 50000, "to": 60000 } + ] + }, + "aggs": { + "average_f64": { "avg": { "field": "score_f64" } } + } + }, + }); + execute_agg(index, agg_req); +} + +fn range_agg_with_term_agg_few(index: &Index) { + let agg_req = json!({ + "rangef64": { + "range": { + "field": "score_f64", + "ranges": [ + { "from": 3, "to": 7000 }, + { "from": 7000, "to": 20000 }, + { "from": 20000, "to": 30000 }, + { "from": 30000, "to": 40000 }, + { "from": 40000, "to": 50000 }, + { "from": 50000, "to": 60000 } + ] + }, + "aggs": { + "my_texts": { "terms": { "field": "text_few_terms" } }, + } + }, + }); + execute_agg(index, agg_req); +} +fn range_agg_with_term_agg_many(index: &Index) { + let agg_req = json!({ + "rangef64": { + "range": { + "field": "score_f64", + "ranges": [ + { "from": 3, "to": 7000 }, + { "from": 7000, "to": 20000 }, + { "from": 20000, "to": 30000 }, + { "from": 30000, "to": 40000 }, + { "from": 40000, "to": 50000 }, + { "from": 50000, "to": 60000 } + ] + }, + "aggs": { + "my_texts": { "terms": { "field": "text_many_terms" } }, + } + }, + }); + execute_agg(index, agg_req); +} +fn histogram(index: &Index) { + let agg_req = json!({ + "rangef64": { + "histogram": { + "field": "score_f64", + "interval": 100 // 1000 buckets + }, + } + }); + execute_agg(index, agg_req); +} +fn histogram_hard_bounds(index: &Index) { + let agg_req = json!({ + "rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } }, + }); + execute_agg(index, agg_req); +} +fn histogram_with_avg_sub_agg(index: &Index) { + let agg_req = json!({ + "rangef64": { + "histogram": { "field": "score_f64", "interval": 100 }, + "aggs": { + "average_f64": { "avg": { "field": "score_f64" } } + } + } + }); + execute_agg(index, agg_req); +} +fn avg_and_range_with_avg_sub_agg(index: &Index) { + let agg_req = json!({ + "rangef64": { + "range": { + "field": "score_f64", + "ranges": [ + { "from": 3, "to": 7000 }, + { "from": 7000, "to": 20000 }, + { "from": 20000, "to": 60000 } + ] + }, + "aggs": { + "average_in_range": { "avg": { "field": "score" } } + } + }, + "average": { "avg": { "field": "score" } } + }); + execute_agg(index, agg_req); +} + +#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)] +enum Cardinality { + /// All documents contain exactly one value. + /// `Full` is the default for auto-detecting the Cardinality, since it is the most strict. + #[default] + Full = 0, + /// All documents contain at most one value. + OptionalDense = 1, + /// All documents may contain any number of values. + Multivalued = 2, + /// 1 / 20 documents has a value + OptionalSparse = 3, +} + +fn get_collector(agg_req: Aggregations) -> AggregationCollector { + AggregationCollector::from_aggs(agg_req, Default::default()) +} + +fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result { + let mut schema_builder = Schema::builder(); + let text_fieldtype = tantivy::schema::TextOptions::default() + .set_indexing_options( + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), + ) + .set_stored(); + let text_field = schema_builder.add_text_field("text", text_fieldtype); + let json_field = schema_builder.add_json_field("json", FAST); + let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST); + let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST); + let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast(); + let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone()); + let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone()); + let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype); + let index = Index::create_from_tempdir(schema_builder.build())?; + let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"]; + + let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap(); + + let many_terms_data = (0..150_000) + .map(|num| format!("author{}", num)) + .collect::>(); + { + let mut rng = StdRng::from_seed([1u8; 32]); + let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?; + // To make the different test cases comparable we just change one doc to force the + // cardinality + if cardinality == Cardinality::OptionalDense { + index_writer.add_document(doc!())?; + } + if cardinality == Cardinality::Multivalued { + index_writer.add_document(doc!( + json_field => json!({"mixed_type": 10.0}), + json_field => json!({"mixed_type": 10.0}), + text_field => "cool", + text_field => "cool", + text_field_many_terms => "cool", + text_field_many_terms => "cool", + text_field_few_terms => "cool", + text_field_few_terms => "cool", + score_field => 1u64, + score_field => 1u64, + score_field_f64 => lg_norm.sample(&mut rng), + score_field_f64 => lg_norm.sample(&mut rng), + score_field_i64 => 1i64, + score_field_i64 => 1i64, + ))?; + } + let mut doc_with_value = 1_000_000; + if cardinality == Cardinality::OptionalSparse { + doc_with_value /= 20; + } + let _val_max = 1_000_000.0; + for _ in 0..doc_with_value { + let val: f64 = rng.gen_range(0.0..1_000_000.0); + let json = if rng.gen_bool(0.1) { + // 10% are numeric values + json!({ "mixed_type": val }) + } else { + json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()}) + }; + index_writer.add_document(doc!( + text_field => "cool", + json_field => json, + text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(), + text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(), + score_field => val as u64, + score_field_f64 => lg_norm.sample(&mut rng), + score_field_i64 => val as i64, + ))?; + if cardinality == Cardinality::OptionalSparse { + for _ in 0..20 { + index_writer.add_document(doc!(text_field => "cool"))?; + } + } + } + // writing the segment + index_writer.commit()?; + } + + Ok(index) +} diff --git a/src/aggregation/agg_bench.rs b/src/aggregation/agg_bench.rs deleted file mode 100644 index 84c0bb382..000000000 --- a/src/aggregation/agg_bench.rs +++ /dev/null @@ -1,585 +0,0 @@ -#[cfg(all(test, feature = "unstable"))] -mod bench { - - use rand::prelude::SliceRandom; - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use rand_distr::Distribution; - use serde_json::json; - use test::{self, Bencher}; - - use crate::aggregation::agg_req::Aggregations; - use crate::aggregation::AggregationCollector; - use crate::query::{AllQuery, TermQuery}; - use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING}; - use crate::{Index, Term}; - - #[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)] - enum Cardinality { - /// All documents contain exactly one value. - /// `Full` is the default for auto-detecting the Cardinality, since it is the most strict. - #[default] - Full = 0, - /// All documents contain at most one value. - Optional = 1, - /// All documents may contain any number of values. - Multivalued = 2, - /// 1 / 20 documents has a value - Sparse = 3, - } - - fn get_collector(agg_req: Aggregations) -> AggregationCollector { - AggregationCollector::from_aggs(agg_req, Default::default()) - } - - fn get_test_index_bench(cardinality: Cardinality) -> crate::Result { - let mut schema_builder = Schema::builder(); - let text_fieldtype = crate::schema::TextOptions::default() - .set_indexing_options( - TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), - ) - .set_stored(); - let text_field = schema_builder.add_text_field("text", text_fieldtype); - let json_field = schema_builder.add_json_field("json", FAST); - let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST); - let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST); - let score_fieldtype = crate::schema::NumericOptions::default().set_fast(); - let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone()); - let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone()); - let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype); - let index = Index::create_from_tempdir(schema_builder.build())?; - let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"]; - - let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap(); - - let many_terms_data = (0..150_000) - .map(|num| format!("author{}", num)) - .collect::>(); - { - let mut rng = StdRng::from_seed([1u8; 32]); - let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?; - // To make the different test cases comparable we just change one doc to force the - // cardinality - if cardinality == Cardinality::Optional { - index_writer.add_document(doc!())?; - } - if cardinality == Cardinality::Multivalued { - index_writer.add_document(doc!( - json_field => json!({"mixed_type": 10.0}), - json_field => json!({"mixed_type": 10.0}), - text_field => "cool", - text_field => "cool", - text_field_many_terms => "cool", - text_field_many_terms => "cool", - text_field_few_terms => "cool", - text_field_few_terms => "cool", - score_field => 1u64, - score_field => 1u64, - score_field_f64 => lg_norm.sample(&mut rng), - score_field_f64 => lg_norm.sample(&mut rng), - score_field_i64 => 1i64, - score_field_i64 => 1i64, - ))?; - } - let mut doc_with_value = 1_000_000; - if cardinality == Cardinality::Sparse { - doc_with_value /= 20; - } - let _val_max = 1_000_000.0; - for _ in 0..doc_with_value { - let val: f64 = rng.gen_range(0.0..1_000_000.0); - let json = if rng.gen_bool(0.1) { - // 10% are numeric values - json!({ "mixed_type": val }) - } else { - json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()}) - }; - index_writer.add_document(doc!( - text_field => "cool", - json_field => json, - text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(), - text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(), - score_field => val as u64, - score_field_f64 => lg_norm.sample(&mut rng), - score_field_i64 => val as i64, - ))?; - if cardinality == Cardinality::Sparse { - for _ in 0..20 { - index_writer.add_document(doc!(text_field => "cool"))?; - } - } - } - // writing the segment - index_writer.commit()?; - } - - Ok(index) - } - - use paste::paste; - #[macro_export] - macro_rules! bench_all_cardinalities { - ( $x:ident ) => { - paste! { - #[bench] - fn $x(b: &mut Bencher) { - [<$x _card>](b, Cardinality::Full) - } - - #[bench] - fn [<$x _opt>](b: &mut Bencher) { - [<$x _card>](b, Cardinality::Optional) - } - - #[bench] - fn [<$x _multi>](b: &mut Bencher) { - [<$x _card>](b, Cardinality::Multivalued) - } - - #[bench] - fn [<$x _sparse>](b: &mut Bencher) { - [<$x _card>](b, Cardinality::Sparse) - } - - } - }; - } - - bench_all_cardinalities!(bench_aggregation_average_u64); - - fn bench_aggregation_average_u64_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - let text_field = reader.searcher().schema().get_field("text").unwrap(); - - b.iter(|| { - let term_query = TermQuery::new( - Term::from_field_text(text_field, "cool"), - IndexRecordOption::Basic, - ); - - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "average": { "avg": { "field": "score", } } - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&term_query, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_stats_f64); - - fn bench_aggregation_stats_f64_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - let text_field = reader.searcher().schema().get_field("text").unwrap(); - - b.iter(|| { - let term_query = TermQuery::new( - Term::from_field_text(text_field, "cool"), - IndexRecordOption::Basic, - ); - - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "average_f64": { "stats": { "field": "score_f64", } } - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&term_query, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_average_f64); - - fn bench_aggregation_average_f64_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - let text_field = reader.searcher().schema().get_field("text").unwrap(); - - b.iter(|| { - let term_query = TermQuery::new( - Term::from_field_text(text_field, "cool"), - IndexRecordOption::Basic, - ); - - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "average_f64": { "avg": { "field": "score_f64", } } - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&term_query, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_percentiles_f64); - - fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req_str = r#" - { - "mypercentiles": { - "percentiles": { - "field": "score_f64", - "percents": [ 95, 99, 99.9 ] - } - } - } "#; - let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_average_u64_and_f64); - - fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - let text_field = reader.searcher().schema().get_field("text").unwrap(); - - b.iter(|| { - let term_query = TermQuery::new( - Term::from_field_text(text_field, "cool"), - IndexRecordOption::Basic, - ); - - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "average_f64": { "avg": { "field": "score_f64" } }, - "average": { "avg": { "field": "score" } }, - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&term_query, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_terms_few); - - fn bench_aggregation_terms_few_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req: Aggregations = serde_json::from_value(json!({ - "my_texts": { "terms": { "field": "text_few_terms" } }, - })) - .unwrap(); - - let collector = get_collector(agg_req); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_terms_many_with_top_hits_agg); - - fn bench_aggregation_terms_many_with_top_hits_agg_card( - b: &mut Bencher, - cardinality: Cardinality, - ) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req: Aggregations = serde_json::from_value(json!({ - "my_texts": { - "terms": { "field": "text_many_terms" }, - "aggs": { - "top_hits": { "top_hits": - { - "sort": [ - { "score": "desc" } - ], - "size": 2, - "doc_value_fields": ["score_f64"] - } - } - } - }, - })) - .unwrap(); - - let collector = get_collector(agg_req); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_terms_many_with_sub_agg); - - fn bench_aggregation_terms_many_with_sub_agg_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req: Aggregations = serde_json::from_value(json!({ - "my_texts": { - "terms": { "field": "text_many_terms" }, - "aggs": { - "average_f64": { "avg": { "field": "score_f64" } } - } - }, - })) - .unwrap(); - - let collector = get_collector(agg_req); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg); - - fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card( - b: &mut Bencher, - cardinality: Cardinality, - ) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req: Aggregations = serde_json::from_value(json!({ - "my_texts": { - "terms": { "field": "json.mixed_type" }, - "aggs": { - "average_f64": { "avg": { "field": "score_f64" } } - } - }, - })) - .unwrap(); - - let collector = get_collector(agg_req); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_terms_many2); - - fn bench_aggregation_terms_many2_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req: Aggregations = serde_json::from_value(json!({ - "my_texts": { "terms": { "field": "text_many_terms" } }, - })) - .unwrap(); - - let collector = get_collector(agg_req); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_terms_many_order_by_term); - - fn bench_aggregation_terms_many_order_by_term_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req: Aggregations = serde_json::from_value(json!({ - "my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } }, - })) - .unwrap(); - - let collector = get_collector(agg_req); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_range_only); - - fn bench_aggregation_range_only_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "range_f64": { "range": { "field": "score_f64", "ranges": [ - { "from": 3, "to": 7000 }, - { "from": 7000, "to": 20000 }, - { "from": 20000, "to": 30000 }, - { "from": 30000, "to": 40000 }, - { "from": 40000, "to": 50000 }, - { "from": 50000, "to": 60000 } - ] } }, - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_range_with_avg); - - fn bench_aggregation_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "rangef64": { - "range": { - "field": "score_f64", - "ranges": [ - { "from": 3, "to": 7000 }, - { "from": 7000, "to": 20000 }, - { "from": 20000, "to": 30000 }, - { "from": 30000, "to": 40000 }, - { "from": 40000, "to": 50000 }, - { "from": 50000, "to": 60000 } - ] - }, - "aggs": { - "average_f64": { "avg": { "field": "score_f64" } } - } - }, - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - // hard bounds has a different algorithm, because it actually limits collection range - // - bench_all_cardinalities!(bench_aggregation_histogram_only_hard_bounds); - - fn bench_aggregation_histogram_only_hard_bounds_card( - b: &mut Bencher, - cardinality: Cardinality, - ) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } }, - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_histogram_with_avg); - - fn bench_aggregation_histogram_with_avg_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "rangef64": { - "histogram": { "field": "score_f64", "interval": 100 }, - "aggs": { - "average_f64": { "avg": { "field": "score_f64" } } - } - } - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_histogram_only); - - fn bench_aggregation_histogram_only_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - - b.iter(|| { - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "rangef64": { - "histogram": { - "field": "score_f64", - "interval": 100 // 1000 buckets - }, - } - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&AllQuery, &collector).unwrap() - }); - } - - bench_all_cardinalities!(bench_aggregation_avg_and_range_with_avg); - - fn bench_aggregation_avg_and_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) { - let index = get_test_index_bench(cardinality).unwrap(); - let reader = index.reader().unwrap(); - let text_field = reader.searcher().schema().get_field("text").unwrap(); - - b.iter(|| { - let term_query = TermQuery::new( - Term::from_field_text(text_field, "cool"), - IndexRecordOption::Basic, - ); - - let agg_req_1: Aggregations = serde_json::from_value(json!({ - "rangef64": { - "range": { - "field": "score_f64", - "ranges": [ - { "from": 3, "to": 7000 }, - { "from": 7000, "to": 20000 }, - { "from": 20000, "to": 60000 } - ] - }, - "aggs": { - "average_in_range": { "avg": { "field": "score" } } - } - }, - "average": { "avg": { "field": "score" } } - })) - .unwrap(); - - let collector = get_collector(agg_req_1); - - let searcher = reader.searcher(); - searcher.search(&term_query, &collector).unwrap() - }); - } -} diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index fbb2925dd..cb45885ac 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -143,8 +143,6 @@ use std::fmt::Display; #[cfg(test)] mod agg_tests; -mod agg_bench; - use core::fmt; pub use agg_limits::AggregationLimits; From 2b76335a95eb3936305858ec6fc12ce605d071b4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 8 May 2024 13:32:52 +0900 Subject: [PATCH 3/5] Removed usage of num_cpus (#2387) * Removed usage of num_cpus * handling error --- Cargo.toml | 3 +-- src/index/index.rs | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1862db26e..8a8b84f65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,6 @@ tempfile = { version = "3.3.0", optional = true } log = "0.4.16" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.79" -num_cpus = "1.13.1" fs4 = { version = "0.8.0", optional = true } levenshtein_automata = "0.2.1" uuid = { version = "1.0.0", features = ["v4", "serde"] } @@ -117,7 +116,7 @@ unstable = [] # useful for benches. quickwit = ["sstable", "futures-util"] -# Compares only the hash of a string when indexing data. +# Compares only the hash of a string when indexing data. # Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision. # Uses 64bit ahash. compare_hash_only = ["stacker/compare_hash_only"] diff --git a/src/index/index.rs b/src/index/index.rs index e02212cd5..0baea3345 100644 --- a/src/index/index.rs +++ b/src/index/index.rs @@ -4,6 +4,7 @@ use std::fmt; use std::path::Path; use std::path::PathBuf; use std::sync::Arc; +use std::thread::available_parallelism; use super::segment::Segment; use super::segment_reader::merge_field_meta_data; @@ -340,7 +341,7 @@ impl Index { /// Replace the default single thread search executor pool /// by a thread pool with as many threads as there are CPUs on the system. pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> { - let default_num_threads = num_cpus::get(); + let default_num_threads = available_parallelism()?.get(); self.set_multithread_executor(default_num_threads) } @@ -621,7 +622,7 @@ impl Index { &self, memory_budget_in_bytes: usize, ) -> crate::Result> { - let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD); + let mut num_threads = std::cmp::min(available_parallelism()?.get(), MAX_NUM_THREAD); let memory_budget_num_bytes_per_thread = memory_budget_in_bytes / num_threads; if memory_budget_num_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN { num_threads = (memory_budget_in_bytes / MEMORY_BUDGET_NUM_BYTES_MIN).max(1); From 8cd7ddc535df2efe465e543ad4a55e22338d236f Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Wed, 8 May 2024 12:22:44 +0200 Subject: [PATCH 4/5] run block decompression from executor (#2386) * run block decompression from executor * add a wrapper with is_closed to oneshot channel * add cancelation test to Executor::spawn_blocking --- src/core/executor.rs | 107 ++++++++++++++++++++++++++++++++++++++++++- src/core/searcher.rs | 3 +- src/store/reader.rs | 32 ++++++++++--- 3 files changed, 133 insertions(+), 9 deletions(-) diff --git a/src/core/executor.rs b/src/core/executor.rs index f4d7d2a13..915534009 100644 --- a/src/core/executor.rs +++ b/src/core/executor.rs @@ -1,3 +1,5 @@ +#[cfg(feature = "quickwit")] +use futures_util::{future::Either, FutureExt}; use rayon::{ThreadPool, ThreadPoolBuilder}; use crate::TantivyError; @@ -91,11 +93,84 @@ impl Executor { } } } + + /// Spawn a task on the pool, returning a future completing on task success. + /// + /// If the task panic, returns `Err(())`. + #[cfg(feature = "quickwit")] + pub fn spawn_blocking( + &self, + cpu_intensive_task: impl FnOnce() -> T + Send + 'static, + ) -> impl std::future::Future> { + match self { + Executor::SingleThread => Either::Left(std::future::ready(Ok(cpu_intensive_task()))), + Executor::ThreadPool(pool) => { + let (sender, receiver) = oneshot_with_sentinel::channel(); + pool.spawn(|| { + if sender.is_closed() { + return; + } + let task_result = cpu_intensive_task(); + let _ = sender.send(task_result); + }); + + let res = receiver.map(|res| res.map_err(|_| ())); + Either::Right(res) + } + } + } +} + +#[cfg(feature = "quickwit")] +mod oneshot_with_sentinel { + use std::pin::Pin; + use std::sync::Arc; + use std::task::{Context, Poll}; + // TODO get ride of this if oneshot ever gains a is_closed() + + pub struct SenderWithSentinel { + tx: oneshot::Sender, + guard: Arc<()>, + } + + pub struct ReceiverWithSentinel { + rx: oneshot::Receiver, + _guard: Arc<()>, + } + + pub fn channel() -> (SenderWithSentinel, ReceiverWithSentinel) { + let (tx, rx) = oneshot::channel(); + let guard = Arc::new(()); + ( + SenderWithSentinel { + tx, + guard: guard.clone(), + }, + ReceiverWithSentinel { rx, _guard: guard }, + ) + } + + impl SenderWithSentinel { + pub fn send(self, message: T) -> Result<(), oneshot::SendError> { + self.tx.send(message) + } + + pub fn is_closed(&self) -> bool { + Arc::strong_count(&self.guard) == 1 + } + } + + impl std::future::Future for ReceiverWithSentinel { + type Output = Result; + + fn poll(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll { + Pin::new(&mut self.rx).poll(ctx) + } + } } #[cfg(test)] mod tests { - use super::Executor; #[test] @@ -147,4 +222,34 @@ mod tests { assert_eq!(result[i], i * 2); } } + + #[cfg(feature = "quickwit")] + #[test] + fn test_cancel_cpu_intensive_tasks() { + use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::Arc; + use std::time::Duration; + + let counter: Arc = Default::default(); + let mut futures = Vec::new(); + let executor = Executor::multi_thread(3, "search-test").unwrap(); + for _ in 0..1_000 { + let counter_clone = counter.clone(); + let fut = executor.spawn_blocking(move || { + std::thread::sleep(Duration::from_millis(4)); + counter_clone.fetch_add(1, Ordering::SeqCst) + }); + futures.push(fut); + } + std::thread::sleep(Duration::from_millis(5)); + // The first few num_cores tasks should run, but the other should get cancelled. + drop(futures); + while Arc::strong_count(&counter) > 1 { + std::thread::sleep(Duration::from_millis(10)); + } + // with ideal timing, we expect the result to always be 6, but as long as we run some, and + // cancelled most, the test is a success + assert!(counter.load(Ordering::SeqCst) > 0); + assert!(counter.load(Ordering::SeqCst) < 50); + } } diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 56816145e..f74c837c4 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -109,8 +109,9 @@ impl Searcher { &self, doc_address: DocAddress, ) -> crate::Result { + let executor = self.inner.index.search_executor(); let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize]; - store_reader.get_async(doc_address.doc_id).await + store_reader.get_async(doc_address.doc_id, executor).await } /// Access the schema associated with the index of this searcher. diff --git a/src/store/reader.rs b/src/store/reader.rs index b7f243003..44f0df993 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -18,6 +18,8 @@ use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize}; use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; +#[cfg(feature = "quickwit")] +use crate::Executor; pub(crate) const DOCSTORE_CACHE_CAPACITY: usize = 100; @@ -341,7 +343,11 @@ impl StoreReader { /// In most cases use [`get_async`](Self::get_async) /// /// Loads and decompresses a block asynchronously. - async fn read_block_async(&self, checkpoint: &Checkpoint) -> io::Result { + async fn read_block_async( + &self, + checkpoint: &Checkpoint, + executor: &Executor, + ) -> io::Result { let cache_key = checkpoint.byte_range.start; if let Some(block) = self.cache.get_from_cache(checkpoint.byte_range.start) { return Ok(block); @@ -353,8 +359,12 @@ impl StoreReader { .read_bytes_async() .await?; - let decompressed_block = - OwnedBytes::new(self.decompressor.decompress(compressed_block.as_ref())?); + let decompressor = self.decompressor; + let maybe_decompressed_block = executor + .spawn_blocking(move || decompressor.decompress(compressed_block.as_ref())) + .await + .expect("decompression panicked"); + let decompressed_block = OwnedBytes::new(maybe_decompressed_block?); self.cache .put_into_cache(cache_key, decompressed_block.clone()); @@ -363,15 +373,23 @@ impl StoreReader { } /// Reads raw bytes of a given document asynchronously. - pub async fn get_document_bytes_async(&self, doc_id: DocId) -> crate::Result { + pub async fn get_document_bytes_async( + &self, + doc_id: DocId, + executor: &Executor, + ) -> crate::Result { let checkpoint = self.block_checkpoint(doc_id)?; - let block = self.read_block_async(&checkpoint).await?; + let block = self.read_block_async(&checkpoint, executor).await?; Self::get_document_bytes_from_block(block, doc_id, &checkpoint) } /// Fetches a document asynchronously. Async version of [`get`](Self::get). - pub async fn get_async(&self, doc_id: DocId) -> crate::Result { - let mut doc_bytes = self.get_document_bytes_async(doc_id).await?; + pub async fn get_async( + &self, + doc_id: DocId, + executor: &Executor, + ) -> crate::Result { + let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?; let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes) .map_err(crate::TantivyError::from)?; From 71f3b4e4e3d7a5c7acf44f933f3809e4385f47e3 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 9 May 2024 06:14:42 +0200 Subject: [PATCH 5/5] fix ReferenceValue API flaw (#2372) * fix ReferenceValue API flaw Remove `Facet` and `TokenizedString` values from the `ReferenceValue` API, as this requires the trait value to have them stored somewhere. Since `TokenizedString` is quite niche, I just copy it into a Box, instead of designing a reference API around it. * fix comment link --- src/fastfield/facet_reader.rs | 7 +++++-- src/fastfield/writer.rs | 3 +-- src/indexer/segment_writer.rs | 5 ++--- src/query/more_like_this/more_like_this.rs | 4 ++-- src/schema/document/de.rs | 5 +++-- src/schema/document/owned_value.rs | 10 +++++---- src/schema/document/se.rs | 7 ++++--- src/schema/document/value.rs | 24 +++++++++++----------- 8 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index c4e170352..731242779 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -146,8 +146,11 @@ mod tests { facet_ords.extend(facet_reader.facet_ords(0u32)); assert_eq!(&facet_ords, &[0u64]); let doc = searcher.doc::(DocAddress::new(0u32, 0u32))?; - let value: Option<&Facet> = doc.get_first(facet_field).and_then(|v| v.as_facet()); - assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); + let value: Option = doc + .get_first(facet_field) + .and_then(|v| v.as_facet()) + .map(|facet| Facet::from_encoded_string(facet.to_string())); + assert_eq!(value, Facet::from_text("/a/b").ok()); Ok(()) } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 8212f2b2f..2f4196078 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -183,8 +183,7 @@ impl FastFieldsWriter { .record_datetime(doc_id, field_name, truncated_datetime); } ReferenceValueLeaf::Facet(val) => { - self.columnar_writer - .record_str(doc_id, field_name, val.encoded_str()); + self.columnar_writer.record_str(doc_id, field_name, val); } ReferenceValueLeaf::Bytes(val) => { self.columnar_writer.record_bytes(doc_id, field_name, val); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 2ee2843c0..0e1be366c 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -206,8 +206,7 @@ impl SegmentWriter { // Used to help with linting and type checking. let value = value_access as D::Value<'_>; - let facet = value.as_facet().ok_or_else(make_schema_error)?; - let facet_str = facet.encoded_str(); + let facet_str = value.as_facet().ok_or_else(make_schema_error)?; let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str); let mut indexing_position = IndexingPosition::default(); postings_writer.index_text( @@ -230,7 +229,7 @@ impl SegmentWriter { &mut self.per_field_text_analyzers[field.field_id() as usize]; text_analyzer.token_stream(text) } else if let Some(tok_str) = value.as_pre_tokenized_text() { - BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone())) + BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone())) } else { continue; }; diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 4fb692e9d..043d081df 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -180,7 +180,7 @@ impl MoreLikeThis { let facets: Vec<&str> = values .iter() .map(|value| { - value.as_facet().map(|f| f.encoded_str()).ok_or_else(|| { + value.as_facet().ok_or_else(|| { TantivyError::InvalidArgument("invalid field value".to_string()) }) }) @@ -220,7 +220,7 @@ impl MoreLikeThis { let mut token_stream = tokenizer.token_stream(text); token_stream.process(sink); } else if let Some(tok_str) = value.as_pre_tokenized_text() { - let mut token_stream = PreTokenizedStream::from(tok_str.clone()); + let mut token_stream = PreTokenizedStream::from(*tok_str.clone()); token_stream.process(sink); } } diff --git a/src/schema/document/de.rs b/src/schema/document/de.rs index aab2b070e..e80bff2c9 100644 --- a/src/schema/document/de.rs +++ b/src/schema/document/de.rs @@ -873,7 +873,7 @@ mod tests { ); let facet = Facet::from_text("/hello/world").unwrap(); - let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into()); + let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into()); let value = deserialize_value(result); assert_eq!(value, crate::schema::OwnedValue::Facet(facet)); @@ -881,7 +881,8 @@ mod tests { text: "hello, world".to_string(), tokens: vec![Token::default(), Token::default()], }; - let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into()); + let result = + serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into()); let value = deserialize_value(result); assert_eq!(value, crate::schema::OwnedValue::PreTokStr(pre_tok_str)); } diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs index 48d3f8792..a70eb7d1c 100644 --- a/src/schema/document/owned_value.rs +++ b/src/schema/document/owned_value.rs @@ -65,13 +65,13 @@ impl<'a> Value<'a> for &'a OwnedValue { match self { OwnedValue::Null => ReferenceValueLeaf::Null.into(), OwnedValue::Str(val) => ReferenceValueLeaf::Str(val).into(), - OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val).into(), + OwnedValue::PreTokStr(val) => ReferenceValueLeaf::PreTokStr(val.clone().into()).into(), OwnedValue::U64(val) => ReferenceValueLeaf::U64(*val).into(), OwnedValue::I64(val) => ReferenceValueLeaf::I64(*val).into(), OwnedValue::F64(val) => ReferenceValueLeaf::F64(*val).into(), OwnedValue::Bool(val) => ReferenceValueLeaf::Bool(*val).into(), OwnedValue::Date(val) => ReferenceValueLeaf::Date(*val).into(), - OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val).into(), + OwnedValue::Facet(val) => ReferenceValueLeaf::Facet(val.encoded_str()).into(), OwnedValue::Bytes(val) => ReferenceValueLeaf::Bytes(val).into(), OwnedValue::IpAddr(val) => ReferenceValueLeaf::IpAddr(*val).into(), OwnedValue::Array(array) => ReferenceValue::Array(array.iter()), @@ -277,11 +277,13 @@ impl<'a, V: Value<'a>> From> for OwnedValue { ReferenceValueLeaf::I64(val) => OwnedValue::I64(val), ReferenceValueLeaf::F64(val) => OwnedValue::F64(val), ReferenceValueLeaf::Date(val) => OwnedValue::Date(val), - ReferenceValueLeaf::Facet(val) => OwnedValue::Facet(val.clone()), + ReferenceValueLeaf::Facet(val) => { + OwnedValue::Facet(Facet::from_encoded_string(val.to_string())) + } ReferenceValueLeaf::Bytes(val) => OwnedValue::Bytes(val.to_vec()), ReferenceValueLeaf::IpAddr(val) => OwnedValue::IpAddr(val), ReferenceValueLeaf::Bool(val) => OwnedValue::Bool(val), - ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(val.clone()), + ReferenceValueLeaf::PreTokStr(val) => OwnedValue::PreTokStr(*val.clone()), }, ReferenceValue::Array(val) => { OwnedValue::Array(val.map(|v| v.as_value().into()).collect()) diff --git a/src/schema/document/se.rs b/src/schema/document/se.rs index 8acffb36b..f1eed1027 100644 --- a/src/schema/document/se.rs +++ b/src/schema/document/se.rs @@ -121,7 +121,7 @@ where W: Write ReferenceValueLeaf::Facet(val) => { self.write_type_code(type_codes::HIERARCHICAL_FACET_CODE)?; - val.serialize(self.writer) + Cow::Borrowed(val).serialize(self.writer) } ReferenceValueLeaf::Bytes(val) => { self.write_type_code(type_codes::BYTES_CODE)?; @@ -428,7 +428,7 @@ mod tests { ); let facet = Facet::from_text("/hello/world").unwrap(); - let result = serialize_value(ReferenceValueLeaf::Facet(&facet).into()); + let result = serialize_value(ReferenceValueLeaf::Facet(facet.encoded_str()).into()); let expected = binary_repr!( type_codes::HIERARCHICAL_FACET_CODE => Facet::from_text("/hello/world").unwrap(), ); @@ -441,7 +441,8 @@ mod tests { text: "hello, world".to_string(), tokens: vec![Token::default(), Token::default()], }; - let result = serialize_value(ReferenceValueLeaf::PreTokStr(&pre_tok_str).into()); + let result = + serialize_value(ReferenceValueLeaf::PreTokStr(pre_tok_str.clone().into()).into()); let expected = binary_repr!( type_codes::EXT_CODE, type_codes::TOK_STR_EXT_CODE => pre_tok_str, ); diff --git a/src/schema/document/value.rs b/src/schema/document/value.rs index ca3d3bf1b..e05649ad7 100644 --- a/src/schema/document/value.rs +++ b/src/schema/document/value.rs @@ -3,7 +3,6 @@ use std::net::Ipv6Addr; use common::DateTime; -use crate::schema::Facet; use crate::tokenizer::PreTokenizedString; /// A single field value. @@ -82,7 +81,7 @@ pub trait Value<'a>: Send + Sync + Debug { #[inline] /// If the Value is a pre-tokenized string, returns the associated string. Returns None /// otherwise. - fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> { + fn as_pre_tokenized_text(&self) -> Option> { self.as_leaf().and_then(|leaf| leaf.as_pre_tokenized_text()) } @@ -94,7 +93,7 @@ pub trait Value<'a>: Send + Sync + Debug { #[inline] /// If the Value is a facet, returns the associated facet. Returns None otherwise. - fn as_facet(&self) -> Option<&'a Facet> { + fn as_facet(&self) -> Option<&'a str> { self.as_leaf().and_then(|leaf| leaf.as_facet()) } @@ -132,7 +131,7 @@ pub trait Value<'a>: Send + Sync + Debug { } /// A enum representing a leaf value for tantivy to index. -#[derive(Clone, Copy, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub enum ReferenceValueLeaf<'a> { /// A null value. Null, @@ -146,8 +145,9 @@ pub enum ReferenceValueLeaf<'a> { F64(f64), /// Date/time with nanoseconds precision Date(DateTime), - /// Facet - Facet(&'a Facet), + /// Facet string needs to match the format of + /// [Facet::encoded_str](crate::schema::Facet::encoded_str). + Facet(&'a str), /// Arbitrarily sized byte array Bytes(&'a [u8]), /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`. @@ -155,7 +155,7 @@ pub enum ReferenceValueLeaf<'a> { /// Bool value Bool(bool), /// Pre-tokenized str type, - PreTokStr(&'a PreTokenizedString), + PreTokStr(Box), } impl<'a, T: Value<'a> + ?Sized> From> for ReferenceValue<'a, T> { @@ -261,9 +261,9 @@ impl<'a> ReferenceValueLeaf<'a> { #[inline] /// If the Value is a pre-tokenized string, returns the associated string. Returns None /// otherwise. - pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> { + pub fn as_pre_tokenized_text(&self) -> Option> { if let Self::PreTokStr(val) = self { - Some(val) + Some(val.clone()) } else { None } @@ -281,7 +281,7 @@ impl<'a> ReferenceValueLeaf<'a> { #[inline] /// If the Value is a facet, returns the associated facet. Returns None otherwise. - pub fn as_facet(&self) -> Option<&'a Facet> { + pub fn as_facet(&self) -> Option<&'a str> { if let Self::Facet(val) = self { Some(val) } else { @@ -367,7 +367,7 @@ where V: Value<'a> #[inline] /// If the Value is a pre-tokenized string, returns the associated string. Returns None /// otherwise. - pub fn as_pre_tokenized_text(&self) -> Option<&'a PreTokenizedString> { + pub fn as_pre_tokenized_text(&self) -> Option> { self.as_leaf().and_then(|leaf| leaf.as_pre_tokenized_text()) } @@ -379,7 +379,7 @@ where V: Value<'a> #[inline] /// If the Value is a facet, returns the associated facet. Returns None otherwise. - pub fn as_facet(&self) -> Option<&'a Facet> { + pub fn as_facet(&self) -> Option<&'a str> { self.as_leaf().and_then(|leaf| leaf.as_facet()) }