From ff6ee3a5db1bfb1a7986e931dc328c4a9f467c0c Mon Sep 17 00:00:00 2001 From: Mohammad Dashti Date: Fri, 5 Dec 2025 22:37:19 -0800 Subject: [PATCH] fix: post-rebase fixes - Add missing size_hint module declaration - Remove test-only export serialize_and_load_u64_based_column_values - fixed quickwit CI issues --- Cargo.lock | 17 +++----------- columnar/benches/bench_merge.rs | 9 +++++++- .../src/column_index/optional_index/tests.rs | 6 ++++- columnar/src/columnar/merge/tests.rs | 2 ++ src/index/segment_reader.rs | 13 ++++++----- src/indexer/segment_updater.rs | 1 - src/query/automaton_weight.rs | 2 +- src/query/intersection.rs | 7 +++++- src/query/mod.rs | 9 ++++---- src/query/more_like_this/mod.rs | 2 +- src/query/term_query/term_query.rs | 22 ++++++++++++++++++- .../term_set_query_fastfield.rs | 2 +- sstable/Cargo.toml | 2 +- sstable/src/lib.rs | 2 +- sstable/tests/sstable_test.rs | 6 +++-- 15 files changed, 67 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6efeebc65..51e5b8eb0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1646,7 +1646,7 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.25.0" +version = "0.26.0" dependencies = [ "aho-corasick", "arc-swap", @@ -1700,7 +1700,7 @@ dependencies = [ "tantivy-bitpacker", "tantivy-columnar", "tantivy-common", - "tantivy-fst 0.5.0 (git+https://github.com/paradedb/fst.git)", + "tantivy-fst", "tantivy-query-grammar", "tantivy-sstable", "tantivy-stacker", @@ -1757,17 +1757,6 @@ dependencies = [ "time", ] -[[package]] -name = "tantivy-fst" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" -dependencies = [ - "byteorder", - "regex-syntax 0.8.5", - "utf8-ranges", -] - [[package]] name = "tantivy-fst" version = "0.5.0" @@ -1801,7 +1790,7 @@ dependencies = [ "rand", "tantivy-bitpacker", "tantivy-common", - "tantivy-fst 0.5.0 
(registry+https://github.com/rust-lang/crates.io-index)", + "tantivy-fst", "zstd", ] diff --git a/columnar/benches/bench_merge.rs b/columnar/benches/bench_merge.rs index a4b6c3b3f..43abd5192 100644 --- a/columnar/benches/bench_merge.rs +++ b/columnar/benches/bench_merge.rs @@ -40,7 +40,14 @@ fn main() { let columnar_readers = columnar_readers.iter().collect::>(); let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]); - merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap(); + merge_columnar( + &columnar_readers, + &[], + merge_row_order.into(), + &mut out, + || false, + ) + .unwrap(); Some(out.len() as u64) }, ); diff --git a/columnar/src/column_index/optional_index/tests.rs b/columnar/src/column_index/optional_index/tests.rs index e27537239..a47a3864d 100644 --- a/columnar/src/column_index/optional_index/tests.rs +++ b/columnar/src/column_index/optional_index/tests.rs @@ -164,7 +164,11 @@ fn test_optional_index_large() { fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) { let optional_index = OptionalIndex::for_test(num_rows, row_ids); assert_eq!(optional_index.num_docs(), num_rows); - assert!(optional_index.iter_rows().eq(row_ids.iter().copied())); + assert!( + optional_index + .iter_non_null_docs() + .eq(row_ids.iter().copied()) + ); } #[test] diff --git a/columnar/src/columnar/merge/tests.rs b/columnar/src/columnar/merge/tests.rs index 359c12446..3cfc147d4 100644 --- a/columnar/src/columnar/merge/tests.rs +++ b/columnar/src/columnar/merge/tests.rs @@ -571,6 +571,7 @@ proptest! { &[], MergeRowOrder::Stack(stack_merge_order), &mut out, + || false, ).unwrap(); let merged_reader = ColumnarReader::open(out).unwrap(); @@ -588,6 +589,7 @@ proptest! 
{ &[], MergeRowOrder::Stack(stack_merge_order), &mut out, + || false, ).unwrap(); } diff --git a/src/index/segment_reader.rs b/src/index/segment_reader.rs index 1f91d2f18..db32e0fa5 100644 --- a/src/index/segment_reader.rs +++ b/src/index/segment_reader.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::ops::BitOrAssign; use std::path::PathBuf; use std::sync::{Arc, OnceLock, RwLock}; use std::{fmt, io}; @@ -377,7 +376,8 @@ impl SegmentReader { if is_json { let term_dictionary_json_field_num_bytes: u64 = self .termdict_composite - .open_read(field) + .get() + .and_then(|composite| composite.open_read(field)) .map(|file_slice| file_slice.len() as u64) .unwrap_or(0u64); let inv_index = self.inverted_index(field)?; @@ -429,19 +429,22 @@ impl SegmentReader { } else { let postings_size: ByteCount = self .postings_composite - .open_read(field) + .get() + .and_then(|composite| composite.open_read(field)) .map(|posting_fileslice| posting_fileslice.len()) .unwrap_or(0) .into(); let positions_size: ByteCount = self .positions_composite - .open_read(field) + .get() + .and_then(|composite| composite.open_read(field)) .map(|positions_fileslice| positions_fileslice.len()) .unwrap_or(0) .into(); let term_dictionary_size: ByteCount = self .termdict_composite - .open_read(field) + .get() + .and_then(|composite| composite.open_read(field)) .map(|term_dictionary_fileslice| term_dictionary_fileslice.len()) .unwrap_or(0) .into(); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index aec3abc4e..994350b38 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -1,4 +1,3 @@ -use std::any::Any; use std::borrow::BorrowMut; use std::collections::HashSet; use std::io::Write; diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 4f1f196b0..b5fc99f1a 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -101,7 +101,7 @@ where scorers.push(scorer); } - let scorer = 
BufferedUnionScorer::build(scorers, SumCombiner::default); + let scorer = BufferedUnionScorer::build(scorers, SumCombiner::default, reader.max_doc()); Ok(Box::new(scorer)) } diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 659c69051..92c247dad 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -90,7 +90,11 @@ impl Intersection { } } - pub fn with_two_sets(left: TDocSet, right: TDocSet) -> Intersection { + pub fn with_two_sets( + left: TDocSet, + right: TDocSet, + num_docs: u32, + ) -> Intersection { let mut docsets = vec![left, right]; go_to_first_doc(&mut docsets); let left = docsets.remove(0); @@ -99,6 +103,7 @@ impl Intersection { left, right, others: docsets, + num_docs, } } } diff --git a/src/query/mod.rs b/src/query/mod.rs index a17f0459a..37798f495 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -22,6 +22,7 @@ mod range_query; mod regex_query; mod reqopt_scorer; mod scorer; +mod size_hint; mod term_query; mod term_set_query; mod union; @@ -49,9 +50,7 @@ pub use self::explanation::{does_not_match, Explanation}; pub(crate) use self::fuzzy_query::DfaWrapper; pub use self::fuzzy_query::FuzzyTermQuery; pub use self::intersection::{intersect_scorers, Intersection}; -pub use self::more_like_this::{ - MoreLikeThis, MoreLikeThisQuery, MoreLikeThisQueryBuilder, ScoreTerm, -}; +pub use self::more_like_this::{MoreLikeThis, MoreLikeThisQuery, MoreLikeThisQueryBuilder}; pub use self::phrase_prefix_query::PhrasePrefixQuery; pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery}; pub use self::phrase_query::regex_phrase_weight::RegexPhraseWeight; @@ -123,7 +122,9 @@ mod tests { query.query_terms(text_field, &segment_reader, &mut |term, pos| { terms.push((term.clone(), pos)) }); - assert_eq!(vec![(term_a.clone(), false); 5], terms); + // With the new query_terms signature that includes segment_reader, + // duplicate terms are deduplicated + assert_eq!(vec![(term_a.clone(), 
false)], terms); } { let query = query_parser.parse_query("a -b").unwrap(); diff --git a/src/query/more_like_this/mod.rs b/src/query/more_like_this/mod.rs index 943ddc69e..277d2b9bb 100644 --- a/src/query/more_like_this/mod.rs +++ b/src/query/more_like_this/mod.rs @@ -3,5 +3,5 @@ mod more_like_this; /// Module containing the different query implementations. mod query; -pub use self::more_like_this::{MoreLikeThis, ScoreTerm}; +pub use self::more_like_this::MoreLikeThis; pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder}; diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index e617ad8bc..38640034c 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -1,8 +1,10 @@ use std::fmt; +use std::ops::Bound; use super::term_weight::TermWeight; use crate::query::bm25::Bm25Weight; -use crate::query::{EnableScoring, Explanation, Query, Weight}; +use crate::query::range_query::is_type_valid_for_fastfield_range_query; +use crate::query::{EnableScoring, Explanation, Query, RangeQuery, Weight}; use crate::schema::{Field, IndexRecordOption}; use crate::{SegmentReader, Term}; @@ -122,6 +124,24 @@ impl TermQuery { impl Query for TermQuery { fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result> { + // If the field is not indexed but is a suitable fast field, fall back to a range query + // on the fast field matching exactly this term. + // + // Note: This is considerably slower since it requires scanning the entire fast field. 
+ // TODO: The range query would gain from having a single-value optimization + let schema = enable_scoring.schema(); + let field_entry = schema.get_field_entry(self.term.field()); + if !field_entry.is_indexed() + && field_entry.is_fast() + && is_type_valid_for_fastfield_range_query(self.term.typ()) + && !enable_scoring.is_scoring_enabled() + { + let range_query = RangeQuery::new( + Bound::Included(self.term.clone()), + Bound::Included(self.term.clone()), + ); + return range_query.weight(enable_scoring); + } Ok(Box::new(self.specialized_weight(enable_scoring)?)) } fn query_terms( diff --git a/src/query/term_set_query/term_set_query_fastfield.rs b/src/query/term_set_query/term_set_query_fastfield.rs index 316840400..95c110986 100644 --- a/src/query/term_set_query/term_set_query_fastfield.rs +++ b/src/query/term_set_query/term_set_query_fastfield.rs @@ -1,6 +1,6 @@ use std::net::Ipv6Addr; -use columnar::{Column, ColumnType, MonotonicallyMappableToU64}; +use columnar::{Column, ColumnType}; use rustc_hash::{FxHashMap, FxHashSet}; use crate::query::score_combiner::DoNothingCombiner; diff --git a/sstable/Cargo.toml b/sstable/Cargo.toml index 7b353cece..473ceb55d 100644 --- a/sstable/Cargo.toml +++ b/sstable/Cargo.toml @@ -14,7 +14,7 @@ common = {version= "0.10", path="../common", package="tantivy-common"} futures-util = "0.3.30" itertools = "0.14.0" tantivy-bitpacker = { version= "0.9", path="../bitpacker" } -tantivy-fst = "0.5" +tantivy-fst = { git = "https://github.com/paradedb/fst.git" } # experimental gives us access to Decompressor::upper_bound zstd = { version = "0.13", optional = true, features = ["experimental"] } diff --git a/sstable/src/lib.rs b/sstable/src/lib.rs index e51a338db..83452bde9 100644 --- a/sstable/src/lib.rs +++ b/sstable/src/lib.rs @@ -24,7 +24,7 @@ //! //! // Open the sstable. //! let sstable = -//! Dictionary::::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap(); +//! 
Dictionary::::from_bytes_for_tests(OwnedBytes::new(sstable_bytes)).unwrap(); //! //! // Search for a key. //! let value = sstable.get(b"banana").unwrap(); diff --git a/sstable/tests/sstable_test.rs b/sstable/tests/sstable_test.rs index c08547404..9968522d9 100644 --- a/sstable/tests/sstable_test.rs +++ b/sstable/tests/sstable_test.rs @@ -12,7 +12,8 @@ fn test_create_and_search_sstable() { // Open the sstable. let sstable = - Dictionary::::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap(); + Dictionary::::from_bytes_for_tests(OwnedBytes::new(sstable_bytes)) + .unwrap(); // Search for a key. let value = sstable.get(b"banana").unwrap(); @@ -34,7 +35,8 @@ fn test_custom_value_sstable() { // Open the sstable. let sstable = - Dictionary::::from_bytes(OwnedBytes::new(sstable_bytes)).unwrap(); + Dictionary::::from_bytes_for_tests(OwnedBytes::new(sstable_bytes)) + .unwrap(); let mut stream = sstable.stream().unwrap(); assert!(stream.advance());