mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-29 13:50:43 +00:00
Compare commits
10 Commits
trinity.po
...
mallets/fi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c6eb92c7e | ||
|
|
9f73d3fe54 | ||
|
|
50272240c6 | ||
|
|
a6fd070e3d | ||
|
|
06c046bdc9 | ||
|
|
63f65dcd71 | ||
|
|
8a4c5b9013 | ||
|
|
f7355e60cd | ||
|
|
90603d2396 | ||
|
|
910861a3e9 |
@@ -81,11 +81,6 @@ fn bench_agg(mut group: InputGroup<Index>) {
|
||||
register!(group, composite_histogram);
|
||||
register!(group, composite_histogram_calendar);
|
||||
|
||||
// multi_terms aggregation benchmarks
|
||||
register!(group, multi_terms_status_with_zipf_1000);
|
||||
register!(group, multi_terms_zipf_1000_with_status);
|
||||
register!(group, multi_terms_status_with_zipf_1000_sub_agg);
|
||||
|
||||
register!(group, cardinality_agg);
|
||||
register!(group, cardinality_agg_high_card);
|
||||
register!(group, cardinality_agg_low_card);
|
||||
@@ -573,58 +568,6 @@ fn composite_histogram_calendar(index: &Index) {
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
/// multi_terms equivalent of terms_status_with_terms_zipf_1000_sub_agg:
|
||||
/// flat GroupBy(status, zipf_1000) vs nested terms(status) -> terms(zipf_1000)
|
||||
fn multi_terms_status_with_zipf_1000(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"mt": {
|
||||
"multi_terms": {
|
||||
"terms": [
|
||||
{"field": "text_few_terms_status"},
|
||||
{"field": "text_1000_terms_zipf"}
|
||||
],
|
||||
"size": 10
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
/// multi_terms equivalent of terms_zipf_1000_with_terms_status_sub_agg:
|
||||
/// flat GroupBy(zipf_1000, status) vs nested terms(zipf_1000) -> terms(status)
|
||||
fn multi_terms_zipf_1000_with_status(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"mt": {
|
||||
"multi_terms": {
|
||||
"terms": [
|
||||
{"field": "text_1000_terms_zipf"},
|
||||
{"field": "text_few_terms_status"}
|
||||
],
|
||||
"size": 100
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
/// multi_terms on the same field pair as the nested benchmarks, with an avg sub-aggregation
|
||||
fn multi_terms_status_with_zipf_1000_sub_agg(index: &Index) {
|
||||
let agg_req = json!({
|
||||
"mt": {
|
||||
"multi_terms": {
|
||||
"terms": [
|
||||
{"field": "text_few_terms_status"},
|
||||
{"field": "text_1000_terms_zipf"}
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_f64": { "avg": { "field": "score_f64" } }
|
||||
}
|
||||
}
|
||||
});
|
||||
execute_agg(index, agg_req);
|
||||
}
|
||||
|
||||
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
|
||||
let collector = get_collector(agg_req);
|
||||
|
||||
@@ -54,6 +54,6 @@ pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str)
|
||||
}
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||
columnar_writer.serialize(num_docs, None, &mut wrt).unwrap();
|
||||
ColumnarReader::open(wrt).unwrap()
|
||||
}
|
||||
|
||||
@@ -281,12 +281,16 @@ impl BitSet {
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
///
|
||||
/// Returns true if the set changed.
|
||||
#[inline]
|
||||
pub fn insert(&mut self, el: u32) {
|
||||
pub fn insert(&mut self, el: u32) -> bool {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len += u64::from(self.tinysets[higher as usize].insert_mut(lower));
|
||||
let changed = self.tinysets[higher as usize].insert_mut(lower);
|
||||
self.len += u64::from(changed);
|
||||
changed
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
|
||||
@@ -13,9 +13,8 @@ use crate::aggregation::bucket::{
|
||||
build_segment_filter_collector, build_segment_histogram_collector,
|
||||
build_segment_range_collector, CompositeAggReqData, CompositeAggregation,
|
||||
CompositeSourceAccessors, FilterAggReqData, HistogramAggReqData, HistogramBounds,
|
||||
IncludeExcludeParam, MissingTermAggReqData, MultiTermsAggReqData, MultiTermsAggregation,
|
||||
MultiTermsFieldAccessors, RangeAggReqData, SegmentMultiTermsCollector, TermMissingAgg,
|
||||
TermsAggReqData, TermsAggregation, TermsAggregationInternal,
|
||||
IncludeExcludeParam, MissingTermAggReqData, RangeAggReqData, TermMissingAgg, TermsAggReqData,
|
||||
TermsAggregation, TermsAggregationInternal,
|
||||
};
|
||||
use crate::aggregation::metric::{
|
||||
build_segment_stats_collector, AverageAggregation, CardinalityAggReqData,
|
||||
@@ -77,10 +76,6 @@ impl AggregationsSegmentCtx {
|
||||
self.per_request.composite_req_data.push(data);
|
||||
self.per_request.composite_req_data.len() - 1
|
||||
}
|
||||
pub(crate) fn push_multi_terms_req_data(&mut self, data: MultiTermsAggReqData) -> usize {
|
||||
self.per_request.multi_terms_req_data.push(data);
|
||||
self.per_request.multi_terms_req_data.len() - 1
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_term_req_data(&self, idx: usize) -> &TermsAggReqData {
|
||||
@@ -130,8 +125,6 @@ pub struct PerRequestAggSegCtx {
|
||||
pub missing_term_req_data: Vec<MissingTermAggReqData>,
|
||||
/// CompositeAggReqData contains the request data for a composite aggregation.
|
||||
pub composite_req_data: Vec<CompositeAggReqData>,
|
||||
/// MultiTermsAggReqData contains the request data for a multi_terms aggregation.
|
||||
pub multi_terms_req_data: Vec<MultiTermsAggReqData>,
|
||||
|
||||
/// Request tree used to build collectors.
|
||||
pub agg_tree: Vec<AggRefNode>,
|
||||
@@ -184,11 +177,6 @@ impl PerRequestAggSegCtx {
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_terms_req_data
|
||||
.iter()
|
||||
.map(|t| t.get_memory_consumption())
|
||||
.sum::<usize>()
|
||||
+ self.agg_tree.len() * std::mem::size_of::<AggRefNode>()
|
||||
}
|
||||
|
||||
@@ -206,7 +194,6 @@ impl PerRequestAggSegCtx {
|
||||
AggKind::Range => self.range_req_data[idx].name.as_str(),
|
||||
AggKind::Filter => self.filter_req_data[idx].name.as_str(),
|
||||
AggKind::Composite => self.composite_req_data[idx].name.as_str(),
|
||||
AggKind::MultiTerms => self.multi_terms_req_data[idx].name.as_str(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -360,9 +347,6 @@ pub(crate) fn build_segment_agg_collector(
|
||||
req, node,
|
||||
)?,
|
||||
)),
|
||||
AggKind::MultiTerms => Ok(Box::new(SegmentMultiTermsCollector::from_req_and_validate(
|
||||
req, node,
|
||||
)?)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -394,7 +378,6 @@ pub enum AggKind {
|
||||
Range,
|
||||
Filter,
|
||||
Composite,
|
||||
MultiTerms,
|
||||
}
|
||||
|
||||
impl AggKind {
|
||||
@@ -411,7 +394,6 @@ impl AggKind {
|
||||
AggKind::Range => "Range",
|
||||
AggKind::Filter => "Filter",
|
||||
AggKind::Composite => "Composite",
|
||||
AggKind::MultiTerms => "MultiTerms",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -667,14 +649,6 @@ fn build_nodes(
|
||||
&req.sub_aggregation,
|
||||
composite_req,
|
||||
)?]),
|
||||
AggregationVariants::MultiTerms(multi_terms_req) => Ok(vec![build_multi_terms_node(
|
||||
agg_name,
|
||||
reader,
|
||||
segment_ordinal,
|
||||
data,
|
||||
&req.sub_aggregation,
|
||||
multi_terms_req,
|
||||
)?]),
|
||||
AggregationVariants::Filter(filter_req) => {
|
||||
// Build the query and evaluator upfront
|
||||
let schema = reader.schema();
|
||||
@@ -733,111 +707,6 @@ fn build_composite_node(
|
||||
})
|
||||
}
|
||||
|
||||
fn build_multi_terms_node(
|
||||
agg_name: &str,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
data: &mut AggregationsSegmentCtx,
|
||||
sub_aggs: &Aggregations,
|
||||
req: &MultiTermsAggregation,
|
||||
) -> crate::Result<AggRefNode> {
|
||||
use crate::aggregation::bucket::KeyElem;
|
||||
|
||||
if req.terms.is_empty() {
|
||||
return Err(crate::TantivyError::InvalidArgument(
|
||||
"multi_terms aggregation requires at least one field".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let mut fields = Vec::with_capacity(req.terms.len());
|
||||
|
||||
for field_def in &req.terms {
|
||||
let field_name = &field_def.field;
|
||||
let str_dict_column = reader.fast_fields().str(field_name)?;
|
||||
|
||||
// Collect all columns for this field (handles JSON multi-type fields).
|
||||
let columns = get_term_agg_accessors(reader, field_name, &field_def.missing)?;
|
||||
|
||||
// Precompute the missing KeyElem (or None -> drop combo).
|
||||
let missing_key_elem = if let Some(missing) = &field_def.missing {
|
||||
match missing {
|
||||
Key::Str(missing_str) => {
|
||||
match columns.iter().position(|(_, ct)| *ct == ColumnType::Str) {
|
||||
Some(idx) => {
|
||||
match str_dict_column
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.dictionary()
|
||||
.term_ord(missing_str.as_bytes())?
|
||||
{
|
||||
Some(ord) => Some(KeyElem::new(idx as u32, ord)),
|
||||
None => Some(KeyElem::synthetic_missing()),
|
||||
}
|
||||
}
|
||||
None => Some(KeyElem::synthetic_missing()),
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Non-string missing: find the column whose type best matches the
|
||||
// missing key. Prefer an exact-type match; fall back to any numeric
|
||||
// column so cross-type coercions (e.g. Key::F64 on an I64 column)
|
||||
// still work.
|
||||
let preferred_type = match missing {
|
||||
Key::F64(_) => ColumnType::F64,
|
||||
Key::I64(_) => ColumnType::I64,
|
||||
Key::U64(_) => ColumnType::U64,
|
||||
Key::Str(_) => unreachable!("handled by Key::Str arm"),
|
||||
};
|
||||
let idx = columns
|
||||
.iter()
|
||||
.position(|(_, ct)| *ct == preferred_type)
|
||||
.or_else(|| {
|
||||
columns
|
||||
.iter()
|
||||
.position(|(_, ct)| ct.numerical_type().is_some())
|
||||
});
|
||||
match idx {
|
||||
Some(idx) => {
|
||||
let (col, col_type) = &columns[idx];
|
||||
get_missing_val_as_u64_lenient(
|
||||
*col_type,
|
||||
col.max_value(),
|
||||
missing,
|
||||
field_name,
|
||||
)?
|
||||
.map(|sentinel| KeyElem::new(idx as u32, sentinel))
|
||||
}
|
||||
None => Some(KeyElem::synthetic_missing()),
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
fields.push(MultiTermsFieldAccessors {
|
||||
columns,
|
||||
str_dict_column,
|
||||
missing: field_def.missing.clone(),
|
||||
missing_key_elem,
|
||||
field: field_name.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
let idx = data.push_multi_terms_req_data(MultiTermsAggReqData {
|
||||
name: agg_name.to_string(),
|
||||
req: req.clone(),
|
||||
fields,
|
||||
sub_aggregations: sub_aggs.clone(),
|
||||
});
|
||||
let children = build_children(sub_aggs, reader, segment_ordinal, data)?;
|
||||
Ok(AggRefNode {
|
||||
kind: AggKind::MultiTerms,
|
||||
idx_in_req_data: idx,
|
||||
children,
|
||||
})
|
||||
}
|
||||
|
||||
fn build_children(
|
||||
aggs: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
@@ -1062,7 +931,9 @@ fn build_allowed_term_ids_for_str(
|
||||
// add matches
|
||||
allowed = Some(BitSet::with_max_value(allowed_capacity));
|
||||
let allowed = allowed.as_mut().unwrap();
|
||||
for_each_matching_term_ord(str_col, include, |ord| allowed.insert(ord))?;
|
||||
for_each_matching_term_ord(str_col, include, |ord| {
|
||||
let _ = allowed.insert(ord);
|
||||
})?;
|
||||
};
|
||||
|
||||
if let Some(exclude) = exclude {
|
||||
|
||||
@@ -33,7 +33,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::bucket::{
|
||||
CompositeAggregation, DateHistogramAggregationReq, FilterAggregation, HistogramAggregation,
|
||||
MultiTermsAggregation, RangeAggregation, TermsAggregation,
|
||||
RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||
@@ -202,9 +202,6 @@ pub enum AggregationVariants {
|
||||
/// Multi-dimensional, paginable bucket aggregation.
|
||||
#[serde(rename = "composite")]
|
||||
Composite(CompositeAggregation),
|
||||
/// Bucket aggregation over unique combinations of values across multiple term fields.
|
||||
#[serde(rename = "multi_terms")]
|
||||
MultiTerms(MultiTermsAggregation),
|
||||
|
||||
// Metric aggregation types
|
||||
/// Computes the average of the extracted values.
|
||||
@@ -256,9 +253,6 @@ impl AggregationVariants {
|
||||
.iter()
|
||||
.map(|source| source.field())
|
||||
.collect(),
|
||||
AggregationVariants::MultiTerms(mt) => {
|
||||
mt.terms.iter().map(|t| t.field.as_str()).collect()
|
||||
}
|
||||
AggregationVariants::Average(avg) => vec![avg.field_name()],
|
||||
AggregationVariants::Count(count) => vec![count.field_name()],
|
||||
AggregationVariants::Max(max) => vec![max.field_name()],
|
||||
@@ -299,12 +293,6 @@ impl AggregationVariants {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_multi_terms(&self) -> Option<&MultiTermsAggregation> {
|
||||
match &self {
|
||||
AggregationVariants::MultiTerms(mt) => Some(mt),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
|
||||
match &self {
|
||||
AggregationVariants::Percentiles(percentile_req) => Some(percentile_req),
|
||||
|
||||
@@ -152,25 +152,12 @@ pub enum BucketResult {
|
||||
///
|
||||
/// See [`TermsAggregation`](super::bucket::TermsAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
/// The number of documents that didn't make it into to TOP N due to shard_size or size
|
||||
/// The number of documents that didn’t make it into to TOP N due to shard_size or size
|
||||
sum_other_doc_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The upper bound error for the doc count of each term.
|
||||
doc_count_error_upper_bound: Option<u64>,
|
||||
},
|
||||
/// This is the multi_terms result -- placed AFTER Terms so that a zero-bucket result
|
||||
/// deserializes as Terms (the more common case). Non-empty MultiTerms still deserializes
|
||||
/// correctly because its array `key` fails Terms' scalar `key` check first. The only known
|
||||
/// ambiguity is an empty MultiTerms result decoding as Terms (deserialization only).
|
||||
MultiTerms {
|
||||
/// The buckets (one per unique combination of field values).
|
||||
buckets: Vec<MultiTermsBucketEntry>,
|
||||
/// The number of documents that didn't make it into the TOP N.
|
||||
sum_other_doc_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The upper bound error for the doc count of each term combination.
|
||||
doc_count_error_upper_bound: Option<u64>,
|
||||
},
|
||||
/// This is the filter result - a single bucket with sub-aggregations
|
||||
Filter(FilterBucketResult),
|
||||
/// This is the composite result
|
||||
@@ -192,11 +179,6 @@ impl BucketResult {
|
||||
BucketResult::Histogram { buckets } => {
|
||||
buckets.iter().map(|bucket| bucket.get_bucket_count()).sum()
|
||||
}
|
||||
BucketResult::MultiTerms {
|
||||
buckets,
|
||||
sum_other_doc_count: _,
|
||||
doc_count_error_upper_bound: _,
|
||||
} => buckets.iter().map(|bucket| bucket.get_bucket_count()).sum(),
|
||||
BucketResult::Terms {
|
||||
buckets,
|
||||
sum_other_doc_count: _,
|
||||
@@ -290,35 +272,6 @@ impl GetDocCount for BucketEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Bucket entry for a [`multi_terms`](super::bucket::MultiTermsAggregation) aggregation.
|
||||
///
|
||||
/// The key is a vector of values (one per declared field), and `key_as_string` is the pipe-joined
|
||||
/// representation.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct MultiTermsBucketEntry {
|
||||
/// Pipe-joined string representation of all key elements, e.g. `"rock|Product A"`.
|
||||
pub key_as_string: String,
|
||||
/// The composite key: one [`Key`] per field in declaration order.
|
||||
pub key: Vec<Key>,
|
||||
/// Number of documents in this bucket.
|
||||
pub doc_count: u64,
|
||||
/// Sub-aggregation results.
|
||||
#[serde(flatten)]
|
||||
pub sub_aggregation: AggregationResults,
|
||||
}
|
||||
|
||||
impl MultiTermsBucketEntry {
|
||||
pub(crate) fn get_bucket_count(&self) -> u64 {
|
||||
1 + self.sub_aggregation.get_bucket_count()
|
||||
}
|
||||
}
|
||||
|
||||
impl GetDocCount for MultiTermsBucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub-aggregations.
|
||||
///
|
||||
|
||||
@@ -25,7 +25,6 @@
|
||||
mod composite;
|
||||
mod filter;
|
||||
mod histogram;
|
||||
mod multi_terms;
|
||||
mod range;
|
||||
mod term_agg;
|
||||
mod term_missing_agg;
|
||||
@@ -36,7 +35,6 @@ use std::fmt;
|
||||
pub use composite::*;
|
||||
pub use filter::*;
|
||||
pub use histogram::*;
|
||||
pub use multi_terms::*;
|
||||
pub use range::*;
|
||||
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
|
||||
pub use term_agg::*;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -28,7 +28,7 @@ use super::{format_date, AggregationError, Key, SerializedKey};
|
||||
use crate::aggregation::agg_result::{
|
||||
AggregationResults, BucketEntries, BucketEntry, CompositeBucketEntry, FilterBucketResult,
|
||||
};
|
||||
use crate::aggregation::bucket::{IntermediateMultiTermsBucketResult, TermsAggregationInternal};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
use crate::aggregation::metric::CardinalityCollector;
|
||||
use crate::TantivyError;
|
||||
|
||||
@@ -82,7 +82,7 @@ impl From<IntermediateKey> for Key {
|
||||
}
|
||||
}
|
||||
IntermediateKey::F64(f) => Self::F64(f),
|
||||
IntermediateKey::Bool(f) => Self::Str(f.to_string()),
|
||||
IntermediateKey::Bool(f) => Self::U64(f as u64),
|
||||
IntermediateKey::U64(f) => Self::U64(f),
|
||||
IntermediateKey::I64(f) => Self::I64(f),
|
||||
}
|
||||
@@ -286,11 +286,6 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
|
||||
buckets: IntermediateCompositeBucketResult::default(),
|
||||
})
|
||||
}
|
||||
MultiTerms(_) => {
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::MultiTerms {
|
||||
buckets: Default::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -504,11 +499,6 @@ pub enum IntermediateBucketResult {
|
||||
/// The composite buckets
|
||||
buckets: IntermediateCompositeBucketResult,
|
||||
},
|
||||
/// Multi-terms aggregation
|
||||
MultiTerms {
|
||||
/// The multi-terms buckets
|
||||
buckets: IntermediateMultiTermsBucketResult,
|
||||
},
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
@@ -611,13 +601,6 @@ impl IntermediateBucketResult {
|
||||
.expect("unexpected aggregation, expected composite aggregation");
|
||||
buckets.into_final_result(composite_req, req.sub_aggregation(), limits)
|
||||
}
|
||||
IntermediateBucketResult::MultiTerms { buckets } => {
|
||||
let multi_terms_req = req
|
||||
.agg
|
||||
.as_multi_terms()
|
||||
.expect("unexpected aggregation, expected multi_terms aggregation");
|
||||
buckets.into_final_result(multi_terms_req, req.sub_aggregation(), limits)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -694,14 +677,6 @@ impl IntermediateBucketResult {
|
||||
) => {
|
||||
composite_left.merge_fruits(composite_right)?;
|
||||
}
|
||||
(
|
||||
IntermediateBucketResult::MultiTerms { buckets: mt_left },
|
||||
IntermediateBucketResult::MultiTerms { buckets: mt_right },
|
||||
) => {
|
||||
merge_maps(&mut mt_left.entries, mt_right.entries)?;
|
||||
mt_left.sum_other_doc_count += mt_right.sum_other_doc_count;
|
||||
mt_left.doc_count_error_upper_bound += mt_right.doc_count_error_upper_bound;
|
||||
}
|
||||
(IntermediateBucketResult::Range(_), _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
@@ -717,9 +692,6 @@ impl IntermediateBucketResult {
|
||||
(IntermediateBucketResult::Composite { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::MultiTerms { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use crate::collector::Count;
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::index::SegmentId;
|
||||
use crate::indexer::{LogMergePolicy, NoMergePolicy};
|
||||
use crate::indexer::{DocIdMapping, LogMergePolicy, NoMergePolicy};
|
||||
use crate::postings::Postings;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, INDEXED, STRING, TEXT};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, Value, INDEXED, STORED, STRING, TEXT};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{
|
||||
Directory, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter, ReloadPolicy,
|
||||
TantivyDocument, Term,
|
||||
Directory, DocAddress, DocSet, Index, IndexBuilder, IndexReader, IndexSettings, IndexWriter,
|
||||
ReloadPolicy, TantivyDocument, Term,
|
||||
};
|
||||
|
||||
#[test]
|
||||
@@ -300,6 +300,51 @@ fn test_single_segment_index_writer() -> crate::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_segment_index_writer_with_doc_id_mapping() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let directory = RamDirectory::default();
|
||||
let settings = IndexSettings {
|
||||
manual_doc_id_mapping: true,
|
||||
..Default::default()
|
||||
};
|
||||
let mut single_segment_index_writer = Index::builder()
|
||||
.schema(schema)
|
||||
.settings(settings)
|
||||
.single_segment_index_writer(directory, 15_000_000)?;
|
||||
|
||||
single_segment_index_writer.add_document(doc!(text_field=>"alpha beta"))?;
|
||||
single_segment_index_writer.add_document(doc!())?;
|
||||
single_segment_index_writer.add_document(doc!(text_field=>"gamma"))?;
|
||||
|
||||
let mapping = DocIdMapping::new_permutation(vec![2, 1, 0])?;
|
||||
let index = single_segment_index_writer.finalize_with_doc_id_mapping(&mapping)?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field)?;
|
||||
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 1);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 2);
|
||||
|
||||
let doc_0 = searcher.doc::<TantivyDocument>(DocAddress::new(0, 0))?;
|
||||
assert_eq!(
|
||||
doc_0.get_first(text_field).and_then(|val| val.as_str()),
|
||||
Some("gamma")
|
||||
);
|
||||
let doc_1 = searcher.doc::<TantivyDocument>(DocAddress::new(0, 1))?;
|
||||
assert!(doc_1.get_first(text_field).is_none());
|
||||
let doc_2 = searcher.doc::<TantivyDocument>(DocAddress::new(0, 2))?;
|
||||
assert_eq!(
|
||||
doc_2.get_first(text_field).and_then(|val| val.as_str()),
|
||||
Some("alpha beta")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merging_segment_update_docfreq() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -250,6 +250,11 @@ pub struct IndexSettings {
|
||||
/// provided in `IndexSortByField`
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub sort_by_field: Option<IndexSortByField>,
|
||||
/// If true, enables caller-provided doc id mappings at segment finalization time.
|
||||
/// Always skip serializing this field since it's only used at segment finalization time.
|
||||
#[doc(hidden)]
|
||||
#[serde(skip)]
|
||||
pub manual_doc_id_mapping: bool,
|
||||
/// The `Compressor` used to compress the doc store.
|
||||
#[serde(default)]
|
||||
pub docstore_compression: Compressor,
|
||||
@@ -273,6 +278,7 @@ impl Default for IndexSettings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sort_by_field: None,
|
||||
manual_doc_id_mapping: false,
|
||||
docstore_compression: Compressor::default(),
|
||||
docstore_blocksize: default_docstore_blocksize(),
|
||||
docstore_compress_dedicated_thread: true,
|
||||
@@ -460,6 +466,7 @@ mod tests {
|
||||
field: "text".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
manual_doc_id_mapping: false,
|
||||
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
|
||||
compression_level: Some(4),
|
||||
}),
|
||||
@@ -529,6 +536,7 @@ mod tests {
|
||||
index_settings,
|
||||
IndexSettings {
|
||||
sort_by_field: None,
|
||||
manual_doc_id_mapping: false,
|
||||
docstore_compression: Compressor::default(),
|
||||
docstore_compress_dedicated_thread: true,
|
||||
docstore_blocksize: 16_384
|
||||
@@ -547,6 +555,18 @@ mod tests {
|
||||
serde_json::from_value(index_settings_json).unwrap();
|
||||
assert_eq!(index_settings_deser, index_settings);
|
||||
}
|
||||
{
|
||||
index_settings.manual_doc_id_mapping = true;
|
||||
let index_settings_json = serde_json::to_value(&index_settings).unwrap();
|
||||
assert_eq!(
|
||||
index_settings_json,
|
||||
serde_json::json!({
|
||||
"docstore_compression": "lz4",
|
||||
"docstore_blocksize": 16384
|
||||
})
|
||||
);
|
||||
index_settings.manual_doc_id_mapping = false;
|
||||
}
|
||||
{
|
||||
index_settings.docstore_compress_dedicated_thread = false;
|
||||
let index_settings_json = serde_json::to_value(&index_settings).unwrap();
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
//! This module is used when sorting the index by a property, e.g.
|
||||
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
|
||||
|
||||
use common::ReadOnlyBitSet;
|
||||
use common::{BitSet, ReadOnlyBitSet};
|
||||
|
||||
use super::SegmentWriter;
|
||||
use crate::schema::{Field, Schema};
|
||||
@@ -71,7 +70,33 @@ pub struct DocIdMapping {
|
||||
}
|
||||
|
||||
impl DocIdMapping {
|
||||
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
|
||||
/// Creates a `DocIdMapping` from a mapping of new doc ids to old doc ids, with permutation validation.
|
||||
/// The mapping is validated by checking that every old doc id appears exactly once in the mapping.
|
||||
/// I.e., doc ids must be consecutive from `0` to `new_doc_id_to_old.len() - 1`, inclusive.
|
||||
pub fn new_permutation(new_doc_id_to_old: Vec<DocId>) -> crate::Result<Self> {
|
||||
// Check that the mapping is a permutation of the segment doc ids.
|
||||
let max_doc = new_doc_id_to_old.len() as DocId;
|
||||
let mut old_doc_id_to_new = vec![0; max_doc as usize];
|
||||
|
||||
let mut seen_doc_ids = BitSet::with_max_value(max_doc);
|
||||
for (i, old_doc_id) in new_doc_id_to_old.iter().copied().enumerate() {
|
||||
if old_doc_id >= max_doc || !seen_doc_ids.insert(old_doc_id) {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Mapping must be a permutation of the segment doc ids".to_string(),
|
||||
));
|
||||
}
|
||||
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
|
||||
}
|
||||
|
||||
let doc_id_mapping = DocIdMapping {
|
||||
new_doc_id_to_old,
|
||||
old_doc_id_to_new,
|
||||
};
|
||||
Ok(doc_id_mapping)
|
||||
}
|
||||
|
||||
/// Creates a `DocIdMapping` from a mapping of new doc ids to old doc ids.
|
||||
pub(crate) fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
|
||||
let max_doc = new_doc_id_to_old.len();
|
||||
let old_max_doc = new_doc_id_to_old
|
||||
.iter()
|
||||
@@ -89,35 +114,41 @@ impl DocIdMapping {
|
||||
}
|
||||
}
|
||||
|
||||
/// returns the new doc_id for the old doc_id
|
||||
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
/// Returns the new doc_id for the old doc_id
|
||||
pub(crate) fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.old_doc_id_to_new[doc_id as usize]
|
||||
}
|
||||
/// returns the old doc_id for the new doc_id
|
||||
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.new_doc_id_to_old[doc_id as usize]
|
||||
}
|
||||
/// iterate over old doc_ids in order of the new doc_ids
|
||||
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
|
||||
self.new_doc_id_to_old.iter().cloned()
|
||||
|
||||
/// Iiterate over old doc_ids in order of the new doc_ids
|
||||
pub(crate) fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
|
||||
self.new_doc_id_to_old.iter().copied()
|
||||
}
|
||||
|
||||
pub fn old_to_new_ids(&self) -> &[DocId] {
|
||||
/// Returns the new doc_ids in order of the old doc_ids
|
||||
pub(crate) fn old_to_new_ids(&self) -> &[DocId] {
|
||||
&self.old_doc_id_to_new[..]
|
||||
}
|
||||
|
||||
/// Remaps a given array to the new doc ids.
|
||||
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
|
||||
pub(crate) fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
|
||||
self.new_doc_id_to_old
|
||||
.iter()
|
||||
.map(|old_doc| els[*old_doc as usize])
|
||||
.collect()
|
||||
}
|
||||
pub fn num_new_doc_ids(&self) -> usize {
|
||||
|
||||
/// Returns the number of documents in the mapping.
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
// new_doc_id_to_old and old_doc_id_to_new have the same length by construction.
|
||||
self.new_doc_id_to_old.len()
|
||||
}
|
||||
pub fn num_old_doc_ids(&self) -> usize {
|
||||
self.old_doc_id_to_new.len()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl DocIdMapping {
|
||||
/// returns the old doc_id for the new doc_id
|
||||
fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
|
||||
self.new_doc_id_to_old[doc_id as usize]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,7 +189,7 @@ mod tests_indexsorting {
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::*;
|
||||
use crate::{schema::*, TantivyError};
|
||||
use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order};
|
||||
|
||||
fn create_test_index(
|
||||
@@ -550,6 +581,18 @@ mod tests_indexsorting {
|
||||
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping_new_permutation_rejects_out_of_range() {
|
||||
let result = DocIdMapping::new_permutation(vec![5, 0]);
|
||||
assert!(matches!(result, Err(TantivyError::InvalidArgument(_)),));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping_new_permutation_rejects_duplicates() {
|
||||
let result = DocIdMapping::new_permutation(vec![0, 1, 0]);
|
||||
assert!(matches!(result, Err(TantivyError::InvalidArgument(_)),));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_mapping_remap() {
|
||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
|
||||
|
||||
@@ -33,6 +33,7 @@ mod stamper;
|
||||
use crossbeam_channel as channel;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub use self::doc_id_mapping::DocIdMapping;
|
||||
pub use self::index_writer::{advance_deletes, IndexWriter, IndexWriterOptions};
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::directory::WritePtr;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::index::{Segment, SegmentComponent};
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
use crate::store::StoreWriter;
|
||||
use crate::store::{Compressor, StoreWriter};
|
||||
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
@@ -25,17 +25,18 @@ impl SegmentSerializer {
|
||||
// If the segment is going to be sorted, we stream the docs first to a temporary file.
|
||||
// In the merge case this is not necessary because we can kmerge the already sorted
|
||||
// segments
|
||||
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
|
||||
let settings = segment.index().settings().clone();
|
||||
let remapping_required =
|
||||
(settings.sort_by_field.is_some() || settings.manual_doc_id_mapping) && !is_in_merge;
|
||||
let store_writer = if remapping_required {
|
||||
let store_write = segment.open_write(SegmentComponent::TempStore)?;
|
||||
StoreWriter::new(
|
||||
store_write,
|
||||
crate::store::Compressor::None,
|
||||
Compressor::None,
|
||||
// We want fast random access on the docs, so we choose a small block size.
|
||||
// If this is zero, the skip index will contain too many checkpoints and
|
||||
// therefore will be relatively slow.
|
||||
16000,
|
||||
16_000,
|
||||
settings.docstore_compress_dedicated_thread,
|
||||
)?
|
||||
} else {
|
||||
|
||||
@@ -136,10 +136,8 @@ impl SegmentWriter {
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter`
|
||||
///
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
|
||||
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot be used afterwards.
|
||||
pub fn finalize(self) -> crate::Result<Vec<u64>> {
|
||||
let mapping: Option<DocIdMapping> = self
|
||||
.segment_serializer
|
||||
.segment()
|
||||
@@ -149,6 +147,41 @@ impl SegmentWriter {
|
||||
.clone()
|
||||
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
|
||||
.transpose()?;
|
||||
self.finalize_inner(mapping.as_ref())
|
||||
}
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter` using the provided doc id mapping.
|
||||
///
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot be used afterwards.
|
||||
pub fn finalize_with_doc_id_mapping(self, mapping: &DocIdMapping) -> crate::Result<Vec<u64>> {
|
||||
// Ensure the segment writer was created in remap mode so the docstore can be reordered.
|
||||
if !self
|
||||
.segment_serializer
|
||||
.segment()
|
||||
.index()
|
||||
.settings()
|
||||
.manual_doc_id_mapping
|
||||
{
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"IndexSettings::manual_doc_id_mapping must be set to true".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// Check that the mapping eventually covers all documents in the segment.
|
||||
if mapping.len() != self.max_doc as usize {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Mapping must cover all documents in this segment. Expected {} documents, got {}",
|
||||
self.max_doc,
|
||||
mapping.len()
|
||||
)));
|
||||
}
|
||||
|
||||
self.finalize_inner(Some(mapping))
|
||||
}
|
||||
|
||||
fn finalize_inner(mut self, mapping: Option<&DocIdMapping>) -> crate::Result<Vec<u64>> {
|
||||
// Pad before remapping; the mapping indexes fieldnorms by old doc id.
|
||||
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
|
||||
remap_and_write(
|
||||
self.schema,
|
||||
&self.per_field_postings_writers,
|
||||
@@ -156,9 +189,9 @@ impl SegmentWriter {
|
||||
self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
mapping.as_ref(),
|
||||
mapping,
|
||||
)?;
|
||||
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
|
||||
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping);
|
||||
Ok(doc_opstamps)
|
||||
}
|
||||
|
||||
@@ -485,6 +518,7 @@ mod tests {
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::{Postings, TermInfo};
|
||||
use crate::query::{PhraseQuery, QueryParser};
|
||||
use crate::schema::{
|
||||
@@ -497,7 +531,7 @@ mod tests {
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::{
|
||||
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, SegmentReader,
|
||||
TantivyDocument, Term, TERMINATED,
|
||||
TantivyDocument, TantivyError, Term, TERMINATED,
|
||||
};
|
||||
|
||||
#[test]
|
||||
@@ -1136,4 +1170,87 @@ mod tests {
|
||||
"Schema error: 'Error getting tokenizer for field: title'"
|
||||
);
|
||||
}
|
||||
|
||||
/// Builds a `SegmentWriter` with a fast `u64` field and a text field that only some
|
||||
/// documents populate, so the text field is missing fieldnorms on some docs.
|
||||
///
|
||||
/// The `texts` slice provides, for each document, an optional text value. The order
|
||||
/// number is always recorded in the `order` fast field so callers can recover the
|
||||
/// original document via that value.
|
||||
fn build_segment_writer_with_doc_id_mapping(
|
||||
texts: &[Option<&str>],
|
||||
) -> (Index, crate::Segment, super::SegmentWriter) {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_u64_field("order", FAST | STORED);
|
||||
schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let mut index = Index::create_in_ram(schema);
|
||||
index.settings_mut().manual_doc_id_mapping = true;
|
||||
let segment = index.new_segment();
|
||||
let order = index.schema().get_field("order").unwrap();
|
||||
let text = index.schema().get_field("text").unwrap();
|
||||
let mut segment_writer =
|
||||
super::SegmentWriter::for_segment(15_000_000, segment.clone()).unwrap();
|
||||
for (opstamp, text_opt) in texts.iter().enumerate() {
|
||||
let mut doc = TantivyDocument::default();
|
||||
doc.add_u64(order, opstamp as u64);
|
||||
if let Some(text_value) = text_opt {
|
||||
doc.add_text(text, *text_value);
|
||||
}
|
||||
segment_writer
|
||||
.add_document(crate::indexer::AddOperation {
|
||||
opstamp: opstamp as u64,
|
||||
document: doc,
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
(index, segment, segment_writer)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_finalize_with_doc_id_mapping_rejects_wrong_length() {
|
||||
let (_index, _segment, segment_writer) =
|
||||
build_segment_writer_with_doc_id_mapping(&[Some("a"), Some("b"), Some("c")]);
|
||||
// Mapping only covers 2 of the 3 documents.
|
||||
let mapping = DocIdMapping::new_permutation(vec![1, 0]).unwrap();
|
||||
let err = segment_writer
|
||||
.finalize_with_doc_id_mapping(&mapping)
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
matches!(err, TantivyError::InvalidArgument(_)),
|
||||
"unexpected error: {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_finalize_with_doc_id_mapping_remaps_missing_fieldnorms() -> crate::Result<()> {
|
||||
// doc 0: "alpha beta" (2 tokens)
|
||||
// doc 1: <no text> (missing fieldnorm -> 0)
|
||||
// doc 2: "gamma" (1 token)
|
||||
// doc 3: <no text> (missing fieldnorm -> 0)
|
||||
let (index, segment, segment_writer) = build_segment_writer_with_doc_id_mapping(&[
|
||||
Some("alpha beta"),
|
||||
None,
|
||||
Some("gamma"),
|
||||
None,
|
||||
]);
|
||||
let max_doc = segment_writer.max_doc();
|
||||
|
||||
// Reverse the documents. New doc id i maps to old doc id (3 - i).
|
||||
let mapping = DocIdMapping::new_permutation(vec![3, 2, 1, 0])?;
|
||||
segment_writer.finalize_with_doc_id_mapping(&mapping)?;
|
||||
|
||||
let segment = segment.with_max_doc(max_doc);
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
let text = index.schema().get_field("text").unwrap();
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text)?;
|
||||
|
||||
// After remapping, fieldnorms follow the reversed order:
|
||||
// new 0 <- old 3 (0), new 1 <- old 2 (1), new 2 <- old 1 (0), new 3 <- old 0 (2)
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 1);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 2);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::marker::PhantomData;
|
||||
|
||||
use crate::indexer::operation::AddOperation;
|
||||
use crate::indexer::segment_updater::save_metas;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::indexer::{DocIdMapping, SegmentWriter};
|
||||
use crate::schema::document::Document;
|
||||
use crate::{Directory, Index, IndexMeta, Opstamp, Segment, TantivyDocument};
|
||||
|
||||
@@ -38,9 +38,29 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
|
||||
}
|
||||
|
||||
pub fn finalize(self) -> crate::Result<Index> {
|
||||
let max_doc = self.segment_writer.max_doc();
|
||||
self.segment_writer.finalize()?;
|
||||
let segment: Segment = self.segment.with_max_doc(max_doc);
|
||||
let Self {
|
||||
segment,
|
||||
segment_writer,
|
||||
..
|
||||
} = self;
|
||||
let max_doc = segment_writer.max_doc();
|
||||
segment_writer.finalize()?;
|
||||
Self::finalize_inner(segment, max_doc)
|
||||
}
|
||||
|
||||
pub fn finalize_with_doc_id_mapping(self, mapping: &DocIdMapping) -> crate::Result<Index> {
|
||||
let Self {
|
||||
segment,
|
||||
segment_writer,
|
||||
..
|
||||
} = self;
|
||||
let max_doc = segment_writer.max_doc();
|
||||
segment_writer.finalize_with_doc_id_mapping(mapping)?;
|
||||
Self::finalize_inner(segment, max_doc)
|
||||
}
|
||||
|
||||
fn finalize_inner(segment: Segment, max_doc: u32) -> crate::Result<Index> {
|
||||
let segment: Segment = segment.with_max_doc(max_doc);
|
||||
let index = segment.index();
|
||||
let index_meta = IndexMeta {
|
||||
index_settings: index.settings().clone(),
|
||||
@@ -51,6 +71,6 @@ impl<D: Document> SingleSegmentIndexWriter<D> {
|
||||
};
|
||||
save_metas(&index_meta, index.directory())?;
|
||||
index.directory().sync_directory()?;
|
||||
Ok(segment.index().clone())
|
||||
Ok(index.clone())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user