mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-29 12:20:36 +00:00
Compare commits
6 Commits
segmentrea
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9b619998bd | ||
|
|
765c448945 | ||
|
|
943594ebaa | ||
|
|
df17daae0d | ||
|
|
0ae94baef5 | ||
|
|
3f448ecf79 |
@@ -560,7 +560,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
(
|
||||
(
|
||||
value((), tag(">=")),
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
map(word_infallible(")", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
bound
|
||||
@@ -574,7 +574,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag("<=")),
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
map(word_infallible(")", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
UserInputBound::Unbounded,
|
||||
@@ -588,7 +588,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag(">")),
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
map(word_infallible(")", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
bound
|
||||
@@ -602,7 +602,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag("<")),
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
map(word_infallible(")", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
UserInputBound::Unbounded,
|
||||
@@ -1323,6 +1323,14 @@ mod test {
|
||||
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
|
||||
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
|
||||
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("(<=42)", "{\"*\" TO \"42\"]");
|
||||
test_parse_query_to_ast_helper("(<=42 )", "{\"*\" TO \"42\"]");
|
||||
test_parse_query_to_ast_helper("(age:>5)", "\"age\":{\"5\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper(
|
||||
"(title:bar AND age:>12)",
|
||||
"(+\"title\":bar +\"age\":{\"12\" TO \"*\"})",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -676,7 +676,7 @@ mod tests {
|
||||
let num_segments = reader.searcher().segment_readers().len();
|
||||
assert!(num_segments <= 4);
|
||||
let num_components_except_deletes_and_tempstore =
|
||||
crate::index::SegmentComponent::iterator().len() - 2;
|
||||
crate::index::SegmentComponent::iterator().len() - 1;
|
||||
let max_num_mmapped = num_components_except_deletes_and_tempstore * num_segments;
|
||||
assert_eventually(|| {
|
||||
let num_mmapped = mmap_directory.get_cache_info().mmapped.len();
|
||||
|
||||
@@ -65,8 +65,8 @@ pub trait DocSet: Send {
|
||||
/// `seek_danger(..)` until it returns `Found`, and get back to a valid state.
|
||||
///
|
||||
/// `seek_lower_bound` can be any `DocId` (in the docset or not) as long as it is in
|
||||
/// `(target .. seek_result]` where `seek_result` is the first document in the docset greater
|
||||
/// than `target`.
|
||||
/// `(target .. seek_result] U {TERMINATED}` where `seek_result` is the first document in the
|
||||
/// docset greater than `target`.
|
||||
///
|
||||
/// `seek_danger` may return `SeekLowerBound(TERMINATED)`.
|
||||
///
|
||||
@@ -98,7 +98,7 @@ pub trait DocSet: Send {
|
||||
if doc == target {
|
||||
SeekDangerResult::Found
|
||||
} else {
|
||||
SeekDangerResult::SeekLowerBound(self.doc())
|
||||
SeekDangerResult::SeekLowerBound(doc)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -37,7 +35,6 @@ impl SegmentMetaInventory {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc,
|
||||
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta::from(self.inventory.track(inner))
|
||||
@@ -85,15 +82,6 @@ impl SegmentMeta {
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Removes the Component::TempStore from the alive list and
|
||||
/// therefore marks the temp docstore file to be deleted by
|
||||
/// the garbage collection.
|
||||
pub fn untrack_temp_docstore(&self) {
|
||||
self.tracked
|
||||
.include_temp_doc_store
|
||||
.store(false, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.tracked
|
||||
@@ -111,20 +99,9 @@ impl SegmentMeta {
|
||||
/// is by removing all files that have been created by tantivy
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
if self
|
||||
.tracked
|
||||
.include_temp_doc_store
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
{
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
} else {
|
||||
SegmentComponent::iterator()
|
||||
.filter(|comp| *comp != &SegmentComponent::TempStore)
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
@@ -138,7 +115,6 @@ impl SegmentMeta {
|
||||
SegmentComponent::Positions => ".pos".to_string(),
|
||||
SegmentComponent::Terms => ".term".to_string(),
|
||||
SegmentComponent::Store => ".store".to_string(),
|
||||
SegmentComponent::TempStore => ".store.temp".to_string(),
|
||||
SegmentComponent::FastFields => ".fast".to_string(),
|
||||
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
|
||||
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
@@ -183,7 +159,6 @@ impl SegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
@@ -202,7 +177,6 @@ impl SegmentMeta {
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
include_temp_doc_store: Arc::new(AtomicBool::new(true)),
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
@@ -214,14 +188,6 @@ struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
pub deletes: Option<DeleteMeta>,
|
||||
/// If you want to avoid the SegmentComponent::TempStore file to be covered by
|
||||
/// garbage collection and deleted, set this to true. This is used during merge.
|
||||
#[serde(skip)]
|
||||
#[serde(default = "default_temp_store")]
|
||||
pub(crate) include_temp_doc_store: Arc<AtomicBool>,
|
||||
}
|
||||
fn default_temp_store() -> Arc<AtomicBool> {
|
||||
Arc::new(AtomicBool::new(false))
|
||||
}
|
||||
|
||||
impl InnerSegmentMeta {
|
||||
|
||||
@@ -23,8 +23,6 @@ pub enum SegmentComponent {
|
||||
/// Accessing a document from the store is relatively slow, as it
|
||||
/// requires decompressing the entire block it belongs to.
|
||||
Store,
|
||||
/// Temporary storage of the documents, before streamed to `Store`.
|
||||
TempStore,
|
||||
/// Bitset describing which document of the segment is alive.
|
||||
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
|
||||
Delete,
|
||||
@@ -33,14 +31,13 @@ pub enum SegmentComponent {
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::Postings,
|
||||
SegmentComponent::Positions,
|
||||
SegmentComponent::FastFields,
|
||||
SegmentComponent::FieldNorms,
|
||||
SegmentComponent::Terms,
|
||||
SegmentComponent::Store,
|
||||
SegmentComponent::TempStore,
|
||||
SegmentComponent::Delete,
|
||||
];
|
||||
SEGMENT_COMPONENTS.iter()
|
||||
|
||||
@@ -218,7 +218,7 @@ fn index_documents<D: Document>(
|
||||
let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
|
||||
|
||||
let meta = segment_with_max_doc.meta().clone();
|
||||
meta.untrack_temp_docstore();
|
||||
|
||||
// update segment_updater inventory to remove tempstore
|
||||
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
|
||||
segment_updater.schedule_add_segment(segment_entry).wait()?;
|
||||
|
||||
@@ -303,10 +303,10 @@ impl BlockSegmentPostings {
|
||||
}
|
||||
|
||||
pub(crate) fn load_block(&mut self) {
|
||||
let offset = self.skip_reader.byte_offset();
|
||||
if self.block_is_loaded() {
|
||||
return;
|
||||
}
|
||||
let offset = self.skip_reader.byte_offset();
|
||||
match self.skip_reader.block_info() {
|
||||
BlockInfo::BitPacked {
|
||||
doc_num_bits,
|
||||
|
||||
@@ -168,12 +168,20 @@ impl DocSet for SegmentPostings {
|
||||
self.doc()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
debug_assert!(self.doc() <= target);
|
||||
if self.doc() >= target {
|
||||
return self.doc();
|
||||
}
|
||||
|
||||
// As an optimization, if the block is already loaded, we can
|
||||
// cheaply check the next doc.
|
||||
self.cur = (self.cur + 1).min(COMPRESSION_BLOCK_SIZE - 1);
|
||||
if self.doc() >= target {
|
||||
return self.doc();
|
||||
}
|
||||
|
||||
// Delegate block-local search to BlockSegmentPostings::seek, which returns
|
||||
// the in-block index of the first doc >= target.
|
||||
self.cur = self.block_cursor.seek(target);
|
||||
|
||||
@@ -84,6 +84,14 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
docsets.sort_by_key(|docset| docset.cost());
|
||||
go_to_first_doc(&mut docsets);
|
||||
let left = docsets.remove(0);
|
||||
debug_assert!({
|
||||
let doc = left.doc();
|
||||
if doc == TERMINATED {
|
||||
true
|
||||
} else {
|
||||
docsets.iter().all(|docset| docset.doc() == doc)
|
||||
}
|
||||
});
|
||||
let right = docsets.remove(0);
|
||||
Intersection {
|
||||
left,
|
||||
@@ -112,30 +120,24 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
// Invariant:
|
||||
// - candidate is always <= to the next document in the intersection.
|
||||
// - candidate strictly increases at every occurrence of the loop.
|
||||
let mut candidate = 0;
|
||||
let mut candidate = left.doc() + 1;
|
||||
|
||||
// Termination: candidate strictly increases.
|
||||
'outer: while candidate < TERMINATED {
|
||||
// As we enter the loop, we should always have candidate < next_doc.
|
||||
|
||||
// This step always increases candidate.
|
||||
//
|
||||
// TODO: Think about which value would make sense here
|
||||
// It depends on the DocSet implementation, when a seek would outweigh an advance.
|
||||
candidate = if candidate > left.doc().wrapping_add(100) {
|
||||
left.seek(candidate)
|
||||
} else {
|
||||
left.advance()
|
||||
};
|
||||
candidate = left.seek(candidate);
|
||||
|
||||
// Left is positioned on `candidate`.
|
||||
debug_assert_eq!(left.doc(), candidate);
|
||||
|
||||
if let SeekDangerResult::SeekLowerBound(seek_lower_bound) = right.seek_danger(candidate)
|
||||
{
|
||||
// The max is technically useless but it makes the invariant
|
||||
// easier to proofread.
|
||||
debug_assert!(seek_lower_bound >= candidate);
|
||||
debug_assert!(
|
||||
seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
|
||||
"seek_lower_bound {seek_lower_bound} must be greater than candidate \
|
||||
{candidate}"
|
||||
);
|
||||
candidate = seek_lower_bound;
|
||||
continue;
|
||||
}
|
||||
@@ -148,7 +150,11 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
other.seek_danger(candidate)
|
||||
{
|
||||
// One of the scorers does not match, let's restart at the top of the loop.
|
||||
debug_assert!(seek_lower_bound >= candidate);
|
||||
debug_assert!(
|
||||
seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
|
||||
"seek_lower_bound {seek_lower_bound} must be greater than candidate \
|
||||
{candidate}"
|
||||
);
|
||||
candidate = seek_lower_bound;
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -238,9 +244,12 @@ mod tests {
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::Intersection;
|
||||
use crate::collector::Count;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::tests::test_skip_against_unoptimized;
|
||||
use crate::query::VecDocSet;
|
||||
use crate::query::{QueryParser, VecDocSet};
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
@@ -411,4 +420,29 @@ mod tests {
|
||||
assert_eq!(intersection.doc(), TERMINATED);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bug_2811_intersection_candidate_should_increase() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer
|
||||
.add_document(doc!(text_field=>"hello happy tax"))
|
||||
.unwrap();
|
||||
writer.add_document(doc!(text_field=>"hello")).unwrap();
|
||||
writer.add_document(doc!(text_field=>"hello")).unwrap();
|
||||
writer.add_document(doc!(text_field=>"happy tax")).unwrap();
|
||||
|
||||
writer.commit().unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, Vec::new());
|
||||
let query = query_parser
|
||||
.parse_query(r#"+text:hello +text:"happy tax""#)
|
||||
.unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let c = searcher.search(&*query, &Count).unwrap();
|
||||
assert_eq!(c, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -531,7 +531,12 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
|
||||
}
|
||||
|
||||
fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
|
||||
debug_assert!(target >= self.doc());
|
||||
debug_assert!(
|
||||
target >= self.doc(),
|
||||
"target ({}) should be greater than or equal to doc ({})",
|
||||
target,
|
||||
self.doc()
|
||||
);
|
||||
let seek_res = self.intersection_docset.seek_danger(target);
|
||||
if seek_res != SeekDangerResult::Found {
|
||||
return seek_res;
|
||||
|
||||
@@ -105,6 +105,7 @@ impl DocSet for TermScorer {
|
||||
|
||||
#[inline]
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
debug_assert!(target >= self.doc());
|
||||
self.postings.seek(target)
|
||||
}
|
||||
|
||||
|
||||
@@ -124,7 +124,6 @@ impl SegmentSpaceUsage {
|
||||
FieldNorms => PerField(self.fieldnorms().clone()),
|
||||
Terms => PerField(self.termdict().clone()),
|
||||
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
|
||||
SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
|
||||
Delete => Basic(self.deletes()),
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user