fix delta position in merge and index sorting (#1132)

fixes #1125
This commit is contained in:
PSeitz
2021-08-03 10:06:36 +01:00
committed by GitHub
parent 85d23e8e3b
commit 4a320fd1ff
2 changed files with 95 additions and 5 deletions

View File

@@ -958,12 +958,13 @@ impl IndexMerger {
}
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
field_serializer.write_doc(*doc_id, *term_freq, positions);
let delta_positions = delta_computer.compute_delta(&positions);
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
}
doc_id_and_positions.clear();
}
// closing the term.
field_serializer.close_term()?;
}

View File

@@ -1,6 +1,7 @@
#[cfg(test)]
mod tests {
use crate::fastfield::FastFieldReader;
use crate::schema::IndexRecordOption;
use crate::{
collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},
@@ -16,7 +17,7 @@ mod tests {
schema::{self, BytesOptions},
DocAddress,
};
use crate::{IndexSettings, Term};
use crate::{DocSet, IndexSettings, Postings, Term};
use futures::executor::block_on;
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
@@ -106,7 +107,7 @@ mod tests {
);
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme"));
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64),
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
);
assert!(index_writer.commit().is_ok());
@@ -243,6 +244,24 @@ mod tests {
assert_eq!(do_search("biggest"), vec![0]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
// access doc store
{
let blubber_pos = if force_disjunct_segment_sort_values {
@@ -260,6 +279,58 @@ mod tests {
}
}
/// Checks search results and term positions on an index built by
/// `create_test_index` with default `IndexSettings`.
///
/// The test asserts that the index ends up with exactly one segment, that
/// term queries on `text_field` return the expected doc ids, and that the
/// positions stored for the term `"text"` are `[1]` for the first matching
/// document and `[1, 3]` for the next one.
#[test]
fn test_merge_unsorted_index() {
    // NOTE(review): the second argument is presumably
    // `force_disjunct_segment_sort_values`; confirm against `create_test_index`.
    let index = create_test_index(Some(IndexSettings::default()), false);

    // A single reader/searcher pair is enough for both the segment-level and
    // the query-level checks below.
    let reader = index.reader().unwrap();
    let searcher = reader.searcher();
    assert_eq!(searcher.segment_readers().len(), 1);
    let segment_reader = searcher.segment_readers().last().unwrap();

    let my_text_field = index.schema().get_field("text_field").unwrap();

    // search: each term must resolve to the expected doc id
    {
        let do_search = |term: &str| {
            let query = QueryParser::for_index(&index, vec![my_text_field])
                .parse_query(term)
                .unwrap();
            let top_docs: Vec<(f32, DocAddress)> =
                searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
            top_docs
                .iter()
                .map(|(_score, doc_address)| doc_address.doc_id)
                .collect::<Vec<_>>()
        };

        assert_eq!(do_search("some"), vec![1]);
        assert_eq!(do_search("blubber"), vec![3]);
        assert_eq!(do_search("biggest"), vec![4]);
    }

    // postings file: positions of "text" in the two documents containing it
    {
        let term_a = Term::from_field_text(my_text_field, "text");
        let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
        let mut postings = inverted_index
            .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
            .unwrap()
            .unwrap();

        // The same buffer is reused for both documents; the assertions rely on
        // `positions` replacing its contents rather than appending to it.
        let mut output = vec![];
        postings.positions(&mut output);
        assert_eq!(output, vec![1]);

        postings.advance();
        postings.positions(&mut output);
        assert_eq!(output, vec![1, 3]);
    }
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
@@ -314,7 +385,7 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
@@ -339,6 +410,24 @@ mod tests {
assert_eq!(do_search("biggest"), vec![5]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();