From 63bc390b02e582e83eb07d5e2162c15a9aae48dc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 14 Oct 2022 12:01:22 +0800 Subject: [PATCH] Fix missing fieldnorm indexing Fixes broken search (no results) with BM25 for u64, i64, f64, bool, bytes and date after deletion and merge. There were no fieldnorms recorded for those field. After merge InvertedIndexReader::total_num_tokens returns 0 (Sum over the fieldnorms is 0). BM25 does not work when total_num_tokens is 0. Fixes #1617 --- src/indexer/index_writer.rs | 194 ++++++++++++++++++++++++++++++++-- src/indexer/segment_writer.rs | 32 ++++++ 2 files changed, 218 insertions(+), 8 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index e7cd65574..7a0d9cb98 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -821,7 +821,9 @@ mod tests { TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::store::DOCSTORE_CACHE_CAPACITY; - use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term}; + use crate::{ + DateTime, DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term, + }; const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad \ @@ -1601,6 +1603,9 @@ mod tests { IpAddrOptions::default().set_fast(Cardinality::MultiValues), ); let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED); + let i64_field = schema_builder.add_i64_field("i64", INDEXED); + let f64_field = schema_builder.add_f64_field("f64", INDEXED); + let date_field = schema_builder.add_date_field("date", INDEXED); let bytes_field = schema_builder.add_bytes_field("bytes", FAST | INDEXED | STORED); let bool_field = schema_builder.add_bool_field("bool", FAST | INDEXED | STORED); let text_field = schema_builder.add_text_field( @@ -1664,6 +1669,9 @@ mod tests { multi_numbers=> id, multi_numbers => id, bool_field => (id % 2u64) != 0, + i64_field => id as i64, + f64_field => id as f64, + date_field => DateTime::from_timestamp_secs(id as i64), multi_bools => (id % 2u64) != 0, multi_bools => (id % 2u64) == 0, text_field => id.to_string(), @@ -1679,6 +1687,9 @@ mod tests { multi_numbers=> id, multi_numbers => id, bool_field => (id % 2u64) != 0, + i64_field => id as i64, + f64_field => id as f64, + date_field => DateTime::from_timestamp_secs(id as i64), multi_bools => (id % 2u64) != 0, multi_bools => (id % 2u64) == 0, text_field => id.to_string(), @@ -1889,10 +1900,8 @@ mod tests { } } // test search - let my_text_field = index.schema().get_field("text_field").unwrap(); - - let do_search = |term: &str| { - let query = QueryParser::for_index(&index, vec![my_text_field]) + let do_search = |term: &str, field| { + let query = QueryParser::for_index(&index, vec![field]) .parse_query(term) .unwrap(); let top_docs: Vec<(f32, DocAddress)> = @@ -1901,11 +1910,49 @@ mod tests { top_docs.iter().map(|el| el.1).collect::>() }; + let do_search2 = |term: Term| { + let query = TermQuery::new(term, IndexRecordOption::Basic); + let top_docs: Vec<(f32, DocAddress)> = + searcher.search(&query, &TopDocs::with_limit(1000)).unwrap(); + + top_docs.iter().map(|el| el.1).collect::>() + }; + for (existing_id, count) in expected_ids_and_num_occurrences { - assert_eq!(do_search(&existing_id.to_string()).len() as u64, count); + let assert_field = |field| do_search(&existing_id.to_string(), field).len() as u64; + assert_eq!(assert_field(text_field), count); + assert_eq!(assert_field(i64_field), count); + assert_eq!(assert_field(f64_field), count); + assert_eq!(assert_field(id_field), count); + + // Test bytes + let term = Term::from_field_bytes(bytes_field, existing_id.to_le_bytes().as_slice()); + assert_eq!(do_search2(term).len() as u64, count); + + // Test date + let term = Term::from_field_date( + date_field, + DateTime::from_timestamp_secs(existing_id as i64), + ); + assert_eq!(do_search2(term).len() as u64, count); } - for existing_id in deleted_ids { - assert_eq!(do_search(&existing_id.to_string()).len(), 0); + for deleted_id in deleted_ids { + let assert_field = |field| { + assert_eq!(do_search(&deleted_id.to_string(), field).len() as u64, 0); + }; + assert_field(text_field); + assert_field(f64_field); + assert_field(i64_field); + assert_field(id_field); + + // Test bytes + let term = Term::from_field_bytes(bytes_field, deleted_id.to_le_bytes().as_slice()); + assert_eq!(do_search2(term).len() as u64, 0); + + // Test date + let term = + Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64)); + assert_eq!(do_search2(term).len() as u64, 0); } // test facets for segment_reader in searcher.segment_readers().iter() { @@ -2050,4 +2097,135 @@ mod tests { index_writer.commit()?; Ok(()) } + + #[test] + fn test_bug_1617_3() { + assert!(test_operation_strategy( + &[ + IndexingOp::DeleteDoc { id: 0 }, + IndexingOp::AddDoc { id: 6 }, + IndexingOp::DeleteDocQuery { id: 11 }, + IndexingOp::Commit, + IndexingOp::Merge, + IndexingOp::Commit, + IndexingOp::Commit + ], + false, + false + ) + .is_ok()); + } + + #[test] + fn test_bug_1617_2() { + assert!(test_operation_strategy( + &[ + IndexingOp::AddDoc { id: 13 }, + IndexingOp::DeleteDoc { id: 13 }, + IndexingOp::Commit, + IndexingOp::AddDoc { id: 30 }, + IndexingOp::Commit, + IndexingOp::Merge, + ], + false, + true + ) + .is_ok()); + } + + #[test] + fn test_bug_1617() -> crate::Result<()> { + let mut schema_builder = schema::Schema::builder(); + let id_field = schema_builder.add_u64_field("id", INDEXED); + + let schema = schema_builder.build(); + let index = Index::builder().schema(schema).create_in_ram()?; + let mut index_writer = index.writer_for_tests()?; + index_writer.set_merge_policy(Box::new(NoMergePolicy)); + + let existing_id = 16u64; + let deleted_id = 13u64; + index_writer.add_document(doc!( + id_field=>existing_id, + ))?; + index_writer.add_document(doc!( + id_field=>deleted_id, + ))?; + index_writer.delete_term(Term::from_field_u64(id_field, deleted_id)); + index_writer.commit()?; + + // Merge + { + assert!(index_writer.wait_merging_threads().is_ok()); + let mut index_writer = index.writer_for_tests()?; + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + index_writer.merge(&segment_ids).wait().unwrap(); + assert!(index_writer.wait_merging_threads().is_ok()); + } + let searcher = index.reader()?.searcher(); + + let query = TermQuery::new( + Term::from_field_u64(id_field, existing_id), + IndexRecordOption::Basic, + ); + let top_docs: Vec<(f32, DocAddress)> = + searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + + assert_eq!(top_docs.len(), 1); // Fails + + Ok(()) + } + + #[test] + fn test_bug_1618() -> crate::Result<()> { + let mut schema_builder = schema::Schema::builder(); + let id_field = schema_builder.add_i64_field("id", INDEXED); + + let schema = schema_builder.build(); + let index = Index::builder().schema(schema).create_in_ram()?; + let mut index_writer = index.writer_for_tests()?; + index_writer.set_merge_policy(Box::new(NoMergePolicy)); + + index_writer.add_document(doc!( + id_field=>10i64, + ))?; + index_writer.add_document(doc!( + id_field=>30i64, + ))?; + index_writer.commit()?; + + // Merge + { + assert!(index_writer.wait_merging_threads().is_ok()); + let mut index_writer = index.writer_for_tests()?; + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + index_writer.merge(&segment_ids).wait().unwrap(); + assert!(index_writer.wait_merging_threads().is_ok()); + } + let searcher = index.reader()?.searcher(); + + let query = TermQuery::new( + Term::from_field_i64(id_field, 10i64), + IndexRecordOption::Basic, + ); + let top_docs: Vec<(f32, DocAddress)> = + searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + + assert_eq!(top_docs.len(), 1); // Fails + + let query = TermQuery::new( + Term::from_field_i64(id_field, 30i64), + IndexRecordOption::Basic, + ); + let top_docs: Vec<(f32, DocAddress)> = + searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + + assert_eq!(top_docs.len(), 1); // Fails + + Ok(()) + } } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 812aa78b6..bcbb3271b 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -176,10 +176,12 @@ impl SegmentWriter { if !field_entry.is_indexed() { continue; } + let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx); let postings_writer: &mut dyn PostingsWriter = self.per_field_postings_writers.get_for_field_mut(field); term_buffer.set_field(field_entry.field_type().value_type(), field); + match *field_entry.field_type() { FieldType::Facet(_) => { for value in values { @@ -240,46 +242,76 @@ impl SegmentWriter { } } FieldType::U64(_) => { + let mut num_vals = 0; for value in values { + num_vals += 1; let u64_val = value.as_u64().ok_or_else(make_schema_error)?; term_buffer.set_u64(u64_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } } FieldType::Date(_) => { + let mut num_vals = 0; for value in values { + num_vals += 1; let date_val = value.as_date().ok_or_else(make_schema_error)?; term_buffer.set_u64(date_val.truncate(DatePrecision::Seconds).to_u64()); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } } FieldType::I64(_) => { + let mut num_vals = 0; for value in values { + num_vals += 1; let i64_val = value.as_i64().ok_or_else(make_schema_error)?; term_buffer.set_i64(i64_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } } FieldType::F64(_) => { + let mut num_vals = 0; for value in values { + num_vals += 1; let f64_val = value.as_f64().ok_or_else(make_schema_error)?; term_buffer.set_f64(f64_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } } FieldType::Bool(_) => { + let mut num_vals = 0; for value in values { + num_vals += 1; let bool_val = value.as_bool().ok_or_else(make_schema_error)?; term_buffer.set_bool(bool_val); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } } FieldType::Bytes(_) => { + let mut num_vals = 0; for value in values { + num_vals += 1; let bytes = value.as_bytes().ok_or_else(make_schema_error)?; term_buffer.set_bytes(bytes); postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } } FieldType::JsonObject(_) => { let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];