added tool to detect position corruption

This commit is contained in:
Paul Masurel
2021-08-02 10:38:46 +09:00
committed by Andre-Philippe Paquet
parent 67f53289ef
commit 6c485bfd8a
4 changed files with 64 additions and 4 deletions

View File

@@ -12,6 +12,11 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"
[[bin]]
name = "debug_position"
path = "src/debug_position.rs"
[dependencies]
base64 = "0.13"
byteorder = "1.4.3"

53
src/debug_position.rs Normal file
View File

@@ -0,0 +1,53 @@
use std::panic;
use tantivy;
use tantivy::DocSet;
use tantivy::Postings;
use tantivy::Searcher;
use tantivy::TERMINATED;
use tantivy::schema::Field;
use tantivy::schema::IndexRecordOption;
fn test_field(searcher: &Searcher, field: Field) -> tantivy::Result<()> {
for segment_reader in searcher.segment_readers() {
println!("\n\n====\nsegment {:?}", segment_reader.segment_id());
println!("maxdoc {} del {} ", segment_reader.max_doc(), segment_reader.num_deleted_docs());
let inv_idx = segment_reader.inverted_index(field)?;
let termdict = inv_idx.terms();
println!("num terms {}", termdict.num_terms());
let mut terms = termdict.stream()?;
while terms.advance() {
let term_info = terms.value();
let mut postings = inv_idx.read_postings_from_terminfo(term_info, tantivy::schema::IndexRecordOption::WithFreqsAndPositions)?;
let mut seen_doc = 0;
while postings.doc() != TERMINATED {
let mut postings_clone= postings.clone();
// println!("termord {} seen_doc {} termpositions {:?} docfreq {}", terms.term_ord(), seen_doc, term_info.positions_range, term_info.doc_freq);
let mut positions = Vec::new();
postings_clone.positions(&mut positions);
seen_doc += 1;
postings.advance();
}
}
}
Ok(())
}
/// Entry point of the `debug_position` tool.
///
/// Opens the tantivy index located in the directory given as the first
/// command-line argument (defaulting to the current directory, `.`), then runs
/// `test_field` on every schema field whose index record option is
/// `WithFreqsAndPositions`. Other fields are skipped.
///
/// # Errors
///
/// Returns the first error raised while opening the index, creating the reader
/// or checking a field.
fn main() -> tantivy::Result<()> {
    // `args_os` is used so that a non-UTF-8 path does not make the tool panic.
    let index_dir = std::env::args_os()
        .nth(1)
        .unwrap_or_else(|| std::ffi::OsString::from("."));
    let index = tantivy::Index::open_in_dir(&index_dir)?;
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let schema = index.schema();
    for (field, field_entry) in schema.fields() {
        let record_option = field_entry.field_type().get_index_record_option();
        // Only fields indexed with positions have position data to check.
        if record_option != Some(IndexRecordOption::WithFreqsAndPositions) {
            continue;
        }
        test_field(&searcher, field)?;
    }
    Ok(())
}

View File

@@ -4,11 +4,12 @@ use crate::common::HasLen;
use crate::directory::OwnedBytes;
use std::fmt;
use std::ops::Range;
use std::panic::{RefUnwindSafe, UnwindSafe};
use std::sync::{Arc, Weak};
use std::{io, ops::Deref};
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
/// Objects that represents files sections in tantivy.
///
@@ -40,7 +41,7 @@ impl<T: Deref<Target = [u8]>> HasLen for T {
impl<B> From<B> for FileSlice
where
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync + UnwindSafe + RefUnwindSafe,
{
fn from(bytes: B) -> FileSlice {
FileSlice::new(Box::new(OwnedBytes::new(bytes)))

View File

@@ -20,6 +20,7 @@ use std::fs::OpenOptions;
use std::fs::{self, File};
use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::panic::{RefUnwindSafe, UnwindSafe};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::Arc;
@@ -314,7 +315,7 @@ impl TerminatingWrite for SafeFileWriter {
}
#[derive(Clone)]
struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync>);
struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync + RefUnwindSafe + UnwindSafe>);
impl Deref for MmapArc {
type Target = [u8];