mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
added tool to detect position corruption
This commit is contained in:
committed by
Andre-Philippe Paquet
parent
67f53289ef
commit
6c485bfd8a
@@ -12,6 +12,11 @@ readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
|
||||
|
||||
[[bin]]
|
||||
name = "debug_position"
|
||||
path = "src/debug_position.rs"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.13"
|
||||
byteorder = "1.4.3"
|
||||
|
||||
53
src/debug_position.rs
Normal file
53
src/debug_position.rs
Normal file
@@ -0,0 +1,53 @@
|
||||
use std::panic;
|
||||
|
||||
use tantivy;
|
||||
use tantivy::DocSet;
|
||||
use tantivy::Postings;
|
||||
use tantivy::Searcher;
|
||||
use tantivy::TERMINATED;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::IndexRecordOption;
|
||||
|
||||
fn test_field(searcher: &Searcher, field: Field) -> tantivy::Result<()> {
|
||||
for segment_reader in searcher.segment_readers() {
|
||||
println!("\n\n====\nsegment {:?}", segment_reader.segment_id());
|
||||
println!("maxdoc {} del {} ", segment_reader.max_doc(), segment_reader.num_deleted_docs());
|
||||
let inv_idx = segment_reader.inverted_index(field)?;
|
||||
let termdict = inv_idx.terms();
|
||||
println!("num terms {}", termdict.num_terms());
|
||||
let mut terms = termdict.stream()?;
|
||||
while terms.advance() {
|
||||
let term_info = terms.value();
|
||||
let mut postings = inv_idx.read_postings_from_terminfo(term_info, tantivy::schema::IndexRecordOption::WithFreqsAndPositions)?;
|
||||
let mut seen_doc = 0;
|
||||
while postings.doc() != TERMINATED {
|
||||
let mut postings_clone= postings.clone();
|
||||
// println!("termord {} seen_doc {} termpositions {:?} docfreq {}", terms.term_ord(), seen_doc, term_info.positions_range, term_info.doc_freq);
|
||||
let mut positions = Vec::new();
|
||||
postings_clone.positions(&mut positions);
|
||||
seen_doc += 1;
|
||||
postings.advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
let index = tantivy::Index::open_in_dir(".")?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let schema = index.schema();
|
||||
for (field, field_entry) in schema.fields() {
|
||||
let field_type = field_entry.field_type();
|
||||
let has_position = field_type.get_index_record_option()
|
||||
.map(|opt| opt == IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap_or(false);
|
||||
if !has_position {
|
||||
continue;
|
||||
}
|
||||
test_field(&*searcher, field)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -4,11 +4,12 @@ use crate::common::HasLen;
|
||||
use crate::directory::OwnedBytes;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::{io, ops::Deref};
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
|
||||
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
|
||||
|
||||
/// Objects that represents files sections in tantivy.
|
||||
///
|
||||
@@ -40,7 +41,7 @@ impl<T: Deref<Target = [u8]>> HasLen for T {
|
||||
|
||||
impl<B> From<B> for FileSlice
|
||||
where
|
||||
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
|
||||
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync + UnwindSafe + RefUnwindSafe,
|
||||
{
|
||||
fn from(bytes: B) -> FileSlice {
|
||||
FileSlice::new(Box::new(OwnedBytes::new(bytes)))
|
||||
|
||||
@@ -20,6 +20,7 @@ use std::fs::OpenOptions;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
@@ -314,7 +315,7 @@ impl TerminatingWrite for SafeFileWriter {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync>);
|
||||
struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync + RefUnwindSafe + UnwindSafe>);
|
||||
|
||||
impl Deref for MmapArc {
|
||||
type Target = [u8];
|
||||
|
||||
Reference in New Issue
Block a user