mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
Compare commits
3 Commits
0.15.3
...
debug-posi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1a91973ab0 | ||
|
|
8a7ca64b16 | ||
|
|
6c485bfd8a |
@@ -12,6 +12,11 @@ readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
|
||||
|
||||
[[bin]]
|
||||
name = "debug_position"
|
||||
path = "src/debug_position.rs"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.13"
|
||||
byteorder = "1.4.3"
|
||||
|
||||
77
src/debug_position.rs
Normal file
77
src/debug_position.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use std::panic;
|
||||
|
||||
use futures::executor::block_on;
|
||||
use tantivy;
|
||||
use tantivy::DocSet;
|
||||
use tantivy::Postings;
|
||||
use tantivy::Searcher;
|
||||
use tantivy::TERMINATED;
|
||||
use tantivy::merge_policy;
|
||||
use tantivy::merge_policy::DefaultMergePolicy;
|
||||
use tantivy::merge_policy::MergePolicy;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::IndexRecordOption;
|
||||
|
||||
fn test_field(searcher: &Searcher, field: Field) -> tantivy::Result<()> {
|
||||
for segment_reader in searcher.segment_readers() {
|
||||
println!("\n\n====\nsegment {:?}", segment_reader.segment_id());
|
||||
println!("maxdoc {} del {} ", segment_reader.max_doc(), segment_reader.num_deleted_docs());
|
||||
let inv_idx = segment_reader.inverted_index(field)?;
|
||||
let termdict = inv_idx.terms();
|
||||
println!("num terms {}", termdict.num_terms());
|
||||
let mut terms = termdict.stream()?;
|
||||
while terms.advance() {
|
||||
let term_info = terms.value();
|
||||
let mut postings = inv_idx.read_postings_from_terminfo(term_info, tantivy::schema::IndexRecordOption::WithFreqsAndPositions)?;
|
||||
let mut seen_doc = 0;
|
||||
while postings.doc() != TERMINATED {
|
||||
let mut postings_clone= postings.clone();
|
||||
// println!("termord {} seen_doc {} termpositions {:?} docfreq {}", terms.term_ord(), seen_doc, term_info.positions_range, term_info.doc_freq);
|
||||
let mut positions = Vec::new();
|
||||
postings_clone.positions(&mut positions);
|
||||
seen_doc += 1;
|
||||
postings.advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
let index = tantivy::Index::open_in_dir(".")?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let schema = index.schema();
|
||||
for (field, field_entry) in schema.fields() {
|
||||
let field_type = field_entry.field_type();
|
||||
let has_position = field_type.get_index_record_option()
|
||||
.map(|opt| opt == IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap_or(false);
|
||||
if !has_position {
|
||||
continue;
|
||||
}
|
||||
test_field(&*searcher, field)?;
|
||||
}
|
||||
|
||||
|
||||
println!("GC");
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000)?;
|
||||
block_on(index_writer.garbage_collect_files())?;
|
||||
|
||||
print!("----- validdating checksum");
|
||||
index.validate_checksum()?;
|
||||
|
||||
print!("----- success");
|
||||
|
||||
let default_merge_policy = DefaultMergePolicy::default();
|
||||
let segment_metas = index.searchable_segment_metas()?;
|
||||
let merge_candidates = default_merge_policy.compute_merge_candidates(&segment_metas);
|
||||
println!("{:?}", merge_candidates);
|
||||
for merge_candidate in merge_candidates {
|
||||
println!("merge_candidate {:?}", merge_candidate);
|
||||
let future = index_writer.merge(&merge_candidate.0[..]);
|
||||
let seg = block_on(future)?;
|
||||
println!("seg {:?} ", seg);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -4,11 +4,12 @@ use crate::common::HasLen;
|
||||
use crate::directory::OwnedBytes;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::{io, ops::Deref};
|
||||
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
|
||||
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
|
||||
|
||||
/// Objects that represent file sections in tantivy.
|
||||
///
|
||||
@@ -40,7 +41,7 @@ impl<T: Deref<Target = [u8]>> HasLen for T {
|
||||
|
||||
impl<B> From<B> for FileSlice
|
||||
where
|
||||
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
|
||||
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync + UnwindSafe + RefUnwindSafe,
|
||||
{
|
||||
fn from(bytes: B) -> FileSlice {
|
||||
FileSlice::new(Box::new(OwnedBytes::new(bytes)))
|
||||
|
||||
@@ -20,6 +20,7 @@ use std::fs::OpenOptions;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
@@ -314,7 +315,7 @@ impl TerminatingWrite for SafeFileWriter {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync>);
|
||||
struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync + RefUnwindSafe + UnwindSafe>);
|
||||
|
||||
impl Deref for MmapArc {
|
||||
type Target = [u8];
|
||||
|
||||
@@ -927,14 +927,14 @@ impl IndexMerger {
|
||||
// I think this is not strictly necessary, it would be possible to
|
||||
// avoid the loading into a vec via some form of kmerge, but then the merge
|
||||
// logic would deviate much more from the stacking case (unsorted index)
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
if doc_id_mapping.is_some() {
|
||||
doc_id_and_positions.push((
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
positions_buffer.to_vec(),
|
||||
delta_positions.to_vec(),
|
||||
));
|
||||
} else {
|
||||
let delta_positions = delta_computer.compute_delta(&positions_buffer);
|
||||
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user