mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 10:02:55 +00:00

Compare commits: 0.15.3...debug-posi (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 1a91973ab0 | |
| | 8a7ca64b16 | |
| | 6c485bfd8a | |
Cargo.toml

@@ -12,6 +12,11 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2018"
 
+
+[[bin]]
+name = "debug_position"
+path = "src/debug_position.rs"
+
 [dependencies]
 base64 = "0.13"
 byteorder = "1.4.3"
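With this `[[bin]]` target in place, the diagnostic can presumably be run from inside an index directory with `cargo run --bin debug_position`, since the binary opens the index in the current working directory (see `src/debug_position.rs` below).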
src/debug_position.rs (new file, 77 lines)

@@ -0,0 +1,77 @@
+use std::panic;
+
+use futures::executor::block_on;
+use tantivy;
+use tantivy::DocSet;
+use tantivy::Postings;
+use tantivy::Searcher;
+use tantivy::TERMINATED;
+use tantivy::merge_policy;
+use tantivy::merge_policy::DefaultMergePolicy;
+use tantivy::merge_policy::MergePolicy;
+use tantivy::schema::Field;
+use tantivy::schema::IndexRecordOption;
+
+fn test_field(searcher: &Searcher, field: Field) -> tantivy::Result<()> {
+    for segment_reader in searcher.segment_readers() {
+        println!("\n\n====\nsegment {:?}", segment_reader.segment_id());
+        println!("maxdoc {} del {} ", segment_reader.max_doc(), segment_reader.num_deleted_docs());
+        let inv_idx = segment_reader.inverted_index(field)?;
+        let termdict = inv_idx.terms();
+        println!("num terms {}", termdict.num_terms());
+        let mut terms = termdict.stream()?;
+        while terms.advance() {
+            let term_info = terms.value();
+            let mut postings = inv_idx.read_postings_from_terminfo(term_info, tantivy::schema::IndexRecordOption::WithFreqsAndPositions)?;
+            let mut seen_doc = 0;
+            while postings.doc() != TERMINATED {
+                let mut postings_clone = postings.clone();
+                // println!("termord {} seen_doc {} termpositions {:?} docfreq {}", terms.term_ord(), seen_doc, term_info.positions_range, term_info.doc_freq);
+                let mut positions = Vec::new();
+                postings_clone.positions(&mut positions);
+                seen_doc += 1;
+                postings.advance();
+            }
+        }
+    }
+    Ok(())
+}
+
+fn main() -> tantivy::Result<()> {
+    let index = tantivy::Index::open_in_dir(".")?;
+    let reader = index.reader()?;
+    let searcher = reader.searcher();
+    let schema = index.schema();
+    for (field, field_entry) in schema.fields() {
+        let field_type = field_entry.field_type();
+        let has_position = field_type.get_index_record_option()
+            .map(|opt| opt == IndexRecordOption::WithFreqsAndPositions)
+            .unwrap_or(false);
+        if !has_position {
+            continue;
+        }
+        test_field(&*searcher, field)?;
+    }
+
+
+    println!("GC");
+    let mut index_writer = index.writer_with_num_threads(1, 100_000_000)?;
+    block_on(index_writer.garbage_collect_files())?;
+
+    print!("----- validating checksum");
+    index.validate_checksum()?;
+
+    print!("----- success");
+
+    let default_merge_policy = DefaultMergePolicy::default();
+    let segment_metas = index.searchable_segment_metas()?;
+    let merge_candidates = default_merge_policy.compute_merge_candidates(&segment_metas);
+    println!("{:?}", merge_candidates);
+    for merge_candidate in merge_candidates {
+        println!("merge_candidate {:?}", merge_candidate);
+        let future = index_writer.merge(&merge_candidate.0[..]);
+        let seg = block_on(future)?;
+        println!("seg {:?} ", seg);
+    }
+    Ok(())
+}
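The binary above requires an existing index in the working directory. For reference, a minimal self-contained sketch of the same postings walk against a throwaway in-memory index, assuming the tantivy 0.15-era API that the binary itself uses; the `body` field name and the sample document are hypothetical:

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};

fn main() -> tantivy::Result<()> {
    // Build a tiny positions-enabled index entirely in memory.
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 50_000_000)?;
    writer.add_document(doc!(body => "hello world hello"));
    writer.commit()?;

    // Same traversal as debug_position: every term of every segment.
    let searcher = index.reader()?.searcher();
    for segment_reader in searcher.segment_readers() {
        let inv_idx = segment_reader.inverted_index(body)?;
        let mut terms = inv_idx.terms().stream()?;
        while terms.advance() {
            let mut postings = inv_idx.read_postings_from_terminfo(
                terms.value(),
                tantivy::schema::IndexRecordOption::WithFreqsAndPositions,
            )?;
            while postings.doc() != TERMINATED {
                let mut positions = Vec::new();
                postings.positions(&mut positions);
                println!("doc {} positions {:?}", postings.doc(), positions);
                postings.advance();
            }
        }
    }
    Ok(())
}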
src/directory/file_slice.rs

@@ -4,11 +4,12 @@ use crate::common::HasLen;
 use crate::directory::OwnedBytes;
 use std::fmt;
 use std::ops::Range;
+use std::panic::{RefUnwindSafe, UnwindSafe};
 use std::sync::{Arc, Weak};
 use std::{io, ops::Deref};
 
-pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
-pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
+pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
+pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
 
 /// Objects that represents files sections in tantivy.
 ///

@@ -40,7 +41,7 @@ impl<T: Deref<Target = [u8]>> HasLen for T {
 
 impl<B> From<B> for FileSlice
 where
-    B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
+    B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync + UnwindSafe + RefUnwindSafe,
 {
     fn from(bytes: B) -> FileSlice {
         FileSlice::new(Box::new(OwnedBytes::new(bytes)))
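These widened bounds are what let `ArcBytes`-backed values cross a `std::panic::catch_unwind` boundary. A minimal sketch of why the extra marker traits matter, using a `Vec<u8>` as a stand-in for the real mmap-backed buffer (the alias here just mirrors the new definition above):

use std::ops::Deref;
use std::panic::{self, RefUnwindSafe, UnwindSafe};
use std::sync::Arc;

// Mirrors the widened ArcBytes alias above.
type ArcBytes =
    Arc<dyn Deref<Target = [u8]> + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;

fn main() {
    let bytes: ArcBytes = Arc::new(vec![1u8, 2, 3]);
    // catch_unwind requires its closure to be UnwindSafe. Arc<T> is only
    // UnwindSafe when T: RefUnwindSafe, so without the extra bounds on the
    // trait object this closure would be rejected by the compiler.
    let result = panic::catch_unwind(move || bytes.len());
    assert_eq!(result.ok(), Some(3));
}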
src/directory/mmap_directory.rs

@@ -20,6 +20,7 @@ use std::fs::OpenOptions;
 use std::fs::{self, File};
 use std::io::{self, Seek, SeekFrom};
 use std::io::{BufWriter, Read, Write};
+use std::panic::{RefUnwindSafe, UnwindSafe};
 use std::path::{Path, PathBuf};
 use std::result;
 use std::sync::Arc;

@@ -314,7 +315,7 @@ impl TerminatingWrite for SafeFileWriter {
 }
 
 #[derive(Clone)]
-struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync>);
+struct MmapArc(Arc<dyn Deref<Target = [u8]> + Send + Sync + RefUnwindSafe + UnwindSafe>);
 
 impl Deref for MmapArc {
     type Target = [u8];
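The same pair of marker traits shows up again here: `MmapArc` is the concrete trait object the mmap directory hands out as `ArcBytes`, so its bounds presumably have to be widened in lockstep with the alias in file_slice.rs for the two definitions to stay compatible.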
src/indexer/merger.rs

@@ -927,14 +927,14 @@ impl IndexMerger {
                     // I think this is not strictly necessary, it would be possible to
                     // avoid the loading into a vec via some form of kmerge, but then the merge
                     // logic would deviate much more from the stacking case (unsorted index)
+                    let delta_positions = delta_computer.compute_delta(&positions_buffer);
                     if doc_id_mapping.is_some() {
                         doc_id_and_positions.push((
                             remapped_doc_id,
                             term_freq,
-                            positions_buffer.to_vec(),
+                            delta_positions.to_vec(),
                         ));
                     } else {
-                        let delta_positions = delta_computer.compute_delta(&positions_buffer);
                         field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
                     }
                 }
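This is the substantive fix on the branch: the sorted-index path (`doc_id_mapping.is_some()`) buffered raw absolute positions while the direct path wrote delta-encoded positions, so the two paths disagreed about the representation handed to the serializer. Hoisting the `compute_delta` call makes both paths produce deltas. A toy sketch of the delta encoding involved (the function and values are illustrative, not tantivy's actual `DeltaComputer`):

// Delta-encode a sorted list of absolute token positions into gaps.
fn compute_delta(positions: &[u32]) -> Vec<u32> {
    let mut prev = 0u32;
    positions
        .iter()
        .map(|&pos| {
            let delta = pos - prev;
            prev = pos;
            delta
        })
        .collect()
}

fn main() {
    // Hypothetical positions of one term inside one document.
    assert_eq!(compute_delta(&[3, 7, 12]), vec![3, 4, 5]);
    // Pushing the absolute values [3, 7, 12] where deltas are expected is
    // exactly the mismatch the hunk above removes.
}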