Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-02 23:32:54 +00:00

Compare commits (2 Commits): commit-cha ... limit-rand

| Author | SHA1 | Date |
|---|---|---|
|  | 9eb87e91cc |  |
|  | 36f43da4d8 |  |

doc/src/index-format.md (new file, 50 lines added)
@@ -0,0 +1,50 @@
# Managed files
+----------+-----------+-------------------+
| content  | footer    | footer_len: u32   |
+----------+-----------+-------------------+

# Term Dictionary (Composite File)

+---------+---------------------------+------------------------+
| fst     | term_info_store           | footer_len: u64        |
+---------+---------------------------+------------------------+

During a merge the term info store needs to fit in memory.
It has a cost of n bytes per term.

# term_info_store
+-------------------+---------------------------+------------------------+
| len_block_meta    | block_meta                | term_infos             |
+-------------------+---------------------------+------------------------+

# inverted_index
+------------------------+---------------------------+------------------------+
| total_num_tokens: u64  | posting_lists..           | term_infos             |
+------------------------+---------------------------+------------------------+

# postings lists
+------------------------+---------------------------+------------------------+
|
+

# composite file
+----------------+-----+----------------+----------------------+----------------+
| field file 1   | ... | field file n   |composite file footer | footer len: u32|
+----------------+-----+----------------+----------------------+----------------+

# composite file footer

+-----------------+---------------------------------------+
|num fields: vint | (file_addr, offset_delta: vint) []... |
+-----------------+---------------------------------------+

# FileAddr
+--------------+--------------+
| field: u32   | idx: VInt    |
+--------------+--------------+

# Posting lists
+-----------------------------------------+
| skip_reader
+-----------------------------------------+
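The diagrams above describe raw byte layouts rather than an API. As a rough illustration of the first layout, the sketch below splits a managed file into its content and footer using the trailing `footer_len: u32`; the function name, the little-endian byte order, and the error handling are assumptions made for this example, not tantivy's actual code.

```rust
use std::convert::TryInto;

/// Illustrative only: split `content | footer | footer_len: u32` into
/// (content, footer). Little-endian is an assumption for this sketch.
fn split_managed_file(data: &[u8]) -> Option<(&[u8], &[u8])> {
    if data.len() < 4 {
        return None;
    }
    // The last 4 bytes encode the footer length.
    let (body, len_bytes) = data.split_at(data.len() - 4);
    let footer_len = u32::from_le_bytes(len_bytes.try_into().ok()?) as usize;
    if footer_len > body.len() {
        return None;
    }
    // Everything before the footer is the file content.
    let (content, footer) = body.split_at(body.len() - footer_len);
    Some((content, footer))
}
```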
@@ -116,6 +116,7 @@ impl SegmentMeta {
             SegmentComponent::FASTFIELDS => ".fast".to_string(),
             SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
             SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
+            SegmentComponent::FIELDSTATS => ".fieldstats".to_string(),
         });
         PathBuf::from(path)
     }
@@ -1,4 +1,3 @@
-use crate::common::BinarySerializable;
 use crate::directory::ReadOnlySource;
 use crate::positions::PositionReader;
 use crate::postings::TermInfo;

@@ -36,14 +35,12 @@ impl InvertedIndexReader {
         postings_source: ReadOnlySource,
         positions_source: ReadOnlySource,
         positions_idx_source: ReadOnlySource,
+        total_num_tokens: u64,
         record_option: IndexRecordOption,
     ) -> InvertedIndexReader {
-        let total_num_tokens_data = postings_source.slice(0, 8);
-        let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
-        let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
         InvertedIndexReader {
             termdict,
-            postings_source: postings_source.slice_from(8),
+            postings_source,
             positions_source,
             positions_idx_source,
             record_option,

@@ -89,7 +86,7 @@ impl InvertedIndexReader {
         term_info: &TermInfo,
         block_postings: &mut BlockSegmentPostings,
     ) {
-        let offset = term_info.postings_offset as usize;
+        let offset = term_info.postings_start_offset as usize;
         let end_source = self.postings_source.len();
         let postings_slice = self.postings_source.slice(offset, end_source);
         block_postings.reset(term_info.doc_freq, postings_slice);

@@ -117,8 +114,10 @@ impl InvertedIndexReader {
         term_info: &TermInfo,
         requested_option: IndexRecordOption,
     ) -> BlockSegmentPostings {
-        let offset = term_info.postings_offset as usize;
-        let postings_data = self.postings_source.slice_from(offset);
+        let postings_data = self.postings_source.slice(
+            term_info.postings_start_offset as usize,
+            term_info.postings_end_offset as usize,
+        );
         BlockSegmentPostings::from_data(
             term_info.doc_freq,
             postings_data,
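With the end offset now carried by `TermInfo`, a term's posting list is addressed as the half-open byte range `[postings_start_offset, postings_end_offset)` instead of "from the start offset to the end of the postings source", and the postings data no longer begins with an 8-byte `total_num_tokens` header that has to be skipped. A minimal sketch of that addressing, with illustrative types rather than tantivy's own:

```rust
// Illustrative sketch, not the tantivy API: address one term's posting list
// inside the postings bytes by its [start, end) offsets.
struct PostingsRange {
    postings_start_offset: u64,
    postings_end_offset: u64,
}

fn posting_list_bytes<'a>(postings_data: &'a [u8], range: &PostingsRange) -> &'a [u8] {
    &postings_data[range.postings_start_offset as usize..range.postings_end_offset as usize]
}
```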
@@ -24,14 +24,17 @@ pub enum SegmentComponent {
     /// Accessing a document from the store is relatively slow, as it
     /// requires to decompress the entire block it belongs to.
     STORE,
+
     /// Bitset describing which document of the segment is deleted.
     DELETE,
+
+    FIELDSTATS,
 }

 impl SegmentComponent {
     /// Iterates through the components.
     pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
-        static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
+        static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
             SegmentComponent::POSTINGS,
             SegmentComponent::POSITIONS,
             SegmentComponent::POSITIONSSKIP,

@@ -40,6 +43,7 @@ impl SegmentComponent {
             SegmentComponent::TERMS,
             SegmentComponent::STORE,
             SegmentComponent::DELETE,
+            SegmentComponent::FIELDSTATS,
         ];
         SEGMENT_COMPONENTS.iter()
     }
@@ -1,4 +1,3 @@
-use crate::common::CompositeFile;
 use crate::common::HasLen;
 use crate::core::InvertedIndexReader;
 use crate::core::Segment;

@@ -16,6 +15,7 @@ use crate::space_usage::SegmentSpaceUsage;
 use crate::store::StoreReader;
 use crate::termdict::TermDictionary;
 use crate::DocId;
+use crate::{common::CompositeFile, postings::FieldStats};
 use fail::fail_point;
 use std::collections::HashMap;
 use std::fmt;

@@ -49,6 +49,7 @@ pub struct SegmentReader {
     positions_idx_composite: CompositeFile,
     fast_fields_readers: Arc<FastFieldReaders>,
     fieldnorm_readers: FieldNormReaders,
+    field_stats: FieldStats,

     store_source: ReadOnlySource,
     delete_bitset_opt: Option<DeleteBitSet>,

@@ -179,6 +180,9 @@ impl SegmentReader {
         let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
         let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

+        let field_stats_data = segment.open_read(SegmentComponent::FIELDSTATS)?;
+        let field_stats = FieldStats::from_source(field_stats_data.as_slice())?;
+
         let delete_bitset_opt = if segment.meta().has_deletes() {
             let delete_data = segment.open_read(SegmentComponent::DELETE)?;
             Some(DeleteBitSet::open(delete_data))

@@ -194,6 +198,7 @@ impl SegmentReader {
             postings_composite,
             fast_fields_readers: fast_field_readers,
             fieldnorm_readers,
+            field_stats,
             segment_id: segment.id(),
             store_source,
             delete_bitset_opt,

@@ -260,11 +265,17 @@ impl SegmentReader {
             .open_read(field)
             .expect("Index corrupted. Failed to open field positions in composite file.");

+        let total_num_tokens = self
+            .field_stats
+            .get(field)
+            .map(|field_stat| field_stat.num_tokens())
+            .unwrap_or(0u64);
         let inv_idx_reader = Arc::new(InvertedIndexReader::new(
             TermDictionary::from_source(&termdict_source),
             postings_source,
             positions_source,
             positions_idx_source,
+            total_num_tokens,
             record_option,
         ));
@@ -5,6 +5,7 @@ Postings module (also called inverted index)
 mod block_search;
 mod block_segment_postings;
 pub(crate) mod compression;
+mod field_stats;
 mod postings;
 mod postings_writer;
 mod recorder;

@@ -15,6 +16,7 @@ mod stacker;
 mod term_info;

 pub(crate) use self::block_search::BlockSearcher;
+pub(crate) use self::field_stats::{FieldStat, FieldStats};

 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
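The new `field_stats` module itself is not part of this excerpt. Judging only from the call sites in the rest of the diff (`FieldStat::new`, `num_tokens`, `FieldStats::default`, `insert`, `get`, `serialize`, `from_source`), a minimal sketch of the interface it has to provide could look like the following; the field key type, the map representation, and the byte encoding are assumptions, and `from_source` is omitted:

```rust
use std::collections::HashMap;
use std::io::{self, Write};

/// Per-field statistics; for now only the total number of tokens.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct FieldStat {
    num_tokens: u64,
}

impl FieldStat {
    pub fn new(num_tokens: u64) -> FieldStat {
        FieldStat { num_tokens }
    }
    pub fn num_tokens(&self) -> u64 {
        self.num_tokens
    }
}

/// Map from field to its statistics. tantivy keys this by its `Field` type;
/// a plain u32 field id is used here to keep the sketch self-contained.
#[derive(Clone, Debug, Default)]
pub struct FieldStats {
    stats: HashMap<u32, FieldStat>,
}

impl FieldStats {
    pub fn insert(&mut self, field: u32, stat: FieldStat) {
        self.stats.insert(field, stat);
    }
    pub fn get(&self, field: u32) -> Option<&FieldStat> {
        self.stats.get(&field)
    }
    /// Illustrative encoding: (field, num_tokens) pairs, little-endian.
    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        for (field, stat) in &self.stats {
            writer.write_all(&field.to_le_bytes())?;
            writer.write_all(&stat.num_tokens.to_le_bytes())?;
        }
        Ok(())
    }
}
```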
@@ -1,5 +1,4 @@
-use super::TermInfo;
-use crate::common::{BinarySerializable, VInt};
+use super::{FieldStat, FieldStats, TermInfo};
 use crate::common::{CompositeWrite, CountingWriter};
 use crate::core::Segment;
 use crate::directory::WritePtr;

@@ -11,6 +10,10 @@ use crate::query::BM25Weight;
 use crate::schema::Schema;
 use crate::schema::{Field, FieldEntry, FieldType};
 use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
+use crate::{
+    common::{BinarySerializable, VInt},
+    directory::TerminatingWrite,
+};
 use crate::{DocId, Score};
 use std::cmp::Ordering;
 use std::io::{self, Write};

@@ -51,6 +54,8 @@ pub struct InvertedIndexSerializer {
     postings_write: CompositeWrite<WritePtr>,
     positions_write: CompositeWrite<WritePtr>,
     positionsidx_write: CompositeWrite<WritePtr>,
+    field_stats: FieldStats,
+    field_stats_write: WritePtr,
     schema: Schema,
 }

@@ -61,6 +66,7 @@ impl InvertedIndexSerializer {
         postings_write: CompositeWrite<WritePtr>,
         positions_write: CompositeWrite<WritePtr>,
         positionsidx_write: CompositeWrite<WritePtr>,
+        field_stats_write: WritePtr,
         schema: Schema,
     ) -> crate::Result<InvertedIndexSerializer> {
         Ok(InvertedIndexSerializer {

@@ -68,18 +74,21 @@ impl InvertedIndexSerializer {
             postings_write,
             positions_write,
             positionsidx_write,
+            field_stats: FieldStats::default(),
+            field_stats_write,
             schema,
         })
     }

     /// Open a new `PostingsSerializer` for the given segment
     pub fn open(segment: &mut Segment) -> crate::Result<InvertedIndexSerializer> {
-        use crate::SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
+        use crate::SegmentComponent::{FIELDSTATS, POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
         InvertedIndexSerializer::create(
             CompositeWrite::wrap(segment.open_write(TERMS)?),
             CompositeWrite::wrap(segment.open_write(POSTINGS)?),
             CompositeWrite::wrap(segment.open_write(POSITIONS)?),
             CompositeWrite::wrap(segment.open_write(POSITIONSSKIP)?),
+            segment.open_write(FIELDSTATS)?,
             segment.schema(),
         )
     }

@@ -94,6 +103,8 @@ impl InvertedIndexSerializer {
         total_num_tokens: u64,
         fieldnorm_reader: Option<FieldNormReader>,
     ) -> io::Result<FieldSerializer<'_>> {
+        self.field_stats
+            .insert(field, FieldStat::new(total_num_tokens));
         let field_entry: &FieldEntry = self.schema.get_field_entry(field);
         let term_dictionary_write = self.terms_write.for_field(field);
         let postings_write = self.postings_write.for_field(field);

@@ -112,7 +123,10 @@ impl InvertedIndexSerializer {
     }

     /// Closes the serializer.
-    pub fn close(self) -> io::Result<()> {
+    pub fn close(mut self) -> io::Result<()> {
+        self.field_stats
+            .serialize(self.field_stats_write.get_mut())?;
+        self.field_stats_write.terminate()?;
         self.terms_write.close()?;
         self.postings_write.close()?;
         self.positions_write.close()?;

@@ -142,7 +156,6 @@ impl<'a> FieldSerializer<'a> {
         positionsidx_write: &'a mut CountingWriter<WritePtr>,
         fieldnorm_reader: Option<FieldNormReader>,
     ) -> io::Result<FieldSerializer<'a>> {
-        total_num_tokens.serialize(postings_write)?;
         let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
             FieldType::Str(ref text_options) => {
                 if let Some(text_indexing_options) = text_options.get_indexing_options() {

@@ -190,7 +203,8 @@ impl<'a> FieldSerializer<'a> {
             .unwrap_or(0u64);
         TermInfo {
             doc_freq: 0,
-            postings_offset: self.postings_serializer.addr(),
+            postings_start_offset: self.postings_serializer.addr(),
+            postings_end_offset: 0u64,
             positions_idx,
         }
     }

@@ -244,10 +258,12 @@ impl<'a> FieldSerializer<'a> {
     /// using `VInt` encoding.
     pub fn close_term(&mut self) -> io::Result<()> {
         if self.term_open {
-            self.term_dictionary_builder
-                .insert_value(&self.current_term_info)?;
             self.postings_serializer
                 .close_term(self.current_term_info.doc_freq)?;
+            let end_offset = self.postings_serializer.addr();
+            self.current_term_info.postings_end_offset = end_offset;
+            self.term_dictionary_builder
+                .insert_value(&self.current_term_info)?;
             self.term_open = false;
         }
         Ok(())
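On the write side, `new_term` records the current postings address as the start offset, and the reordered `close_term` reads the address again only after the postings serializer has flushed the term, so the stored range covers exactly the bytes emitted for that term. A toy sketch of that bookkeeping against a plain byte counter (names are illustrative, not tantivy's):

```rust
// Illustrative sketch: track [start, end) of each term against a byte counter,
// the way new_term/close_term do with postings_serializer.addr().
struct TermRangeTracker {
    bytes_written: u64,
    current_start: u64,
}

impl TermRangeTracker {
    fn new() -> TermRangeTracker {
        TermRangeTracker { bytes_written: 0, current_start: 0 }
    }
    fn new_term(&mut self) {
        self.current_start = self.bytes_written;
    }
    fn write_postings(&mut self, num_bytes: u64) {
        self.bytes_written += num_bytes;
    }
    fn close_term(&mut self) -> (u64, u64) {
        // The end offset is read *after* the term's postings have been flushed.
        (self.current_start, self.bytes_written)
    }
}

fn main() {
    let mut tracker = TermRangeTracker::new();
    tracker.new_term();
    tracker.write_postings(6);
    assert_eq!(tracker.close_term(), (0, 6));
}
```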
@@ -7,35 +7,49 @@ use std::io;
 pub struct TermInfo {
     /// Number of documents in the segment containing the term
     pub doc_freq: u32,
-    /// Start offset within the postings (`.idx`) file.
-    pub postings_offset: u64,
+    /// Start offset of the posting list within the postings (`.idx`) file.
+    pub postings_start_offset: u64,
+    /// End offset of the posting list within the postings (`.idx`) file.
+    pub postings_end_offset: u64,
     /// Start offset of the first block within the position (`.pos`) file.
     pub positions_idx: u64,
 }

+impl TermInfo {
+    pub(crate) fn posting_num_bytes(&self) -> u32 {
+        let num_bytes = self.postings_end_offset - self.postings_start_offset;
+        assert!(num_bytes <= std::u32::MAX as u64);
+        num_bytes as u32
+    }
+}
+
 impl FixedSize for TermInfo {
     /// Size required for the binary serialization of a `TermInfo` object.
     /// This is large, but in practise, `TermInfo` are encoded in blocks and
     /// only the first `TermInfo` of a block is serialized uncompressed.
     /// The subsequent `TermInfo` are delta encoded and bitpacked.
-    const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
+    const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
 }

 impl BinarySerializable for TermInfo {
     fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
         self.doc_freq.serialize(writer)?;
-        self.postings_offset.serialize(writer)?;
+        self.postings_start_offset.serialize(writer)?;
+        self.posting_num_bytes().serialize(writer)?;
         self.positions_idx.serialize(writer)?;
         Ok(())
     }

     fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
         let doc_freq = u32::deserialize(reader)?;
-        let postings_offset = u64::deserialize(reader)?;
+        let postings_start_offset = u64::deserialize(reader)?;
+        let postings_num_bytes = u32::deserialize(reader)?;
+        let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes);
         let positions_idx = u64::deserialize(reader)?;
         Ok(TermInfo {
             doc_freq,
-            postings_offset,
+            postings_start_offset,
+            postings_end_offset,
             positions_idx,
         })
     }
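With the field order used in `serialize` above, a stored `TermInfo` occupies 4 (`doc_freq: u32`) + 8 (`postings_start_offset: u64`) + 4 (`posting_num_bytes: u32`) + 8 (`positions_idx: u64`) = 24 bytes, which matches the updated `SIZE_IN_BYTES = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES`. The end offset is never written; it is rebuilt on read as start plus length, for example:

```rust
fn main() {
    // Reconstructing the end offset from what is actually stored on disk.
    let postings_start_offset: u64 = 51;
    let postings_num_bytes: u32 = 6;
    let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes);
    assert_eq!(postings_end_offset, 57); // cf. the 51/57 pair used in the tests further down
}
```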
@@ -25,6 +25,8 @@ pub enum ComponentSpaceUsage {
     Store(StoreSpaceUsage),
     /// Some sort of raw byte count
     Basic(ByteCount),
+    ///
+    Unimplemented,
 }

 /// Represents combined space usage of an entire searcher and its component segments.

@@ -119,7 +121,7 @@ impl SegmentSpaceUsage {
     /// Clones the underlying data.
     /// Use the components directly if this is somehow in performance critical code.
     pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
-        use self::ComponentSpaceUsage::*;
+        use self::ComponentSpaceUsage::{Basic, PerField, Store, Unimplemented};
         use crate::SegmentComponent::*;
         match component {
             POSTINGS => PerField(self.postings().clone()),

@@ -130,6 +132,7 @@ impl SegmentSpaceUsage {
             TERMS => PerField(self.termdict().clone()),
             STORE => Store(self.store().clone()),
             DELETE => Basic(self.deletes()),
+            FIELDSTATS => Unimplemented,
         }
     }
@@ -44,11 +44,13 @@ mod tests {

     const BLOCK_SIZE: usize = 1_500;

-    fn make_term_info(val: u64) -> TermInfo {
+    fn make_term_info(term_ord: u64) -> TermInfo {
+        let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
         TermInfo {
-            doc_freq: val as u32,
-            positions_idx: val * 2u64,
-            postings_offset: val * 3u64,
+            doc_freq: term_ord as u32,
+            postings_start_offset: offset(term_ord),
+            postings_end_offset: offset(term_ord + 1),
+            positions_idx: offset(term_ord) * 2u64,
         }
     }

@@ -208,20 +210,14 @@ mod tests {
     }

     #[test]
-    fn test_stream_high_range_prefix_suffix() {
+    fn test_stream_high_range_prefix_suffix() -> std::io::Result<()> {
         let buffer: Vec<u8> = {
             let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
             // term requires more than 16bits
-            term_dictionary_builder
-                .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
-                .unwrap();
-            term_dictionary_builder
-                .insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
-                .unwrap();
-            term_dictionary_builder
-                .insert("abr", &make_term_info(2))
-                .unwrap();
-            term_dictionary_builder.finish().unwrap()
+            term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
+            term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
+            term_dictionary_builder.insert("abr", &make_term_info(3))?;
+            term_dictionary_builder.finish()?
         };
         let source = ReadOnlySource::from(buffer);
         let term_dictionary: TermDictionary = TermDictionary::from_source(&source);

@@ -229,12 +225,15 @@ mod tests {
         assert!(kv_stream.advance());
         assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
         assert_eq!(kv_stream.value(), &make_term_info(1));
+        dbg!(make_term_info(1));
         assert!(kv_stream.advance());
         assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
         assert_eq!(kv_stream.value(), &make_term_info(2));
         assert!(kv_stream.advance());
         assert_eq!(kv_stream.key(), "abr".as_bytes());
+        assert_eq!(kv_stream.value(), &make_term_info(3));
         assert!(!kv_stream.advance());
+        Ok(())
     }

     #[test]
@@ -57,21 +57,28 @@ impl TermInfoBlockMeta {
         self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits
     }

+    // Here inner_offset is the offset within the block, WITHOUT the first term_info.
+    // In other word, term_info #1,#2,#3 gets inner_offset 0,1,2... While term_info #0
+    // is encoded without bitpacking.
     fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
+        assert!(inner_offset < BLOCK_LEN - 1);
         let num_bits = self.num_bits() as usize;
         let mut cursor = num_bits * inner_offset;
+
+        let postings_start_offset = extract_bits(data, cursor, self.postings_offset_nbits);
+        let postings_end_offset = self.ref_term_info.postings_start_offset
+            + extract_bits(data, cursor + num_bits, self.postings_offset_nbits);
+        cursor += self.postings_offset_nbits as usize;
+
         let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32;
         cursor += self.doc_freq_nbits as usize;
-
-        let postings_offset = extract_bits(data, cursor, self.postings_offset_nbits);
-        cursor += self.postings_offset_nbits as usize;

         let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits);

         TermInfo {
             doc_freq,
-            postings_offset: postings_offset + self.ref_term_info.postings_offset,
+            postings_start_offset: postings_start_offset + self.ref_term_info.postings_start_offset,
+            postings_end_offset,
             positions_idx: positions_idx + self.ref_term_info.positions_idx,
         }
     }

@@ -126,14 +133,13 @@ impl TermInfoStore {
             .expect("Failed to deserialize terminfoblockmeta");
         let inner_offset = (term_ord as usize) % BLOCK_LEN;
         if inner_offset == 0 {
-            term_info_block_data.ref_term_info
-        } else {
-            let term_info_data = self.term_info_source.as_slice();
-            term_info_block_data.deserialize_term_info(
-                &term_info_data[term_info_block_data.offset as usize..],
-                inner_offset - 1,
-            )
+            return term_info_block_data.ref_term_info;
         }
+        let term_info_data = self.term_info_source.as_slice();
+        term_info_block_data.deserialize_term_info(
+            &term_info_data[term_info_block_data.offset as usize..],
+            inner_offset - 1,
+        )
     }

     pub fn num_terms(&self) -> usize {

@@ -154,16 +160,17 @@ fn bitpack_serialize<W: Write>(
     term_info_block_meta: &TermInfoBlockMeta,
     term_info: &TermInfo,
 ) -> io::Result<()> {
+    bit_packer.write(
+        term_info.postings_start_offset,
+        term_info_block_meta.postings_offset_nbits,
+        write,
+    )?;
     bit_packer.write(
         u64::from(term_info.doc_freq),
         term_info_block_meta.doc_freq_nbits,
         write,
     )?;
-    bit_packer.write(
-        term_info.postings_offset,
-        term_info_block_meta.postings_offset_nbits,
-        write,
-    )?;
     bit_packer.write(
         term_info.positions_idx,
         term_info_block_meta.positions_idx_nbits,

@@ -183,23 +190,27 @@ impl TermInfoStoreWriter {
     }

     fn flush_block(&mut self) -> io::Result<()> {
-        if self.term_infos.is_empty() {
-            return Ok(());
-        }
         let mut bit_packer = BitPacker::new();
         let ref_term_info = self.term_infos[0].clone();
+
+        let last_term_info = if let Some(last_term_info) = self.term_infos.last().cloned() {
+            last_term_info
+        } else {
+            return Ok(());
+        };
+        let postings_end_offset =
+            last_term_info.postings_end_offset - ref_term_info.postings_start_offset;
         for term_info in &mut self.term_infos[1..] {
-            term_info.postings_offset -= ref_term_info.postings_offset;
+            term_info.postings_start_offset -= ref_term_info.postings_start_offset;
             term_info.positions_idx -= ref_term_info.positions_idx;
         }

         let mut max_doc_freq: u32 = 0u32;
-        let mut max_postings_offset: u64 = 0u64;
-        let mut max_positions_idx: u64 = 0u64;
+        let max_postings_offset: u64 = postings_end_offset;
+        let max_positions_idx: u64 = last_term_info.positions_idx;

         for term_info in &self.term_infos[1..] {
             max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq);
-            max_postings_offset = cmp::max(max_postings_offset, term_info.postings_offset);
-            max_positions_idx = cmp::max(max_positions_idx, term_info.positions_idx);
         }

         let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));

@@ -224,6 +235,12 @@ impl TermInfoStoreWriter {
             )?;
         }

+        bit_packer.write(
+            postings_end_offset,
+            term_info_block_meta.postings_offset_nbits,
+            &mut self.buffer_term_infos,
+        )?;
+
         // Block need end up at the end of a byte.
         bit_packer.flush(&mut self.buffer_term_infos)?;
         self.term_infos.clear();

@@ -232,6 +249,7 @@ impl TermInfoStoreWriter {
     }

     pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
+        assert!(term_info.postings_end_offset >= term_info.postings_start_offset);
         self.num_terms += 1u64;
         self.term_infos.push(term_info.clone());
         if self.term_infos.len() >= BLOCK_LEN {

@@ -291,10 +309,11 @@ mod tests {
     #[test]
     fn test_term_info_block_meta_serialization() {
         let term_info_block_meta = TermInfoBlockMeta {
-            offset: 2009,
+            offset: 2009u64,
             ref_term_info: TermInfo {
                 doc_freq: 512,
-                postings_offset: 51,
+                postings_start_offset: 51,
+                postings_end_offset: 57u64,
                 positions_idx: 3584,
             },
             doc_freq_nbits: 10,

@@ -312,10 +331,12 @@ mod tests {
     fn test_pack() {
         let mut store_writer = TermInfoStoreWriter::new();
         let mut term_infos = vec![];
+        let offset = |i| (i * 13 + i * i) as u64;
         for i in 0..1000 {
             let term_info = TermInfo {
                 doc_freq: i as u32,
-                postings_offset: (i / 10) as u64,
+                postings_start_offset: offset(i),
+                postings_end_offset: offset(i + 1),
                 positions_idx: (i * 7) as u64,
             };
             store_writer.write_term_info(&term_info).unwrap();

@@ -325,7 +346,12 @@ mod tests {
         store_writer.serialize(&mut buffer).unwrap();
         let term_info_store = TermInfoStore::open(&ReadOnlySource::from(buffer));
         for i in 0..1000 {
-            assert_eq!(term_info_store.get(i as u64), term_infos[i]);
+            assert_eq!(
+                term_info_store.get(i as u64),
+                term_infos[i],
+                "term info {}",
+                i
+            );
         }
     }
 }
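Putting the reader and writer changes together: within a block, the first `TermInfo` is kept uncompressed as the reference, the following entries store their offsets as deltas against that reference, each field is bitpacked with a fixed width derived from its largest value in the block, and the packed start offset of entry i + 1 (plus one extra trailing value written after the loop) serves as the end offset of entry i. The sketch below illustrates only the delta and bit-width part on made-up offsets; `compute_num_bits` is reimplemented here for the example and is not the `common` helper the diff uses.

```rust
/// Number of bits needed to represent `value`; local stand-in for this example.
fn compute_num_bits(value: u64) -> u8 {
    (64 - value.leading_zeros()) as u8
}

fn main() {
    // Absolute start offsets of term infos #1.. in a block, followed by the
    // end offset of the last one. Entry #0 (the reference) starts at 1_000.
    let ref_start: u64 = 1_000;
    let offsets = [1_100u64, 1_250, 1_500, 1_900];

    // Deltas against the reference term info are what actually gets bitpacked.
    let deltas: Vec<u64> = offsets.iter().map(|offset| offset - ref_start).collect();
    assert_eq!(deltas, vec![100, 250, 500, 900]);

    // One shared width per field, wide enough for the largest delta.
    let nbits = compute_num_bits(*deltas.iter().max().unwrap());
    assert_eq!(nbits, 10); // 900 fits in 10 bits

    // The end offset of entry i is recovered from the start offset of entry i + 1.
    let end_of_first = offsets[1];
    assert_eq!(end_of_first, 1_250);
}
```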