mirror of https://github.com/quickwit-oss/tantivy.git
Replacing (start, end) with Range
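The refactor is mechanical but touches many call sites: wherever a pair of offsets `(start, stop)` was passed around or stored, the code now carries a `std::ops::Range`, which names the interval explicitly and gives `len()`, iteration, and direct slicing for free. Below is a minimal sketch of the pattern for readers unfamiliar with it; the `read_pair`/`read_range` helpers are illustrative only and are not part of tantivy's API.

```rust
use std::ops::Range;

// Before: a pair of offsets that every caller has to interpret.
fn read_pair(data: &[u8], from: usize, to: usize) -> &[u8] {
    &data[from..to]
}

// After: a Range<usize> can index the slice directly and knows its length.
fn read_range(data: &[u8], byte_range: Range<usize>) -> &[u8] {
    &data[byte_range]
}

fn main() {
    let data = b"abcdef";
    assert_eq!(read_pair(data, 1, 4), &b"bcd"[..]);
    assert_eq!(read_range(data, 1..4), &b"bcd"[..]);
    // Range is not Copy, which is why the diff clones stored ranges
    // (`byte_range.clone()`) before reusing them as an index.
    let byte_range = 1..4;
    assert_eq!(byte_range.len(), 3);
    assert_eq!(read_range(data, byte_range.clone()), &b"bcd"[..]);
}
```

One consequence visible throughout the diff: types that now hold a `Range` can no longer derive `Copy` (see `Checkpoint` and `TermInfo`), so call sites switch from copying to `.clone()`.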
@@ -69,12 +69,12 @@ fn highlight(snippet: Snippet) -> String {
let mut result = String::new();
let mut start_from = 0;

for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) {
result.push_str(&snippet.fragments()[start_from..start]);
for fragment_range in snippet.highlighted() {
result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
result.push_str(" --> ");
result.push_str(&snippet.fragments()[start..end]);
result.push_str(&snippet.fragments()[fragment_range.clone()]);
result.push_str(" <-- ");
start_from = end;
start_from = fragment_range.end;
}

result.push_str(&snippet.fragments()[start_from..]);

@@ -8,6 +8,8 @@ use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range;

use super::HasLen;

@@ -105,7 +107,7 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
#[derive(Clone)]
pub struct CompositeFile {
data: FileSlice,
offsets_index: HashMap<FileAddr, (usize, usize)>,
offsets_index: HashMap<FileAddr, Range<usize>>,
}

impl CompositeFile {
@@ -117,7 +119,7 @@ impl CompositeFile {
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
let footer_start = end - 4 - footer_len;
let footer_data = data
.slice(footer_start, footer_start + footer_len)
.slice(footer_start..footer_start + footer_len)
.read_bytes()?;
let mut footer_buffer = footer_data.as_slice();
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
@@ -138,7 +140,7 @@ impl CompositeFile {
let file_addr = file_addrs[i];
let start_offset = offsets[i];
let end_offset = offsets[i + 1];
field_index.insert(file_addr, (start_offset, end_offset));
field_index.insert(file_addr, start_offset..end_offset);
}

Ok(CompositeFile {
@@ -167,16 +169,16 @@ impl CompositeFile {
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<FileSlice> {
self.offsets_index
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
.map(|byte_range| self.data.slice(byte_range.clone()))
}

pub fn space_usage(&self) -> PerFieldSpaceUsage {
let mut fields = HashMap::new();
for (&field_addr, &(start, end)) in self.offsets_index.iter() {
for (&field_addr, byte_range) in &self.offsets_index {
fields
.entry(field_addr.field)
.or_insert_with(|| FieldUsage::empty(field_addr.field))
.add_field_idx(field_addr.idx, end - start);
.add_field_idx(field_addr.idx, byte_range.len());
}
PerFieldSpaceUsage::new(fields)
}

@@ -90,9 +90,9 @@ impl InvertedIndexReader {
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) -> io::Result<()> {
let start_offset = term_info.postings_start_offset as usize;
let stop_offset = term_info.postings_stop_offset as usize;
let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset);
let postings_slice = self
.postings_file_slice
.slice(term_info.postings_range.clone());
block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
Ok(())
}
@@ -120,10 +120,9 @@ impl InvertedIndexReader {
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> {
let postings_data = self.postings_file_slice.slice(
term_info.postings_start_offset as usize,
term_info.postings_stop_offset as usize,
);
let postings_data = self
.postings_file_slice
.slice(term_info.postings_range.clone());
BlockSegmentPostings::open(
term_info.doc_freq,
postings_data,

@@ -2,6 +2,7 @@ use stable_deref_trait::StableDeref;

use crate::common::HasLen;
use crate::directory::OwnedBytes;
use std::ops::Range;
use std::sync::{Arc, Weak};
use std::{io, ops::Deref};

@@ -20,19 +21,19 @@ pub trait FileHandle: 'static + Send + Sync + HasLen {
/// Reads a slice of bytes.
///
/// This method may panic if the range requested is invalid.
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes>;
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes>;
}

impl FileHandle for &'static [u8] {
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
let bytes = &self[from..to];
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
let bytes = &self[range];
Ok(OwnedBytes::new(bytes))
}
}

impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.as_ref().len()
self.deref().len()
}
}

@@ -52,8 +53,7 @@ where
#[derive(Clone)]
pub struct FileSlice {
data: Arc<dyn FileHandle>,
start: usize,
stop: usize,
range: Range<usize>,
}

impl FileSlice {
@@ -68,8 +68,7 @@ impl FileSlice {
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
FileSlice {
data: Arc::from(file_handle),
start: 0,
stop: num_bytes,
range: 0..num_bytes,
}
}

@@ -77,14 +76,12 @@ impl FileSlice {
///
/// # Panics
///
/// Panics if `to < from` or if `to` exceeds the filesize.
pub fn slice(&self, from: usize, to: usize) -> FileSlice {
assert!(to <= self.len());
assert!(to >= from);
/// Panics if `byte_range.end` exceeds the filesize.
pub fn slice(&self, byte_range: Range<usize>) -> FileSlice {
assert!(byte_range.end <= self.len());
FileSlice {
data: self.data.clone(),
start: self.start + from,
stop: self.start + to,
range: self.range.start + byte_range.start..self.range.start + byte_range.end,
}
}

@@ -101,19 +98,21 @@ impl FileSlice {
/// In particular, it is up to the `Directory` implementation
/// to handle caching if needed.
pub fn read_bytes(&self) -> io::Result<OwnedBytes> {
self.data.read_bytes(self.start, self.stop)
self.data.read_bytes(self.range.clone())
}

/// Reads a specific slice of data.
///
/// This is equivalent to running `file_slice.slice(from, to).read_bytes()`.
pub fn read_bytes_slice(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
assert!(from <= to);
pub fn read_bytes_slice(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
assert!(
self.start + to <= self.stop,
"`to` exceeds the fileslice length"
range.end <= self.len(),
"end of requested range exceeds the fileslice length ({} > {})",
range.end,
self.len()
);
self.data.read_bytes(self.start + from, self.start + to)
self.data
.read_bytes(self.range.start + range.start..self.range.start + range.end)
}

/// Splits the FileSlice at the given offset and return two file slices.
@@ -138,7 +137,7 @@ impl FileSlice {
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> FileSlice {
self.slice(from_offset, self.len())
self.slice(from_offset..self.len())
}

/// Like `.slice(...)` but enforcing only the `to`
@@ -146,19 +145,19 @@ impl FileSlice {
///
/// Equivalent to `.slice(0, to_offset)`
pub fn slice_to(&self, to_offset: usize) -> FileSlice {
self.slice(0, to_offset)
self.slice(0..to_offset)
}
}

impl FileHandle for FileSlice {
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
self.read_bytes_slice(from, to)
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
self.read_bytes_slice(range)
}
}

impl HasLen for FileSlice {
fn len(&self) -> usize {
self.stop - self.start
self.range.len()
}
}

@@ -217,30 +216,23 @@ mod tests {
let slice = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(slice.len(), 6);
assert_eq!(slice.read_bytes()?.as_ref(), b"abcdef");
assert_eq!(slice.slice(1, 4).read_bytes()?.as_ref(), b"bcd");
assert_eq!(slice.slice(1..4).read_bytes()?.as_ref(), b"bcd");
Ok(())
}

#[test]
fn test_slice_read_slice() -> io::Result<()> {
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(slice_deref.read_bytes_slice(1, 4)?.as_ref(), b"bcd");
assert_eq!(slice_deref.read_bytes_slice(1..4)?.as_ref(), b"bcd");
Ok(())
}

#[test]
#[should_panic(expected = "assertion failed: from <= to")]
fn test_slice_read_slice_invalid_range() {
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(slice_deref.read_bytes_slice(1, 0).unwrap().as_ref(), b"bcd");
}

#[test]
#[should_panic(expected = "`to` exceeds the fileslice length")]
#[should_panic(expected = "end of requested range exceeds the fileslice length (10 > 6)")]
fn test_slice_read_slice_invalid_range_exceeds() {
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(
slice_deref.read_bytes_slice(0, 10).unwrap().as_ref(),
slice_deref.read_bytes_slice(0..10).unwrap().as_ref(),
b"bcd"
);
}

@@ -2,7 +2,7 @@ use crate::directory::FileHandle;
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::Deref;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};

@@ -17,8 +17,8 @@ pub struct OwnedBytes {
}

impl FileHandle for OwnedBytes {
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
Ok(self.slice(from, to))
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
Ok(self.slice(range))
}
}

@@ -42,9 +42,9 @@ impl OwnedBytes {
}

/// creates a fileslice that is just a view over a slice of the data.
pub fn slice(&self, from: usize, to: usize) -> Self {
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[from..to],
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
}
}

@@ -1,3 +1,5 @@
use std::ops::Range;

use crate::fastfield::{FastFieldReader, FastValue};
use crate::DocId;

@@ -28,24 +30,24 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {

/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
(start, stop)
start..stop
}

/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(start, &mut vals[..]);
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
}

/// Returns the number of values associated with the document `DocId`.
pub fn num_vals(&self, doc: DocId) -> usize {
let (start, stop) = self.range(doc);
(stop - start) as usize
let range = self.range(doc);
(range.end - range.start) as usize
}

/// Returns the overall number of values in this field .

@@ -125,21 +125,18 @@ impl MultiValuedFastFieldWriter {
1,
)?;

let last_interval = (
self.doc_index.last().cloned().unwrap(),
self.vals.len() as u64,
);
let last_interval =
self.doc_index.last().cloned().unwrap() as usize..self.vals.len();

let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
for (start, stop) in self
for range in self
.doc_index
.windows(2)
.map(|interval| (interval[0], interval[1]))
.map(|interval| interval[0] as usize..interval[1] as usize)
.chain(Some(last_interval).into_iter())
.map(|(start, stop)| (start as usize, stop as usize))
{
doc_vals.clear();
let remapped_vals = self.vals[start..stop]
let remapped_vals = self.vals[range]
.iter()
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
doc_vals.extend(remapped_vals);

@@ -1,3 +1,5 @@
use std::ops::Range;

use crate::postings::compression::AlignedBuffer;

/// This modules define the logic used to search for a doc in a given
@@ -72,7 +74,7 @@ fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
@@ -80,17 +82,17 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
break;
}
if arr[pivot] > target {
return (begin, pivot);
return begin..pivot;
}
begin = pivot;
}
(begin, end)
begin..end
}

#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(&block_docs, target);
start + linear_search(&block_docs[start..end], target)
let range = exponential_search(&block_docs, target);
range.start + linear_search(&block_docs[range], target)
}

/// Tantivy may rely on SIMD instructions to search for a specific document within
@@ -182,11 +184,11 @@ mod tests {

#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(exponential_search(&[1, 2], 0), 0..1);
assert_eq!(exponential_search(&[1, 2], 1), 0..1);
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
3..7
);
}

@@ -16,7 +16,7 @@ use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
use std::marker::PhantomData;
use std::ops::DerefMut;
use std::ops::{DerefMut, Range};

fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
match *field_entry.field_type() {
@@ -52,7 +52,7 @@ pub struct MultiFieldPostingsWriter {

fn make_field_partition(
term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, usize, usize)> {
) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
.map(|(key, _, _)| Term::wrap(key).field())
@@ -70,7 +70,7 @@ fn make_field_partition(
offsets.push(term_offsets.len());
let mut field_offsets = vec![];
for i in 0..fields.len() {
field_offsets.push((fields[i], offsets[i], offsets[i + 1]));
field_offsets.push((fields[i], offsets[i]..offsets[i + 1]));
}
field_offsets
}
@@ -138,14 +138,14 @@ impl MultiFieldPostingsWriter {

let field_offsets = make_field_partition(&term_offsets);

for (field, start, stop) in field_offsets {
for (field, byte_offsets) in field_offsets {
let field_entry = self.schema.get_field_entry(field);

match *field_entry.field_type() {
FieldType::Str(_) | FieldType::HierarchicalFacet => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets[start..stop]
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
@@ -169,7 +169,7 @@ impl MultiFieldPostingsWriter {
fieldnorm_reader,
)?;
postings_writer.serialize(
&term_offsets[start..stop],
&term_offsets[byte_offsets],
&mut field_serializer,
&self.term_index.heap,
&self.heap,

@@ -183,10 +183,10 @@ impl<'a> FieldSerializer<'a> {
} else {
0u64
};
let addr = self.postings_serializer.addr() as usize;
TermInfo {
doc_freq: 0,
postings_start_offset: self.postings_serializer.addr(),
postings_stop_offset: 0u64,
postings_range: addr..addr,
positions_idx,
}
}
@@ -242,7 +242,7 @@ impl<'a> FieldSerializer<'a> {
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
self.current_term_info.postings_stop_offset = self.postings_serializer.addr();
self.current_term_info.postings_range.end = self.postings_serializer.addr() as usize;
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false;

@@ -17,10 +17,6 @@ pub fn compute_table_size(num_bits: usize) -> usize {
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone)]
struct KeyValue {
key_value_addr: Addr,

@@ -1,25 +1,24 @@
use crate::common::{BinarySerializable, FixedSize};
use std::io;
use std::iter::ExactSizeIterator;
use std::ops::Range;

/// `TermInfo` wraps the metadata associated to a Term.
/// It is segment-local.
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
#[derive(Debug, Default, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Start offset of the posting list within the postings (`.idx`) file.
pub postings_start_offset: u64,
/// Stop offset of the posting list within the postings (`.idx`) file.
/// The byte range is `[start_offset..stop_offset)`.
pub postings_stop_offset: u64,
/// Byte range of the posting list within the postings (`.idx`) file.
pub postings_range: Range<usize>,
/// Start offset of the first block within the position (`.pos`) file.
pub positions_idx: u64,
}

impl TermInfo {
pub(crate) fn posting_num_bytes(&self) -> u32 {
let num_bytes = self.postings_stop_offset - self.postings_start_offset;
assert!(num_bytes <= std::u32::MAX as u64);
let num_bytes = self.postings_range.len();
assert!(num_bytes <= std::u32::MAX as usize);
num_bytes as u32
}
}
@@ -35,7 +34,7 @@ impl FixedSize for TermInfo {
impl BinarySerializable for TermInfo {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?;
self.postings_start_offset.serialize(writer)?;
(self.postings_range.start as u64).serialize(writer)?;
self.posting_num_bytes().serialize(writer)?;
self.positions_idx.serialize(writer)?;
Ok(())
@@ -43,14 +42,13 @@ impl BinarySerializable for TermInfo {

fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = u32::deserialize(reader)?;
let postings_start_offset = u64::deserialize(reader)?;
let postings_start_offset = u64::deserialize(reader)? as usize;
let postings_num_bytes = u32::deserialize(reader)?;
let postings_stop_offset = postings_start_offset + u64::from(postings_num_bytes);
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes) as usize;
let positions_idx = u64::deserialize(reader)?;
Ok(TermInfo {
doc_freq,
postings_start_offset,
postings_stop_offset,
postings_range: postings_start_offset..postings_end_offset,
positions_idx,
})
}

@@ -8,33 +8,17 @@ use htmlescape::encode_minimal;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::ops::Range;

const DEFAULT_MAX_NUM_CHARS: usize = 150;

#[derive(Debug)]
pub struct HighlightSection {
start: usize,
stop: usize,
}

impl HighlightSection {
fn new(start: usize, stop: usize) -> HighlightSection {
HighlightSection { start, stop }
}

/// Returns the bounds of the `HighlightSection`.
pub fn bounds(&self) -> (usize, usize) {
(self.start, self.stop)
}
}

#[derive(Debug)]
pub struct FragmentCandidate {
score: Score,
start_offset: usize,
stop_offset: usize,
num_chars: usize,
highlighted: Vec<HighlightSection>,
highlighted: Vec<Range<usize>>,
}

impl FragmentCandidate {
@@ -63,8 +47,7 @@ impl FragmentCandidate {

if let Some(&score) = terms.get(&token.text.to_lowercase()) {
self.score += score;
self.highlighted
.push(HighlightSection::new(token.offset_from, token.offset_to));
self.highlighted.push(token.offset_from..token.offset_to);
}
}
}
@@ -74,7 +57,7 @@ impl FragmentCandidate {
#[derive(Debug)]
pub struct Snippet {
fragments: String,
highlighted: Vec<HighlightSection>,
highlighted: Vec<Range<usize>>,
}

const HIGHLIGHTEN_PREFIX: &str = "<b>";
@@ -97,9 +80,9 @@ impl Snippet {
for item in self.highlighted.iter() {
html.push_str(&encode_minimal(&self.fragments[start_from..item.start]));
html.push_str(HIGHLIGHTEN_PREFIX);
html.push_str(&encode_minimal(&self.fragments[item.start..item.stop]));
html.push_str(&encode_minimal(&self.fragments[item.clone()]));
html.push_str(HIGHLIGHTEN_POSTFIX);
start_from = item.stop;
start_from = item.end;
}
html.push_str(&encode_minimal(
&self.fragments[start_from..self.fragments.len()],
@@ -113,7 +96,7 @@ impl Snippet {
}

/// Returns a list of higlighted positions from the `Snippet`.
pub fn highlighted(&self) -> &[HighlightSection] {
pub fn highlighted(&self) -> &[Range<usize>] {
&self.highlighted
}
}
@@ -185,12 +168,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
let highlighted = fragment
.highlighted
.iter()
.map(|item| {
HighlightSection::new(
item.start - fragment.start_offset,
item.stop - fragment.start_offset,
)
})
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
.collect();
Snippet {
fragments: fragment_text.to_string(),

@@ -2,6 +2,7 @@ use crate::common::VInt;
use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD};
use crate::DocId;
use std::io;
use std::ops::Range;

/// Represents a block of checkpoints.
///
@@ -24,19 +25,19 @@ impl Default for CheckpointBlock {
impl CheckpointBlock {
/// If non-empty returns [start_doc, end_doc)
/// for the overall block.
pub fn doc_interval(&self) -> Option<(DocId, DocId)> {
pub fn doc_interval(&self) -> Option<Range<DocId>> {
let start_doc_opt = self
.checkpoints
.first()
.cloned()
.map(|checkpoint| checkpoint.start_doc);
.map(|checkpoint| checkpoint.doc_range.start);
let end_doc_opt = self
.checkpoints
.last()
.cloned()
.map(|checkpoint| checkpoint.end_doc);
.map(|checkpoint| checkpoint.doc_range.end);
match (start_doc_opt, end_doc_opt) {
(Some(start_doc), Some(end_doc)) => Some((start_doc, end_doc)),
(Some(start_doc), Some(end_doc)) => Some(start_doc..end_doc),
_ => None,
}
}
@@ -55,7 +56,7 @@ impl CheckpointBlock {
}

pub fn get(&self, idx: usize) -> Checkpoint {
self.checkpoints[idx]
self.checkpoints[idx].clone()
}

pub fn clear(&mut self) {
@@ -67,12 +68,13 @@ impl CheckpointBlock {
if self.checkpoints.is_empty() {
return;
}
VInt(self.checkpoints[0].start_doc as u64).serialize_into_vec(buffer);
VInt(self.checkpoints[0].start_offset as u64).serialize_into_vec(buffer);
VInt(self.checkpoints[0].doc_range.start as u64).serialize_into_vec(buffer);
VInt(self.checkpoints[0].byte_range.start as u64).serialize_into_vec(buffer);
for checkpoint in &self.checkpoints {
let delta_doc = checkpoint.end_doc - checkpoint.start_doc;
let delta_doc = checkpoint.doc_range.end - checkpoint.doc_range.start;
VInt(delta_doc as u64).serialize_into_vec(buffer);
VInt(checkpoint.end_offset - checkpoint.start_offset).serialize_into_vec(buffer);
VInt((checkpoint.byte_range.end - checkpoint.byte_range.start) as u64)
.serialize_into_vec(buffer);
}
}

@@ -86,15 +88,13 @@ impl CheckpointBlock {
return Ok(());
}
let mut doc = VInt::deserialize_u64(data)? as DocId;
let mut start_offset = VInt::deserialize_u64(data)?;
let mut start_offset = VInt::deserialize_u64(data)? as usize;
for _ in 0..len {
let num_docs = VInt::deserialize_u64(data)? as DocId;
let block_num_bytes = VInt::deserialize_u64(data)?;
let block_num_bytes = VInt::deserialize_u64(data)? as usize;
self.checkpoints.push(Checkpoint {
start_doc: doc,
end_doc: doc + num_docs,
start_offset,
end_offset: start_offset + block_num_bytes,
doc_range: doc..doc + num_docs,
byte_range: start_offset..start_offset + block_num_bytes,
});
doc += num_docs;
start_offset += block_num_bytes;
@@ -112,17 +112,15 @@ mod tests {

fn test_aux_ser_deser(checkpoints: &[Checkpoint]) -> io::Result<()> {
let mut block = CheckpointBlock::default();
for &checkpoint in checkpoints {
block.push(checkpoint);
for checkpoint in checkpoints {
block.push(checkpoint.clone());
}
let mut buffer = Vec::new();
block.serialize(&mut buffer);
let mut block_deser = CheckpointBlock::default();
let checkpoint = Checkpoint {
start_doc: 0,
end_doc: 1,
start_offset: 2,
end_offset: 3,
doc_range: 0..1,
byte_range: 2..3,
};
block_deser.push(checkpoint); // < check that value is erased before deser
let mut data = &buffer[..];
@@ -140,26 +138,22 @@ mod tests {
#[test]
fn test_block_serialize_simple() -> io::Result<()> {
let checkpoints = vec![Checkpoint {
start_doc: 10,
end_doc: 12,
start_offset: 100,
end_offset: 120,
doc_range: 10..12,
byte_range: 100..120,
}];
test_aux_ser_deser(&checkpoints)
}

#[test]
fn test_block_serialize() -> io::Result<()> {
let offsets: Vec<u64> = (0..11).map(|i| i * i * i).collect();
let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();
let mut checkpoints = vec![];
let mut start_doc = 0;
for i in 0..10 {
let end_doc = (i * i) as DocId;
checkpoints.push(Checkpoint {
start_doc,
end_doc,
start_offset: offsets[i],
end_offset: offsets[i + 1],
doc_range: start_doc..end_doc,
byte_range: offsets[i]..offsets[i + 1],
});
start_doc = end_doc;
}

@@ -1,6 +1,7 @@
const CHECKPOINT_PERIOD: usize = 8;

use std::fmt;
use std::ops::Range;
mod block;
mod skip_index;
mod skip_index_builder;
@@ -15,30 +16,24 @@ pub use self::skip_index_builder::SkipIndexBuilder;
/// of checkpoints.
///
/// All of the intervals here defined are semi-open.
/// The checkpoint describes that the block within the bytes
/// `[start_offset..end_offset)` spans over the docs
/// `[start_doc..end_doc)`.
#[derive(Clone, Copy, Eq, PartialEq)]
/// The checkpoint describes that the block within the `byte_range`
/// and spans over the `doc_range`.
#[derive(Clone, Eq, PartialEq)]
pub struct Checkpoint {
pub start_doc: DocId,
pub end_doc: DocId,
pub start_offset: u64,
pub end_offset: u64,
pub doc_range: Range<DocId>,
pub byte_range: Range<usize>,
}

impl Checkpoint {
pub(crate) fn follows(&self, other: &Checkpoint) -> bool {
(self.start_doc == other.end_doc) && (self.start_offset == other.end_offset)
(self.doc_range.start == other.doc_range.end)
&& (self.doc_range.start == other.doc_range.end)
}
}

impl fmt::Debug for Checkpoint {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"(doc=[{}..{}), bytes=[{}..{}))",
self.start_doc, self.end_doc, self.start_offset, self.end_offset
)
write!(f, "(doc={:?}, bytes={:?})", self.doc_range, self.byte_range)
}
}

@@ -74,12 +69,10 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let mut skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
let checkpoint = Checkpoint {
start_doc: 0,
end_doc: 2,
start_offset: 0,
end_offset: 3,
doc_range: 0..2,
byte_range: 0..3,
};
skip_index_builder.insert(checkpoint);
skip_index_builder.insert(checkpoint.clone());
skip_index_builder.write(&mut output)?;
let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
let mut skip_cursor = skip_index.checkpoints();
@@ -93,40 +86,30 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let checkpoints = vec![
Checkpoint {
start_doc: 0,
end_doc: 3,
start_offset: 0,
end_offset: 9,
doc_range: 0..3,
byte_range: 0..9,
},
Checkpoint {
start_doc: 3,
end_doc: 4,
start_offset: 9,
end_offset: 25,
doc_range: 3..4,
byte_range: 9..25,
},
Checkpoint {
start_doc: 4,
end_doc: 6,
start_offset: 25,
end_offset: 49,
doc_range: 4..6,
byte_range: 25..49,
},
Checkpoint {
start_doc: 6,
end_doc: 8,
start_offset: 49,
end_offset: 81,
doc_range: 6..8,
byte_range: 49..81,
},
Checkpoint {
start_doc: 8,
end_doc: 10,
start_offset: 81,
end_offset: 100,
doc_range: 8..10,
byte_range: 81..100,
},
];

let mut skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
for &checkpoint in &checkpoints {
skip_index_builder.insert(checkpoint);
for checkpoint in &checkpoints {
skip_index_builder.insert(checkpoint.clone());
}
skip_index_builder.write(&mut output)?;

@@ -138,8 +121,8 @@ mod tests {
Ok(())
}

fn offset_test(doc: DocId) -> u64 {
(doc as u64) * (doc as u64)
fn offset_test(doc: DocId) -> usize {
(doc as usize) * (doc as usize)
}

#[test]
@@ -181,15 +164,13 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let checkpoints: Vec<Checkpoint> = (0..1000)
.map(|i| Checkpoint {
start_doc: i,
end_doc: i + 1,
start_offset: offset_test(i),
end_offset: offset_test(i + 1),
doc_range: i..(i + 1),
byte_range: offset_test(i)..offset_test(i + 1),
})
.collect();
let mut skip_index_builder = SkipIndexBuilder::new();
for checkpoint in &checkpoints {
skip_index_builder.insert(*checkpoint);
skip_index_builder.insert(checkpoint.clone());
}
skip_index_builder.write(&mut output)?;
assert_eq!(output.len(), 4035);
@@ -200,10 +181,10 @@ mod tests {
Ok(())
}

fn integrate_delta(vals: Vec<u64>) -> Vec<u64> {
fn integrate_delta(vals: Vec<usize>) -> Vec<usize> {
let mut output = Vec::with_capacity(vals.len() + 1);
output.push(0u64);
let mut prev = 0u64;
output.push(0);
let mut prev = 0;
for val in vals {
let new_val = val + prev;
prev = new_val;
@@ -217,16 +198,14 @@ mod tests {
(0..max_len)
.prop_flat_map(move |len: usize| {
(
proptest::collection::vec(1u64..20u64, len as usize).prop_map(integrate_delta),
proptest::collection::vec(1u64..26u64, len as usize).prop_map(integrate_delta),
proptest::collection::vec(1usize..20, len as usize).prop_map(integrate_delta),
proptest::collection::vec(1usize..26, len as usize).prop_map(integrate_delta),
)
.prop_map(|(docs, offsets)| {
(0..docs.len() - 1)
.map(move |i| Checkpoint {
start_doc: docs[i] as DocId,
end_doc: docs[i + 1] as DocId,
start_offset: offsets[i],
end_offset: offsets[i + 1],
doc_range: docs[i] as DocId..docs[i + 1] as DocId,
byte_range: offsets[i]..offsets[i + 1],
})
.collect::<Vec<Checkpoint>>()
})
@@ -240,17 +219,17 @@ mod tests {
) -> Option<Checkpoint> {
checkpoints
.into_iter()
.filter(|checkpoint| checkpoint.end_doc > target)
.filter(|checkpoint| checkpoint.doc_range.end > target)
.next()
}

fn test_skip_index_aux(skip_index: SkipIndex, checkpoints: &[Checkpoint]) {
if let Some(last_checkpoint) = checkpoints.last() {
for doc in 0u32..last_checkpoint.end_doc {
for doc in 0u32..last_checkpoint.doc_range.end {
let expected = seek_manual(skip_index.checkpoints(), doc);
assert_eq!(expected, skip_index.seek(doc), "Doc {}", doc);
}
assert!(skip_index.seek(last_checkpoint.end_doc).is_none());
assert!(skip_index.seek(last_checkpoint.doc_range.end).is_none());
}
}

@@ -36,21 +36,21 @@ struct Layer {

impl Layer {
fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.cursor_at_offset(0u64)
self.cursor_at_offset(0)
}

fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
fn cursor_at_offset(&self, start_offset: usize) -> impl Iterator<Item = Checkpoint> + '_ {
let data = &self.data.as_slice();
LayerCursor {
remaining: &data[start_offset as usize..],
remaining: &data[start_offset..],
block: CheckpointBlock::default(),
cursor: 0,
}
}

fn seek_start_at_offset(&self, target: DocId, offset: u64) -> Option<Checkpoint> {
fn seek_start_at_offset(&self, target: DocId, offset: usize) -> Option<Checkpoint> {
self.cursor_at_offset(offset)
.find(|checkpoint| checkpoint.end_doc > target)
.find(|checkpoint| checkpoint.doc_range.end > target)
}
}

@@ -69,7 +69,7 @@ impl SkipIndex {
let mut layers = Vec::new();
for end_offset in offsets {
let layer = Layer {
data: data.slice(start_offset as usize, end_offset as usize),
data: data.slice(start_offset as usize..end_offset as usize),
};
layers.push(layer);
start_offset = end_offset;
@@ -88,17 +88,15 @@ impl SkipIndex {
let first_layer_len = self
.layers
.first()
.map(|layer| layer.data.len() as u64)
.unwrap_or(0u64);
.map(|layer| layer.data.len())
.unwrap_or(0);
let mut cur_checkpoint = Checkpoint {
start_doc: 0u32,
end_doc: 1u32,
start_offset: 0u64,
end_offset: first_layer_len,
doc_range: 0u32..1u32,
byte_range: 0..first_layer_len,
};
for layer in &self.layers {
if let Some(checkpoint) =
layer.seek_start_at_offset(target, cur_checkpoint.start_offset)
layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start)
{
cur_checkpoint = checkpoint;
} else {

@@ -28,16 +28,14 @@ impl LayerBuilder {
///
/// If the block was empty to begin with, simply return None.
fn flush_block(&mut self) -> Option<Checkpoint> {
if let Some((start_doc, end_doc)) = self.block.doc_interval() {
let start_offset = self.buffer.len() as u64;
if let Some(doc_range) = self.block.doc_interval() {
let start_offset = self.buffer.len();
self.block.serialize(&mut self.buffer);
let end_offset = self.buffer.len() as u64;
let end_offset = self.buffer.len();
self.block.clear();
Some(Checkpoint {
start_doc,
end_doc,
start_offset,
end_offset,
doc_range,
byte_range: start_offset..end_offset,
})
} else {
None

@@ -17,7 +17,7 @@ const LRU_CACHE_CAPACITY: usize = 100;

type Block = Arc<Vec<u8>>;

type BlockCache = Arc<Mutex<LruCache<u64, Block>>>;
type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;

/// Reads document off tantivy's [`Store`](./index.html)
pub struct StoreReader {
@@ -59,16 +59,11 @@ impl StoreReader {
}

fn compressed_block(&self, checkpoint: &Checkpoint) -> io::Result<OwnedBytes> {
self.data
.slice(
checkpoint.start_offset as usize,
checkpoint.end_offset as usize,
)
.read_bytes()
self.data.slice(checkpoint.byte_range.clone()).read_bytes()
}

fn read_block(&self, checkpoint: &Checkpoint) -> io::Result<Block> {
if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.start_offset) {
if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.byte_range.start) {
self.cache_hits.fetch_add(1, Ordering::SeqCst);
return Ok(block.clone());
}
@@ -83,7 +78,7 @@ impl StoreReader {
self.cache
.lock()
.unwrap()
.put(checkpoint.start_offset, block.clone());
.put(checkpoint.byte_range.start, block.clone());

Ok(block)
}
@@ -100,7 +95,7 @@ impl StoreReader {
crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
})?;
let mut cursor = &self.read_block(&checkpoint)?[..];
for _ in checkpoint.start_doc..doc_id {
for _ in checkpoint.doc_range.start..doc_id {
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
cursor = &cursor[doc_length..];
}

@@ -74,7 +74,7 @@ impl StoreWriter {
}
assert_eq!(self.first_doc_in_block, self.doc);
let doc_shift = self.doc;
let start_shift = self.writer.written_bytes() as u64;
let start_shift = self.writer.written_bytes() as usize;

// just bulk write all of the block of the given reader.
self.writer
@@ -83,34 +83,32 @@ impl StoreWriter {
// concatenate the index of the `store_reader`, after translating
// its start doc id and its start file offset.
for mut checkpoint in store_reader.block_checkpoints() {
checkpoint.start_doc += doc_shift;
checkpoint.end_doc += doc_shift;
checkpoint.start_offset += start_shift;
checkpoint.end_offset += start_shift;
checkpoint.doc_range.start += doc_shift;
checkpoint.doc_range.end += doc_shift;
checkpoint.byte_range.start += start_shift;
checkpoint.byte_range.end += start_shift;
self.register_checkpoint(checkpoint);
}
Ok(())
}

fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
self.offset_index_writer.insert(checkpoint);
self.first_doc_in_block = checkpoint.end_doc;
self.doc = checkpoint.end_doc;
self.offset_index_writer.insert(checkpoint.clone());
self.first_doc_in_block = checkpoint.doc_range.end;
self.doc = checkpoint.doc_range.end;
}

fn write_and_compress_block(&mut self) -> io::Result<()> {
assert!(self.doc > 0);
self.intermediary_buffer.clear();
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
let start_offset = self.writer.written_bytes();
let start_offset = self.writer.written_bytes() as usize;
self.writer.write_all(&self.intermediary_buffer)?;
let end_offset = self.writer.written_bytes();
let end_offset = self.writer.written_bytes() as usize;
let end_doc = self.doc;
self.register_checkpoint(Checkpoint {
start_doc: self.first_doc_in_block,
end_doc,
start_offset,
end_offset,
doc_range: self.first_doc_in_block..end_doc,
byte_range: start_offset..end_offset,
});
self.current_block.clear();
Ok(())

@@ -68,18 +68,17 @@ impl TermInfoBlockMeta {
let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;

let postings_start_offset = self.ref_term_info.postings_start_offset
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits);
let postings_stop_offset = self.ref_term_info.postings_start_offset
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits);
let postings_start_offset = self.ref_term_info.postings_range.start
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits) as usize;
let postings_end_offset = self.ref_term_info.postings_range.start
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits) as usize;
let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32;
let positions_idx = self.ref_term_info.positions_idx
+ extract_bits(data, positions_idx_addr, self.positions_idx_nbits);

TermInfo {
doc_freq,
postings_start_offset,
postings_stop_offset,
postings_range: postings_start_offset..postings_end_offset,
positions_idx,
}
}
@@ -163,7 +162,7 @@ fn bitpack_serialize<W: Write>(
term_info: &TermInfo,
) -> io::Result<()> {
bit_packer.write(
term_info.postings_start_offset,
term_info.postings_range.start as u64,
term_info_block_meta.postings_offset_nbits,
write,
)?;
@@ -200,15 +199,15 @@ impl TermInfoStoreWriter {
} else {
return Ok(());
};
let postings_stop_offset =
last_term_info.postings_stop_offset - ref_term_info.postings_start_offset;
let postings_end_offset =
last_term_info.postings_range.end - ref_term_info.postings_range.start;
for term_info in &mut self.term_infos[1..] {
term_info.postings_start_offset -= ref_term_info.postings_start_offset;
term_info.postings_range.start -= ref_term_info.postings_range.start;
term_info.positions_idx -= ref_term_info.positions_idx;
}

let mut max_doc_freq: u32 = 0u32;
let max_postings_offset: u64 = postings_stop_offset;
let max_postings_offset: usize = postings_end_offset;
let max_positions_idx: u64 = last_term_info.positions_idx;

for term_info in &self.term_infos[1..] {
@@ -216,7 +215,7 @@ impl TermInfoStoreWriter {
}

let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
let max_postings_offset_nbits = compute_num_bits(max_postings_offset);
let max_postings_offset_nbits = compute_num_bits(max_postings_offset as u64);
let max_positions_idx_nbits = compute_num_bits(max_positions_idx);

let term_info_block_meta = TermInfoBlockMeta {
@@ -238,7 +237,7 @@ impl TermInfoStoreWriter {
}

bit_packer.write(
postings_stop_offset,
postings_end_offset as u64,
term_info_block_meta.postings_offset_nbits,
&mut self.buffer_term_infos,
)?;
@@ -251,7 +250,6 @@ impl TermInfoStoreWriter {
}

pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
assert!(term_info.postings_stop_offset >= term_info.postings_start_offset);
self.num_terms += 1u64;
self.term_infos.push(term_info.clone());
if self.term_infos.len() >= BLOCK_LEN {
@@ -314,8 +312,7 @@ mod tests {
offset: 2009u64,
ref_term_info: TermInfo {
doc_freq: 512,
postings_start_offset: 51,
postings_stop_offset: 57u64,
postings_range: 51..57,
positions_idx: 3584,
},
doc_freq_nbits: 10,
@@ -333,12 +330,11 @@ mod tests {
fn test_pack() -> crate::Result<()> {
let mut store_writer = TermInfoStoreWriter::new();
let mut term_infos = vec![];
let offset = |i| (i * 13 + i * i) as u64;
for i in 0..1000 {
let offset = |i| (i * 13 + i * i);
for i in 0usize..1000usize {
let term_info = TermInfo {
doc_freq: i as u32,
postings_start_offset: offset(i),
postings_stop_offset: offset(i + 1),
postings_range: offset(i)..offset(i + 1),
positions_idx: (i * 7) as u64,
};
store_writer.write_term_info(&term_info)?;

@@ -9,12 +9,11 @@ use std::str;
const BLOCK_SIZE: usize = 1_500;

fn make_term_info(term_ord: u64) -> TermInfo {
let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize;
TermInfo {
doc_freq: term_ord as u32,
postings_start_offset: offset(term_ord),
postings_stop_offset: offset(term_ord + 1),
positions_idx: offset(term_ord) * 2u64,
postings_range: offset(term_ord)..offset(term_ord + 1),
positions_idx: offset(term_ord) as u64 * 2u64,
}
}