Replacing (start, end) by Range

Paul Masurel
2021-03-10 14:06:21 +09:00
parent 316d65d7c6
commit 31137beea6
21 changed files with 218 additions and 295 deletions
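The pattern is the same across all 21 files: a pair of offsets `(start, stop)` becomes a `std::ops::Range`, which carries the same two bounds while adding `.len()`, direct slice indexing, and an unambiguous `[start..end)` reading. Since `Range` is not `Copy`, a few call sites below gain a `.clone()`. A minimal sketch of the before/after shape (hypothetical function names, not code from this commit):

use std::ops::Range;

// Before: the bounds travel as two separate values.
fn read_old(data: &[u8], start: usize, stop: usize) -> &[u8] {
    &data[start..stop]
}

// After: the same bounds travel as a single Range<usize>.
fn read_new(data: &[u8], byte_range: Range<usize>) -> &[u8] {
    &data[byte_range]
}

fn main() {
    let data = b"abcdef";
    assert_eq!(read_old(data, 1, 4), b"bcd");
    assert_eq!(read_new(data, 1..4), b"bcd");
    // A Range also answers length questions directly.
    assert_eq!((1..4).len(), 3);
}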

View File

@@ -69,12 +69,12 @@ fn highlight(snippet: Snippet) -> String {
let mut result = String::new();
let mut start_from = 0;
for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) {
result.push_str(&snippet.fragments()[start_from..start]);
for fragment_range in snippet.highlighted() {
result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
result.push_str(" --> ");
result.push_str(&snippet.fragments()[start..end]);
result.push_str(&snippet.fragments()[fragment_range.clone()]);
result.push_str(" <-- ");
start_from = end;
start_from = fragment_range.end;
}
result.push_str(&snippet.fragments()[start_from..]);

View File

@@ -8,6 +8,8 @@ use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range;
use super::HasLen;
@@ -105,7 +107,7 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
#[derive(Clone)]
pub struct CompositeFile {
data: FileSlice,
offsets_index: HashMap<FileAddr, (usize, usize)>,
offsets_index: HashMap<FileAddr, Range<usize>>,
}
impl CompositeFile {
@@ -117,7 +119,7 @@ impl CompositeFile {
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
let footer_start = end - 4 - footer_len;
let footer_data = data
.slice(footer_start, footer_start + footer_len)
.slice(footer_start..footer_start + footer_len)
.read_bytes()?;
let mut footer_buffer = footer_data.as_slice();
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
@@ -138,7 +140,7 @@ impl CompositeFile {
let file_addr = file_addrs[i];
let start_offset = offsets[i];
let end_offset = offsets[i + 1];
field_index.insert(file_addr, (start_offset, end_offset));
field_index.insert(file_addr, start_offset..end_offset);
}
Ok(CompositeFile {
@@ -167,16 +169,16 @@ impl CompositeFile {
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<FileSlice> {
self.offsets_index
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
.map(|byte_range| self.data.slice(byte_range.clone()))
}
pub fn space_usage(&self) -> PerFieldSpaceUsage {
let mut fields = HashMap::new();
for (&field_addr, &(start, end)) in self.offsets_index.iter() {
for (&field_addr, byte_range) in &self.offsets_index {
fields
.entry(field_addr.field)
.or_insert_with(|| FieldUsage::empty(field_addr.field))
.add_field_idx(field_addr.idx, end - start);
.add_field_idx(field_addr.idx, byte_range.len());
}
PerFieldSpaceUsage::new(fields)
}

View File

@@ -90,9 +90,9 @@ impl InvertedIndexReader {
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) -> io::Result<()> {
let start_offset = term_info.postings_start_offset as usize;
let stop_offset = term_info.postings_stop_offset as usize;
let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset);
let postings_slice = self
.postings_file_slice
.slice(term_info.postings_range.clone());
block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
Ok(())
}
@@ -120,10 +120,9 @@ impl InvertedIndexReader {
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> io::Result<BlockSegmentPostings> {
let postings_data = self.postings_file_slice.slice(
term_info.postings_start_offset as usize,
term_info.postings_stop_offset as usize,
);
let postings_data = self
.postings_file_slice
.slice(term_info.postings_range.clone());
BlockSegmentPostings::open(
term_info.doc_freq,
postings_data,

View File

@@ -2,6 +2,7 @@ use stable_deref_trait::StableDeref;
use crate::common::HasLen;
use crate::directory::OwnedBytes;
use std::ops::Range;
use std::sync::{Arc, Weak};
use std::{io, ops::Deref};
@@ -20,19 +21,19 @@ pub trait FileHandle: 'static + Send + Sync + HasLen {
/// Reads a slice of bytes.
///
/// This method may panic if the range requested is invalid.
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes>;
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes>;
}
impl FileHandle for &'static [u8] {
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
let bytes = &self[from..to];
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
let bytes = &self[range];
Ok(OwnedBytes::new(bytes))
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.as_ref().len()
self.deref().len()
}
}
@@ -52,8 +53,7 @@ where
#[derive(Clone)]
pub struct FileSlice {
data: Arc<dyn FileHandle>,
start: usize,
stop: usize,
range: Range<usize>,
}
impl FileSlice {
@@ -68,8 +68,7 @@ impl FileSlice {
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
FileSlice {
data: Arc::from(file_handle),
start: 0,
stop: num_bytes,
range: 0..num_bytes,
}
}
@@ -77,14 +76,12 @@ impl FileSlice {
///
/// # Panics
///
/// Panics if `to < from` or if `to` exceeds the filesize.
pub fn slice(&self, from: usize, to: usize) -> FileSlice {
assert!(to <= self.len());
assert!(to >= from);
/// Panics if `byte_range.end` exceeds the filesize.
pub fn slice(&self, byte_range: Range<usize>) -> FileSlice {
assert!(byte_range.end <= self.len());
FileSlice {
data: self.data.clone(),
start: self.start + from,
stop: self.start + to,
range: self.range.start + byte_range.start..self.range.start + byte_range.end,
}
}
@@ -101,19 +98,21 @@ impl FileSlice {
/// In particular, it is up to the `Directory` implementation
/// to handle caching if needed.
pub fn read_bytes(&self) -> io::Result<OwnedBytes> {
self.data.read_bytes(self.start, self.stop)
self.data.read_bytes(self.range.clone())
}
/// Reads a specific slice of data.
///
/// This is equivalent to running `file_slice.slice(range).read_bytes()`.
pub fn read_bytes_slice(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
assert!(from <= to);
pub fn read_bytes_slice(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
assert!(
self.start + to <= self.stop,
"`to` exceeds the fileslice length"
range.end <= self.len(),
"end of requested range exceeds the fileslice length ({} > {})",
range.end,
self.len()
);
self.data.read_bytes(self.start + from, self.start + to)
self.data
.read_bytes(self.range.start + range.start..self.range.start + range.end)
}
/// Splits the FileSlice at the given offset and return two file slices.
@@ -138,7 +137,7 @@ impl FileSlice {
///
/// Equivalent to `.slice(from_offset..self.len())`
pub fn slice_from(&self, from_offset: usize) -> FileSlice {
self.slice(from_offset, self.len())
self.slice(from_offset..self.len())
}
/// Like `.slice(...)` but enforcing only the `to`
@@ -146,19 +145,19 @@ impl FileSlice {
///
/// Equivalent to `.slice(0..to_offset)`
pub fn slice_to(&self, to_offset: usize) -> FileSlice {
self.slice(0, to_offset)
self.slice(0..to_offset)
}
}
impl FileHandle for FileSlice {
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
self.read_bytes_slice(from, to)
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
self.read_bytes_slice(range)
}
}
impl HasLen for FileSlice {
fn len(&self) -> usize {
self.stop - self.start
self.range.len()
}
}
@@ -217,30 +216,23 @@ mod tests {
let slice = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(slice.len(), 6);
assert_eq!(slice.read_bytes()?.as_ref(), b"abcdef");
assert_eq!(slice.slice(1, 4).read_bytes()?.as_ref(), b"bcd");
assert_eq!(slice.slice(1..4).read_bytes()?.as_ref(), b"bcd");
Ok(())
}
#[test]
fn test_slice_read_slice() -> io::Result<()> {
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(slice_deref.read_bytes_slice(1, 4)?.as_ref(), b"bcd");
assert_eq!(slice_deref.read_bytes_slice(1..4)?.as_ref(), b"bcd");
Ok(())
}
#[test]
#[should_panic(expected = "assertion failed: from <= to")]
fn test_slice_read_slice_invalid_range() {
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(slice_deref.read_bytes_slice(1, 0).unwrap().as_ref(), b"bcd");
}
#[test]
#[should_panic(expected = "`to` exceeds the fileslice length")]
#[should_panic(expected = "end of requested range exceeds the fileslice length (10 > 6)")]
fn test_slice_read_slice_invalid_range_exceeds() {
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
assert_eq!(
slice_deref.read_bytes_slice(0, 10).unwrap().as_ref(),
slice_deref.read_bytes_slice(0..10).unwrap().as_ref(),
b"bcd"
);
}
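Worth noting in the new `slice` above: the requested `byte_range` is relative to the current slice, so it is re-based onto the parent's absolute range (`self.range.start + byte_range.start..self.range.start + byte_range.end`), which is what lets slices of slices compose. A standalone sketch of that arithmetic with a hypothetical `rebase` helper (not the real `FileSlice`):

use std::ops::Range;

// Mirrors the offset arithmetic in FileSlice::slice: `child` is expressed
// relative to `parent`, and the result is an absolute byte range.
fn rebase(parent: &Range<usize>, child: Range<usize>) -> Range<usize> {
    assert!(child.end <= parent.len(), "child range exceeds parent length");
    parent.start + child.start..parent.start + child.end
}

fn main() {
    let whole = 0..6;               // e.g. the bytes of b"abcdef"
    let bcd = rebase(&whole, 1..4); // "bcd" sits at absolute bytes 1..4
    assert_eq!(bcd, 1..4);
    let cd = rebase(&bcd, 1..3);    // slicing the slice re-bases again
    assert_eq!(cd, 2..4);           // "cd" at absolute bytes 2..4
}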

View File

@@ -2,7 +2,7 @@ use crate::directory::FileHandle;
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::Deref;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
@@ -17,8 +17,8 @@ pub struct OwnedBytes {
}
impl FileHandle for OwnedBytes {
fn read_bytes(&self, from: usize, to: usize) -> io::Result<OwnedBytes> {
Ok(self.slice(from, to))
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
Ok(self.slice(range))
}
}
@@ -42,9 +42,9 @@ impl OwnedBytes {
}
/// Creates an `OwnedBytes` that is just a view over a slice of the data.
pub fn slice(&self, from: usize, to: usize) -> Self {
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[from..to],
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
}
}

View File

@@ -1,3 +1,5 @@
use std::ops::Range;
use crate::fastfield::{FastFieldReader, FastValue};
use crate::DocId;
@@ -28,24 +30,24 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
/// Returns the range of value positions `start..stop`
/// associated with the given document.
fn range(&self, doc: DocId) -> (u64, u64) {
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
(start, stop)
start..stop
}
/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(start, &mut vals[..]);
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
}
/// Returns the number of values associated with the document `DocId`.
pub fn num_vals(&self, doc: DocId) -> usize {
let (start, stop) = self.range(doc);
(stop - start) as usize
let range = self.range(doc);
(range.end - range.start) as usize
}
/// Returns the overall number of values in this field.

View File

@@ -125,21 +125,18 @@ impl MultiValuedFastFieldWriter {
1,
)?;
let last_interval = (
self.doc_index.last().cloned().unwrap(),
self.vals.len() as u64,
);
let last_interval =
self.doc_index.last().cloned().unwrap() as usize..self.vals.len();
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
for (start, stop) in self
for range in self
.doc_index
.windows(2)
.map(|interval| (interval[0], interval[1]))
.map(|interval| interval[0] as usize..interval[1] as usize)
.chain(Some(last_interval).into_iter())
.map(|(start, stop)| (start as usize, stop as usize))
{
doc_vals.clear();
let remapped_vals = self.vals[start..stop]
let remapped_vals = self.vals[range]
.iter()
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
doc_vals.extend(remapped_vals);

View File

@@ -1,3 +1,5 @@
use std::ops::Range;
use crate::postings::compression::AlignedBuffer;
/// This module defines the logic used to search for a doc in a given
@@ -72,7 +74,7 @@ fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
@@ -80,17 +82,17 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
break;
}
if arr[pivot] > target {
return (begin, pivot);
return begin..pivot;
}
begin = pivot;
}
(begin, end)
begin..end
}
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(&block_docs, target);
start + linear_search(&block_docs[start..end], target)
let range = exponential_search(&block_docs, target);
range.start + linear_search(&block_docs[range], target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
@@ -182,11 +184,11 @@ mod tests {
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(exponential_search(&[1, 2], 0), 0..1);
assert_eq!(exponential_search(&[1, 2], 1), 0..1);
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
3..7
);
}

View File

@@ -16,7 +16,7 @@ use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
use std::marker::PhantomData;
use std::ops::DerefMut;
use std::ops::{DerefMut, Range};
fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
match *field_entry.field_type() {
@@ -52,7 +52,7 @@ pub struct MultiFieldPostingsWriter {
fn make_field_partition(
term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, usize, usize)> {
) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
.map(|(key, _, _)| Term::wrap(key).field())
@@ -70,7 +70,7 @@ fn make_field_partition(
offsets.push(term_offsets.len());
let mut field_offsets = vec![];
for i in 0..fields.len() {
field_offsets.push((fields[i], offsets[i], offsets[i + 1]));
field_offsets.push((fields[i], offsets[i]..offsets[i + 1]));
}
field_offsets
}
@@ -138,14 +138,14 @@ impl MultiFieldPostingsWriter {
let field_offsets = make_field_partition(&term_offsets);
for (field, start, stop) in field_offsets {
for (field, byte_offsets) in field_offsets {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Str(_) | FieldType::HierarchicalFacet => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets[start..stop]
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
@@ -169,7 +169,7 @@ impl MultiFieldPostingsWriter {
fieldnorm_reader,
)?;
postings_writer.serialize(
&term_offsets[start..stop],
&term_offsets[byte_offsets],
&mut field_serializer,
&self.term_index.heap,
&self.heap,
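For context on `make_field_partition` above: `term_offsets` is sorted so that all entries belonging to one field are contiguous, and the function now hands each field a `Range<usize>` of indices into that sorted slice instead of a `(start, stop)` pair. A reduced, hypothetical illustration of partitioning a sorted sequence into per-key ranges:

use std::ops::Range;

// Partition a slice sorted by key into one contiguous index range per key.
fn partition_by_key(keys: &[u32]) -> Vec<(u32, Range<usize>)> {
    let mut partitions = Vec::new();
    let mut start = 0;
    for i in 1..=keys.len() {
        if i == keys.len() || keys[i] != keys[start] {
            partitions.push((keys[start], start..i));
            start = i;
        }
    }
    partitions
}

fn main() {
    // Three "fields" (1, 2 and 5), each owning a contiguous range of entries.
    assert_eq!(
        partition_by_key(&[1, 1, 2, 2, 2, 5]),
        vec![(1, 0..2), (2, 2..5), (5, 5..6)]
    );
}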

View File

@@ -183,10 +183,10 @@ impl<'a> FieldSerializer<'a> {
} else {
0u64
};
let addr = self.postings_serializer.addr() as usize;
TermInfo {
doc_freq: 0,
postings_start_offset: self.postings_serializer.addr(),
postings_stop_offset: 0u64,
postings_range: addr..addr,
positions_idx,
}
}
@@ -242,7 +242,7 @@ impl<'a> FieldSerializer<'a> {
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
self.current_term_info.postings_stop_offset = self.postings_serializer.addr();
self.current_term_info.postings_range.end = self.postings_serializer.addr() as usize;
self.term_dictionary_builder
.insert_value(&self.current_term_info)?;
self.term_open = false;

View File

@@ -17,10 +17,6 @@ pub fn compute_table_size(num_bits: usize) -> usize {
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone)]
struct KeyValue {
key_value_addr: Addr,

View File

@@ -1,25 +1,24 @@
use crate::common::{BinarySerializable, FixedSize};
use std::io;
use std::iter::ExactSizeIterator;
use std::ops::Range;
/// `TermInfo` wraps the metadata associated to a Term.
/// It is segment-local.
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
#[derive(Debug, Default, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Start offset of the posting list within the postings (`.idx`) file.
pub postings_start_offset: u64,
/// Stop offset of the posting list within the postings (`.idx`) file.
/// The byte range is `[start_offset..stop_offset)`.
pub postings_stop_offset: u64,
/// Byte range of the posting list within the postings (`.idx`) file.
pub postings_range: Range<usize>,
/// Start offset of the first block within the position (`.pos`) file.
pub positions_idx: u64,
}
impl TermInfo {
pub(crate) fn posting_num_bytes(&self) -> u32 {
let num_bytes = self.postings_stop_offset - self.postings_start_offset;
assert!(num_bytes <= std::u32::MAX as u64);
let num_bytes = self.postings_range.len();
assert!(num_bytes <= std::u32::MAX as usize);
num_bytes as u32
}
}
@@ -35,7 +34,7 @@ impl FixedSize for TermInfo {
impl BinarySerializable for TermInfo {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?;
self.postings_start_offset.serialize(writer)?;
(self.postings_range.start as u64).serialize(writer)?;
self.posting_num_bytes().serialize(writer)?;
self.positions_idx.serialize(writer)?;
Ok(())
@@ -43,14 +42,13 @@ impl BinarySerializable for TermInfo {
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = u32::deserialize(reader)?;
let postings_start_offset = u64::deserialize(reader)?;
let postings_start_offset = u64::deserialize(reader)? as usize;
let postings_num_bytes = u32::deserialize(reader)?;
let postings_stop_offset = postings_start_offset + u64::from(postings_num_bytes);
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes) as usize;
let positions_idx = u64::deserialize(reader)?;
Ok(TermInfo {
doc_freq,
postings_start_offset,
postings_stop_offset,
postings_range: postings_start_offset..postings_end_offset,
positions_idx,
})
}
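One point the new `serialize`/`deserialize` pair makes implicitly: the on-disk layout does not change. The byte range is still persisted as a start offset (`u64`) followed by a byte count (`u32`), and the `Range<usize>` only exists in memory. A simplified round-trip sketch (plain little-endian integers and hypothetical helpers, not tantivy's `BinarySerializable` format):

use std::io::{self, Read, Write};
use std::ops::Range;

// Persist the range as (start: u64, num_bytes: u32), the same shape TermInfo uses.
fn write_range<W: Write>(range: &Range<usize>, writer: &mut W) -> io::Result<()> {
    writer.write_all(&(range.start as u64).to_le_bytes())?;
    writer.write_all(&(range.len() as u32).to_le_bytes())
}

// Read the two integers back and rebuild the Range<usize>.
fn read_range<R: Read>(reader: &mut R) -> io::Result<Range<usize>> {
    let mut start = [0u8; 8];
    let mut num_bytes = [0u8; 4];
    reader.read_exact(&mut start)?;
    reader.read_exact(&mut num_bytes)?;
    let start = u64::from_le_bytes(start) as usize;
    Ok(start..start + u32::from_le_bytes(num_bytes) as usize)
}

fn main() -> io::Result<()> {
    let mut buffer = Vec::new();
    write_range(&(51..57), &mut buffer)?;
    assert_eq!(read_range(&mut buffer.as_slice())?, 51..57);
    Ok(())
}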

View File

@@ -8,33 +8,17 @@ use htmlescape::encode_minimal;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::ops::Range;
const DEFAULT_MAX_NUM_CHARS: usize = 150;
#[derive(Debug)]
pub struct HighlightSection {
start: usize,
stop: usize,
}
impl HighlightSection {
fn new(start: usize, stop: usize) -> HighlightSection {
HighlightSection { start, stop }
}
/// Returns the bounds of the `HighlightSection`.
pub fn bounds(&self) -> (usize, usize) {
(self.start, self.stop)
}
}
#[derive(Debug)]
pub struct FragmentCandidate {
score: Score,
start_offset: usize,
stop_offset: usize,
num_chars: usize,
highlighted: Vec<HighlightSection>,
highlighted: Vec<Range<usize>>,
}
impl FragmentCandidate {
@@ -63,8 +47,7 @@ impl FragmentCandidate {
if let Some(&score) = terms.get(&token.text.to_lowercase()) {
self.score += score;
self.highlighted
.push(HighlightSection::new(token.offset_from, token.offset_to));
self.highlighted.push(token.offset_from..token.offset_to);
}
}
}
@@ -74,7 +57,7 @@ impl FragmentCandidate {
#[derive(Debug)]
pub struct Snippet {
fragments: String,
highlighted: Vec<HighlightSection>,
highlighted: Vec<Range<usize>>,
}
const HIGHLIGHTEN_PREFIX: &str = "<b>";
@@ -97,9 +80,9 @@ impl Snippet {
for item in self.highlighted.iter() {
html.push_str(&encode_minimal(&self.fragments[start_from..item.start]));
html.push_str(HIGHLIGHTEN_PREFIX);
html.push_str(&encode_minimal(&self.fragments[item.start..item.stop]));
html.push_str(&encode_minimal(&self.fragments[item.clone()]));
html.push_str(HIGHLIGHTEN_POSTFIX);
start_from = item.stop;
start_from = item.end;
}
html.push_str(&encode_minimal(
&self.fragments[start_from..self.fragments.len()],
@@ -113,7 +96,7 @@ impl Snippet {
}
/// Returns a list of highlighted positions from the `Snippet`.
pub fn highlighted(&self) -> &[HighlightSection] {
pub fn highlighted(&self) -> &[Range<usize>] {
&self.highlighted
}
}
@@ -185,12 +168,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
let highlighted = fragment
.highlighted
.iter()
.map(|item| {
HighlightSection::new(
item.start - fragment.start_offset,
item.stop - fragment.start_offset,
)
})
.map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
.collect();
Snippet {
fragments: fragment_text.to_string(),

View File

@@ -2,6 +2,7 @@ use crate::common::VInt;
use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD};
use crate::DocId;
use std::io;
use std::ops::Range;
/// Represents a block of checkpoints.
///
@@ -24,19 +25,19 @@ impl Default for CheckpointBlock {
impl CheckpointBlock {
/// If non-empty, returns the semi-open doc range `start_doc..end_doc`
/// covered by the overall block.
pub fn doc_interval(&self) -> Option<(DocId, DocId)> {
pub fn doc_interval(&self) -> Option<Range<DocId>> {
let start_doc_opt = self
.checkpoints
.first()
.cloned()
.map(|checkpoint| checkpoint.start_doc);
.map(|checkpoint| checkpoint.doc_range.start);
let end_doc_opt = self
.checkpoints
.last()
.cloned()
.map(|checkpoint| checkpoint.end_doc);
.map(|checkpoint| checkpoint.doc_range.end);
match (start_doc_opt, end_doc_opt) {
(Some(start_doc), Some(end_doc)) => Some((start_doc, end_doc)),
(Some(start_doc), Some(end_doc)) => Some(start_doc..end_doc),
_ => None,
}
}
@@ -55,7 +56,7 @@ impl CheckpointBlock {
}
pub fn get(&self, idx: usize) -> Checkpoint {
self.checkpoints[idx]
self.checkpoints[idx].clone()
}
pub fn clear(&mut self) {
@@ -67,12 +68,13 @@ impl CheckpointBlock {
if self.checkpoints.is_empty() {
return;
}
VInt(self.checkpoints[0].start_doc as u64).serialize_into_vec(buffer);
VInt(self.checkpoints[0].start_offset as u64).serialize_into_vec(buffer);
VInt(self.checkpoints[0].doc_range.start as u64).serialize_into_vec(buffer);
VInt(self.checkpoints[0].byte_range.start as u64).serialize_into_vec(buffer);
for checkpoint in &self.checkpoints {
let delta_doc = checkpoint.end_doc - checkpoint.start_doc;
let delta_doc = checkpoint.doc_range.end - checkpoint.doc_range.start;
VInt(delta_doc as u64).serialize_into_vec(buffer);
VInt(checkpoint.end_offset - checkpoint.start_offset).serialize_into_vec(buffer);
VInt((checkpoint.byte_range.end - checkpoint.byte_range.start) as u64)
.serialize_into_vec(buffer);
}
}
@@ -86,15 +88,13 @@ impl CheckpointBlock {
return Ok(());
}
let mut doc = VInt::deserialize_u64(data)? as DocId;
let mut start_offset = VInt::deserialize_u64(data)?;
let mut start_offset = VInt::deserialize_u64(data)? as usize;
for _ in 0..len {
let num_docs = VInt::deserialize_u64(data)? as DocId;
let block_num_bytes = VInt::deserialize_u64(data)?;
let block_num_bytes = VInt::deserialize_u64(data)? as usize;
self.checkpoints.push(Checkpoint {
start_doc: doc,
end_doc: doc + num_docs,
start_offset,
end_offset: start_offset + block_num_bytes,
doc_range: doc..doc + num_docs,
byte_range: start_offset..start_offset + block_num_bytes,
});
doc += num_docs;
start_offset += block_num_bytes;
@@ -112,17 +112,15 @@ mod tests {
fn test_aux_ser_deser(checkpoints: &[Checkpoint]) -> io::Result<()> {
let mut block = CheckpointBlock::default();
for &checkpoint in checkpoints {
block.push(checkpoint);
for checkpoint in checkpoints {
block.push(checkpoint.clone());
}
let mut buffer = Vec::new();
block.serialize(&mut buffer);
let mut block_deser = CheckpointBlock::default();
let checkpoint = Checkpoint {
start_doc: 0,
end_doc: 1,
start_offset: 2,
end_offset: 3,
doc_range: 0..1,
byte_range: 2..3,
};
block_deser.push(checkpoint); // < check that value is erased before deser
let mut data = &buffer[..];
@@ -140,26 +138,22 @@ mod tests {
#[test]
fn test_block_serialize_simple() -> io::Result<()> {
let checkpoints = vec![Checkpoint {
start_doc: 10,
end_doc: 12,
start_offset: 100,
end_offset: 120,
doc_range: 10..12,
byte_range: 100..120,
}];
test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize() -> io::Result<()> {
let offsets: Vec<u64> = (0..11).map(|i| i * i * i).collect();
let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();
let mut checkpoints = vec![];
let mut start_doc = 0;
for i in 0..10 {
let end_doc = (i * i) as DocId;
checkpoints.push(Checkpoint {
start_doc,
end_doc,
start_offset: offsets[i],
end_offset: offsets[i + 1],
doc_range: start_doc..end_doc,
byte_range: offsets[i]..offsets[i + 1],
});
start_doc = end_doc;
}
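The block stays delta-encoded after the change: `serialize` writes the first doc id and first byte offset once, then a `(num_docs, num_bytes)` pair per checkpoint, and `deserialize` rebuilds the absolute `doc_range`/`byte_range` by accumulating those deltas. A self-contained sketch of the scheme (plain `u64`s instead of `VInt`, and a simplified stand-in for `Checkpoint`):

use std::ops::Range;

// Simplified stand-in for Checkpoint: a doc range plus a byte range.
#[derive(Clone, Debug, PartialEq)]
struct Cp {
    docs: Range<u32>,
    bytes: Range<usize>,
}

// Delta-encode: both starts once, then (num_docs, num_bytes) per checkpoint.
fn encode(cps: &[Cp]) -> Vec<u64> {
    let mut out = vec![cps[0].docs.start as u64, cps[0].bytes.start as u64];
    for cp in cps {
        out.push((cp.docs.end - cp.docs.start) as u64);
        out.push(cp.bytes.len() as u64);
    }
    out
}

// Decode by accumulating the deltas back into absolute ranges.
fn decode(data: &[u64], len: usize) -> Vec<Cp> {
    let (mut doc, mut offset) = (data[0] as u32, data[1] as usize);
    let mut cps = Vec::new();
    for i in 0..len {
        let num_docs = data[2 + 2 * i] as u32;
        let num_bytes = data[3 + 2 * i] as usize;
        cps.push(Cp {
            docs: doc..doc + num_docs,
            bytes: offset..offset + num_bytes,
        });
        doc += num_docs;
        offset += num_bytes;
    }
    cps
}

fn main() {
    let checkpoints = vec![
        Cp { docs: 10..12, bytes: 100..120 },
        Cp { docs: 12..15, bytes: 120..160 },
    ];
    assert_eq!(decode(&encode(&checkpoints), checkpoints.len()), checkpoints);
}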

View File

@@ -1,6 +1,7 @@
const CHECKPOINT_PERIOD: usize = 8;
use std::fmt;
use std::ops::Range;
mod block;
mod skip_index;
mod skip_index_builder;
@@ -15,30 +16,24 @@ pub use self::skip_index_builder::SkipIndexBuilder;
/// of checkpoints.
///
/// All of the intervals here defined are semi-open.
/// The checkpoint describes that the block within the bytes
/// `[start_offset..end_offset)` spans over the docs
/// `[start_doc..end_doc)`.
#[derive(Clone, Copy, Eq, PartialEq)]
/// The checkpoint describes that the block within `byte_range`
/// spans over the docs in `doc_range`.
#[derive(Clone, Eq, PartialEq)]
pub struct Checkpoint {
pub start_doc: DocId,
pub end_doc: DocId,
pub start_offset: u64,
pub end_offset: u64,
pub doc_range: Range<DocId>,
pub byte_range: Range<usize>,
}
impl Checkpoint {
pub(crate) fn follows(&self, other: &Checkpoint) -> bool {
(self.start_doc == other.end_doc) && (self.start_offset == other.end_offset)
(self.doc_range.start == other.doc_range.end)
&& (self.byte_range.start == other.byte_range.end)
}
}
impl fmt::Debug for Checkpoint {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"(doc=[{}..{}), bytes=[{}..{}))",
self.start_doc, self.end_doc, self.start_offset, self.end_offset
)
write!(f, "(doc={:?}, bytes={:?})", self.doc_range, self.byte_range)
}
}
@@ -74,12 +69,10 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let mut skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
let checkpoint = Checkpoint {
start_doc: 0,
end_doc: 2,
start_offset: 0,
end_offset: 3,
doc_range: 0..2,
byte_range: 0..3,
};
skip_index_builder.insert(checkpoint);
skip_index_builder.insert(checkpoint.clone());
skip_index_builder.write(&mut output)?;
let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
let mut skip_cursor = skip_index.checkpoints();
@@ -93,40 +86,30 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let checkpoints = vec![
Checkpoint {
start_doc: 0,
end_doc: 3,
start_offset: 0,
end_offset: 9,
doc_range: 0..3,
byte_range: 0..9,
},
Checkpoint {
start_doc: 3,
end_doc: 4,
start_offset: 9,
end_offset: 25,
doc_range: 3..4,
byte_range: 9..25,
},
Checkpoint {
start_doc: 4,
end_doc: 6,
start_offset: 25,
end_offset: 49,
doc_range: 4..6,
byte_range: 25..49,
},
Checkpoint {
start_doc: 6,
end_doc: 8,
start_offset: 49,
end_offset: 81,
doc_range: 6..8,
byte_range: 49..81,
},
Checkpoint {
start_doc: 8,
end_doc: 10,
start_offset: 81,
end_offset: 100,
doc_range: 8..10,
byte_range: 81..100,
},
];
let mut skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
for &checkpoint in &checkpoints {
skip_index_builder.insert(checkpoint);
for checkpoint in &checkpoints {
skip_index_builder.insert(checkpoint.clone());
}
skip_index_builder.write(&mut output)?;
@@ -138,8 +121,8 @@ mod tests {
Ok(())
}
fn offset_test(doc: DocId) -> u64 {
(doc as u64) * (doc as u64)
fn offset_test(doc: DocId) -> usize {
(doc as usize) * (doc as usize)
}
#[test]
@@ -181,15 +164,13 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let checkpoints: Vec<Checkpoint> = (0..1000)
.map(|i| Checkpoint {
start_doc: i,
end_doc: i + 1,
start_offset: offset_test(i),
end_offset: offset_test(i + 1),
doc_range: i..(i + 1),
byte_range: offset_test(i)..offset_test(i + 1),
})
.collect();
let mut skip_index_builder = SkipIndexBuilder::new();
for checkpoint in &checkpoints {
skip_index_builder.insert(*checkpoint);
skip_index_builder.insert(checkpoint.clone());
}
skip_index_builder.write(&mut output)?;
assert_eq!(output.len(), 4035);
@@ -200,10 +181,10 @@ mod tests {
Ok(())
}
fn integrate_delta(vals: Vec<u64>) -> Vec<u64> {
fn integrate_delta(vals: Vec<usize>) -> Vec<usize> {
let mut output = Vec::with_capacity(vals.len() + 1);
output.push(0u64);
let mut prev = 0u64;
output.push(0);
let mut prev = 0;
for val in vals {
let new_val = val + prev;
prev = new_val;
@@ -217,16 +198,14 @@ mod tests {
(0..max_len)
.prop_flat_map(move |len: usize| {
(
proptest::collection::vec(1u64..20u64, len as usize).prop_map(integrate_delta),
proptest::collection::vec(1u64..26u64, len as usize).prop_map(integrate_delta),
proptest::collection::vec(1usize..20, len as usize).prop_map(integrate_delta),
proptest::collection::vec(1usize..26, len as usize).prop_map(integrate_delta),
)
.prop_map(|(docs, offsets)| {
(0..docs.len() - 1)
.map(move |i| Checkpoint {
start_doc: docs[i] as DocId,
end_doc: docs[i + 1] as DocId,
start_offset: offsets[i],
end_offset: offsets[i + 1],
doc_range: docs[i] as DocId..docs[i + 1] as DocId,
byte_range: offsets[i]..offsets[i + 1],
})
.collect::<Vec<Checkpoint>>()
})
@@ -240,17 +219,17 @@ mod tests {
) -> Option<Checkpoint> {
checkpoints
.into_iter()
.filter(|checkpoint| checkpoint.end_doc > target)
.filter(|checkpoint| checkpoint.doc_range.end > target)
.next()
}
fn test_skip_index_aux(skip_index: SkipIndex, checkpoints: &[Checkpoint]) {
if let Some(last_checkpoint) = checkpoints.last() {
for doc in 0u32..last_checkpoint.end_doc {
for doc in 0u32..last_checkpoint.doc_range.end {
let expected = seek_manual(skip_index.checkpoints(), doc);
assert_eq!(expected, skip_index.seek(doc), "Doc {}", doc);
}
assert!(skip_index.seek(last_checkpoint.end_doc).is_none());
assert!(skip_index.seek(last_checkpoint.doc_range.end).is_none());
}
}

View File

@@ -36,21 +36,21 @@ struct Layer {
impl Layer {
fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.cursor_at_offset(0u64)
self.cursor_at_offset(0)
}
fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
fn cursor_at_offset(&self, start_offset: usize) -> impl Iterator<Item = Checkpoint> + '_ {
let data = &self.data.as_slice();
LayerCursor {
remaining: &data[start_offset as usize..],
remaining: &data[start_offset..],
block: CheckpointBlock::default(),
cursor: 0,
}
}
fn seek_start_at_offset(&self, target: DocId, offset: u64) -> Option<Checkpoint> {
fn seek_start_at_offset(&self, target: DocId, offset: usize) -> Option<Checkpoint> {
self.cursor_at_offset(offset)
.find(|checkpoint| checkpoint.end_doc > target)
.find(|checkpoint| checkpoint.doc_range.end > target)
}
}
@@ -69,7 +69,7 @@ impl SkipIndex {
let mut layers = Vec::new();
for end_offset in offsets {
let layer = Layer {
data: data.slice(start_offset as usize, end_offset as usize),
data: data.slice(start_offset as usize..end_offset as usize),
};
layers.push(layer);
start_offset = end_offset;
@@ -88,17 +88,15 @@ impl SkipIndex {
let first_layer_len = self
.layers
.first()
.map(|layer| layer.data.len() as u64)
.unwrap_or(0u64);
.map(|layer| layer.data.len())
.unwrap_or(0);
let mut cur_checkpoint = Checkpoint {
start_doc: 0u32,
end_doc: 1u32,
start_offset: 0u64,
end_offset: first_layer_len,
doc_range: 0u32..1u32,
byte_range: 0..first_layer_len,
};
for layer in &self.layers {
if let Some(checkpoint) =
layer.seek_start_at_offset(target, cur_checkpoint.start_offset)
layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start)
{
cur_checkpoint = checkpoint;
} else {

View File

@@ -28,16 +28,14 @@ impl LayerBuilder {
///
/// If the block was empty to begin with, simply return None.
fn flush_block(&mut self) -> Option<Checkpoint> {
if let Some((start_doc, end_doc)) = self.block.doc_interval() {
let start_offset = self.buffer.len() as u64;
if let Some(doc_range) = self.block.doc_interval() {
let start_offset = self.buffer.len();
self.block.serialize(&mut self.buffer);
let end_offset = self.buffer.len() as u64;
let end_offset = self.buffer.len();
self.block.clear();
Some(Checkpoint {
start_doc,
end_doc,
start_offset,
end_offset,
doc_range,
byte_range: start_offset..end_offset,
})
} else {
None

View File

@@ -17,7 +17,7 @@ const LRU_CACHE_CAPACITY: usize = 100;
type Block = Arc<Vec<u8>>;
type BlockCache = Arc<Mutex<LruCache<u64, Block>>>;
type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;
/// Reads documents off tantivy's [`Store`](./index.html)
pub struct StoreReader {
@@ -59,16 +59,11 @@ impl StoreReader {
}
fn compressed_block(&self, checkpoint: &Checkpoint) -> io::Result<OwnedBytes> {
self.data
.slice(
checkpoint.start_offset as usize,
checkpoint.end_offset as usize,
)
.read_bytes()
self.data.slice(checkpoint.byte_range.clone()).read_bytes()
}
fn read_block(&self, checkpoint: &Checkpoint) -> io::Result<Block> {
if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.start_offset) {
if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.byte_range.start) {
self.cache_hits.fetch_add(1, Ordering::SeqCst);
return Ok(block.clone());
}
@@ -83,7 +78,7 @@ impl StoreReader {
self.cache
.lock()
.unwrap()
.put(checkpoint.start_offset, block.clone());
.put(checkpoint.byte_range.start, block.clone());
Ok(block)
}
@@ -100,7 +95,7 @@ impl StoreReader {
crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
})?;
let mut cursor = &self.read_block(&checkpoint)?[..];
for _ in checkpoint.start_doc..doc_id {
for _ in checkpoint.doc_range.start..doc_id {
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
cursor = &cursor[doc_length..];
}

View File

@@ -74,7 +74,7 @@ impl StoreWriter {
}
assert_eq!(self.first_doc_in_block, self.doc);
let doc_shift = self.doc;
let start_shift = self.writer.written_bytes() as u64;
let start_shift = self.writer.written_bytes() as usize;
// just bulk-write all of the blocks of the given reader.
self.writer
@@ -83,34 +83,32 @@ impl StoreWriter {
// concatenate the index of the `store_reader`, after translating
// its start doc id and its start file offset.
for mut checkpoint in store_reader.block_checkpoints() {
checkpoint.start_doc += doc_shift;
checkpoint.end_doc += doc_shift;
checkpoint.start_offset += start_shift;
checkpoint.end_offset += start_shift;
checkpoint.doc_range.start += doc_shift;
checkpoint.doc_range.end += doc_shift;
checkpoint.byte_range.start += start_shift;
checkpoint.byte_range.end += start_shift;
self.register_checkpoint(checkpoint);
}
Ok(())
}
fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
self.offset_index_writer.insert(checkpoint);
self.first_doc_in_block = checkpoint.end_doc;
self.doc = checkpoint.end_doc;
self.offset_index_writer.insert(checkpoint.clone());
self.first_doc_in_block = checkpoint.doc_range.end;
self.doc = checkpoint.doc_range.end;
}
fn write_and_compress_block(&mut self) -> io::Result<()> {
assert!(self.doc > 0);
self.intermediary_buffer.clear();
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
let start_offset = self.writer.written_bytes();
let start_offset = self.writer.written_bytes() as usize;
self.writer.write_all(&self.intermediary_buffer)?;
let end_offset = self.writer.written_bytes();
let end_offset = self.writer.written_bytes() as usize;
let end_doc = self.doc;
self.register_checkpoint(Checkpoint {
start_doc: self.first_doc_in_block,
end_doc,
start_offset,
end_offset,
doc_range: self.first_doc_in_block..end_doc,
byte_range: start_offset..end_offset,
});
self.current_block.clear();
Ok(())

View File

@@ -68,18 +68,17 @@ impl TermInfoBlockMeta {
let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;
let postings_start_offset = self.ref_term_info.postings_start_offset
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits);
let postings_stop_offset = self.ref_term_info.postings_start_offset
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits);
let postings_start_offset = self.ref_term_info.postings_range.start
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits) as usize;
let postings_end_offset = self.ref_term_info.postings_range.start
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits) as usize;
let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32;
let positions_idx = self.ref_term_info.positions_idx
+ extract_bits(data, positions_idx_addr, self.positions_idx_nbits);
TermInfo {
doc_freq,
postings_start_offset,
postings_stop_offset,
postings_range: postings_start_offset..postings_end_offset,
positions_idx,
}
}
@@ -163,7 +162,7 @@ fn bitpack_serialize<W: Write>(
term_info: &TermInfo,
) -> io::Result<()> {
bit_packer.write(
term_info.postings_start_offset,
term_info.postings_range.start as u64,
term_info_block_meta.postings_offset_nbits,
write,
)?;
@@ -200,15 +199,15 @@ impl TermInfoStoreWriter {
} else {
return Ok(());
};
let postings_stop_offset =
last_term_info.postings_stop_offset - ref_term_info.postings_start_offset;
let postings_end_offset =
last_term_info.postings_range.end - ref_term_info.postings_range.start;
for term_info in &mut self.term_infos[1..] {
term_info.postings_start_offset -= ref_term_info.postings_start_offset;
term_info.postings_range.start -= ref_term_info.postings_range.start;
term_info.positions_idx -= ref_term_info.positions_idx;
}
let mut max_doc_freq: u32 = 0u32;
let max_postings_offset: u64 = postings_stop_offset;
let max_postings_offset: usize = postings_end_offset;
let max_positions_idx: u64 = last_term_info.positions_idx;
for term_info in &self.term_infos[1..] {
@@ -216,7 +215,7 @@ impl TermInfoStoreWriter {
}
let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
let max_postings_offset_nbits = compute_num_bits(max_postings_offset);
let max_postings_offset_nbits = compute_num_bits(max_postings_offset as u64);
let max_positions_idx_nbits = compute_num_bits(max_positions_idx);
let term_info_block_meta = TermInfoBlockMeta {
@@ -238,7 +237,7 @@ impl TermInfoStoreWriter {
}
bit_packer.write(
postings_stop_offset,
postings_end_offset as u64,
term_info_block_meta.postings_offset_nbits,
&mut self.buffer_term_infos,
)?;
@@ -251,7 +250,6 @@ impl TermInfoStoreWriter {
}
pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
assert!(term_info.postings_stop_offset >= term_info.postings_start_offset);
self.num_terms += 1u64;
self.term_infos.push(term_info.clone());
if self.term_infos.len() >= BLOCK_LEN {
@@ -314,8 +312,7 @@ mod tests {
offset: 2009u64,
ref_term_info: TermInfo {
doc_freq: 512,
postings_start_offset: 51,
postings_stop_offset: 57u64,
postings_range: 51..57,
positions_idx: 3584,
},
doc_freq_nbits: 10,
@@ -333,12 +330,11 @@ mod tests {
fn test_pack() -> crate::Result<()> {
let mut store_writer = TermInfoStoreWriter::new();
let mut term_infos = vec![];
let offset = |i| (i * 13 + i * i) as u64;
for i in 0..1000 {
let offset = |i| (i * 13 + i * i);
for i in 0usize..1000usize {
let term_info = TermInfo {
doc_freq: i as u32,
postings_start_offset: offset(i),
postings_stop_offset: offset(i + 1),
postings_range: offset(i)..offset(i + 1),
positions_idx: (i * 7) as u64,
};
store_writer.write_term_info(&term_info)?;

View File

@@ -9,12 +9,11 @@ use std::str;
const BLOCK_SIZE: usize = 1_500;
fn make_term_info(term_ord: u64) -> TermInfo {
let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize;
TermInfo {
doc_freq: term_ord as u32,
postings_start_offset: offset(term_ord),
postings_stop_offset: offset(term_ord + 1),
positions_idx: offset(term_ord) * 2u64,
postings_range: offset(term_ord)..offset(term_ord + 1),
positions_idx: offset(term_ord) as u64 * 2u64,
}
}