diff --git a/examples/snippet.rs b/examples/snippet.rs index eab64056a..67bc27a79 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -69,12 +69,12 @@ fn highlight(snippet: Snippet) -> String { let mut result = String::new(); let mut start_from = 0; - for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) { - result.push_str(&snippet.fragments()[start_from..start]); + for fragment_range in snippet.highlighted() { + result.push_str(&snippet.fragments()[start_from..fragment_range.start]); result.push_str(" --> "); - result.push_str(&snippet.fragments()[start..end]); + result.push_str(&snippet.fragments()[fragment_range.clone()]); result.push_str(" <-- "); - start_from = end; + start_from = fragment_range.end; } result.push_str(&snippet.fragments()[start_from..]); diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 598274315..52c188291 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -8,6 +8,8 @@ use crate::space_usage::FieldUsage; use crate::space_usage::PerFieldSpaceUsage; use std::collections::HashMap; use std::io::{self, Read, Write}; +use std::iter::ExactSizeIterator; +use std::ops::Range; use super::HasLen; @@ -105,7 +107,7 @@ impl CompositeWrite { #[derive(Clone)] pub struct CompositeFile { data: FileSlice, - offsets_index: HashMap, + offsets_index: HashMap>, } impl CompositeFile { @@ -117,7 +119,7 @@ impl CompositeFile { let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? 
as usize; let footer_start = end - 4 - footer_len; let footer_data = data - .slice(footer_start, footer_start + footer_len) + .slice(footer_start..footer_start + footer_len) .read_bytes()?; let mut footer_buffer = footer_data.as_slice(); let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; @@ -138,7 +140,7 @@ impl CompositeFile { let file_addr = file_addrs[i]; let start_offset = offsets[i]; let end_offset = offsets[i + 1]; - field_index.insert(file_addr, (start_offset, end_offset)); + field_index.insert(file_addr, start_offset..end_offset); } Ok(CompositeFile { @@ -167,16 +169,16 @@ impl CompositeFile { pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option { self.offsets_index .get(&FileAddr { field, idx }) - .map(|&(from, to)| self.data.slice(from, to)) + .map(|byte_range| self.data.slice(byte_range.clone())) } pub fn space_usage(&self) -> PerFieldSpaceUsage { let mut fields = HashMap::new(); - for (&field_addr, &(start, end)) in self.offsets_index.iter() { + for (&field_addr, byte_range) in &self.offsets_index { fields .entry(field_addr.field) .or_insert_with(|| FieldUsage::empty(field_addr.field)) - .add_field_idx(field_addr.idx, end - start); + .add_field_idx(field_addr.idx, byte_range.len()); } PerFieldSpaceUsage::new(fields) } diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 2f4edf76d..c7e5db710 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -90,9 +90,9 @@ impl InvertedIndexReader { term_info: &TermInfo, block_postings: &mut BlockSegmentPostings, ) -> io::Result<()> { - let start_offset = term_info.postings_start_offset as usize; - let stop_offset = term_info.postings_stop_offset as usize; - let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset); + let postings_slice = self + .postings_file_slice + .slice(term_info.postings_range.clone()); block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?); Ok(()) } @@ 
-120,10 +120,9 @@ impl InvertedIndexReader { term_info: &TermInfo, requested_option: IndexRecordOption, ) -> io::Result { - let postings_data = self.postings_file_slice.slice( - term_info.postings_start_offset as usize, - term_info.postings_stop_offset as usize, - ); + let postings_data = self + .postings_file_slice + .slice(term_info.postings_range.clone()); BlockSegmentPostings::open( term_info.doc_freq, postings_data, diff --git a/src/directory/file_slice.rs b/src/directory/file_slice.rs index fac8f86cb..2d725104a 100644 --- a/src/directory/file_slice.rs +++ b/src/directory/file_slice.rs @@ -2,6 +2,7 @@ use stable_deref_trait::StableDeref; use crate::common::HasLen; use crate::directory::OwnedBytes; +use std::ops::Range; use std::sync::{Arc, Weak}; use std::{io, ops::Deref}; @@ -20,19 +21,19 @@ pub trait FileHandle: 'static + Send + Sync + HasLen { /// Reads a slice of bytes. /// /// This method may panic if the range requested is invalid. - fn read_bytes(&self, from: usize, to: usize) -> io::Result; + fn read_bytes(&self, range: Range) -> io::Result; } impl FileHandle for &'static [u8] { - fn read_bytes(&self, from: usize, to: usize) -> io::Result { - let bytes = &self[from..to]; + fn read_bytes(&self, range: Range) -> io::Result { + let bytes = &self[range]; Ok(OwnedBytes::new(bytes)) } } impl> HasLen for T { fn len(&self) -> usize { - self.as_ref().len() + self.deref().len() } } @@ -52,8 +53,7 @@ where #[derive(Clone)] pub struct FileSlice { data: Arc, - start: usize, - stop: usize, + range: Range, } impl FileSlice { @@ -68,8 +68,7 @@ impl FileSlice { pub fn new_with_num_bytes(file_handle: Box, num_bytes: usize) -> Self { FileSlice { data: Arc::from(file_handle), - start: 0, - stop: num_bytes, + range: 0..num_bytes, } } @@ -77,14 +76,12 @@ impl FileSlice { /// /// # Panics /// - /// Panics if `to < from` or if `to` exceeds the filesize. 
- pub fn slice(&self, from: usize, to: usize) -> FileSlice { - assert!(to <= self.len()); - assert!(to >= from); + /// Panics if `byte_range.end` exceeds the filesize. + pub fn slice(&self, byte_range: Range) -> FileSlice { + assert!(byte_range.end <= self.len()); FileSlice { data: self.data.clone(), - start: self.start + from, - stop: self.start + to, + range: self.range.start + byte_range.start..self.range.start + byte_range.end, } } @@ -101,19 +98,21 @@ impl FileSlice { /// In particular, it is up to the `Directory` implementation /// to handle caching if needed. pub fn read_bytes(&self) -> io::Result { - self.data.read_bytes(self.start, self.stop) + self.data.read_bytes(self.range.clone()) } /// Reads a specific slice of data. /// /// This is equivalent to running `file_slice.slice(from, to).read_bytes()`. - pub fn read_bytes_slice(&self, from: usize, to: usize) -> io::Result { - assert!(from <= to); + pub fn read_bytes_slice(&self, range: Range) -> io::Result { assert!( - self.start + to <= self.stop, - "`to` exceeds the fileslice length" + range.end <= self.len(), + "end of requested range exceeds the fileslice length ({} > {})", + range.end, + self.len() ); - self.data.read_bytes(self.start + from, self.start + to) + self.data + .read_bytes(self.range.start + range.start..self.range.start + range.end) } /// Splits the FileSlice at the given offset and return two file slices. 
@@ -138,7 +137,7 @@ impl FileSlice { /// /// Equivalent to `.slice(from_offset, self.len())` pub fn slice_from(&self, from_offset: usize) -> FileSlice { - self.slice(from_offset, self.len()) + self.slice(from_offset..self.len()) } /// Like `.slice(...)` but enforcing only the `to` @@ -146,19 +145,19 @@ impl FileSlice { /// /// Equivalent to `.slice(0, to_offset)` pub fn slice_to(&self, to_offset: usize) -> FileSlice { - self.slice(0, to_offset) + self.slice(0..to_offset) } } impl FileHandle for FileSlice { - fn read_bytes(&self, from: usize, to: usize) -> io::Result { - self.read_bytes_slice(from, to) + fn read_bytes(&self, range: Range) -> io::Result { + self.read_bytes_slice(range) } } impl HasLen for FileSlice { fn len(&self) -> usize { - self.stop - self.start + self.range.len() } } @@ -217,30 +216,23 @@ mod tests { let slice = FileSlice::new(Box::new(&b"abcdef"[..])); assert_eq!(slice.len(), 6); assert_eq!(slice.read_bytes()?.as_ref(), b"abcdef"); - assert_eq!(slice.slice(1, 4).read_bytes()?.as_ref(), b"bcd"); + assert_eq!(slice.slice(1..4).read_bytes()?.as_ref(), b"bcd"); Ok(()) } #[test] fn test_slice_read_slice() -> io::Result<()> { let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..])); - assert_eq!(slice_deref.read_bytes_slice(1, 4)?.as_ref(), b"bcd"); + assert_eq!(slice_deref.read_bytes_slice(1..4)?.as_ref(), b"bcd"); Ok(()) } #[test] - #[should_panic(expected = "assertion failed: from <= to")] - fn test_slice_read_slice_invalid_range() { - let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..])); - assert_eq!(slice_deref.read_bytes_slice(1, 0).unwrap().as_ref(), b"bcd"); - } - - #[test] - #[should_panic(expected = "`to` exceeds the fileslice length")] + #[should_panic(expected = "end of requested range exceeds the fileslice length (10 > 6)")] fn test_slice_read_slice_invalid_range_exceeds() { let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..])); assert_eq!( - slice_deref.read_bytes_slice(0, 10).unwrap().as_ref(), + 
slice_deref.read_bytes_slice(0..10).unwrap().as_ref(), b"bcd" ); } diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs index 73303f50c..33d2487a5 100644 --- a/src/directory/owned_bytes.rs +++ b/src/directory/owned_bytes.rs @@ -2,7 +2,7 @@ use crate::directory::FileHandle; use stable_deref_trait::StableDeref; use std::convert::TryInto; use std::mem; -use std::ops::Deref; +use std::ops::{Deref, Range}; use std::sync::Arc; use std::{fmt, io}; @@ -17,8 +17,8 @@ pub struct OwnedBytes { } impl FileHandle for OwnedBytes { - fn read_bytes(&self, from: usize, to: usize) -> io::Result { - Ok(self.slice(from, to)) + fn read_bytes(&self, range: Range) -> io::Result { + Ok(self.slice(range)) } } @@ -42,9 +42,9 @@ impl OwnedBytes { } /// creates a fileslice that is just a view over a slice of the data. - pub fn slice(&self, from: usize, to: usize) -> Self { + pub fn slice(&self, range: Range) -> Self { OwnedBytes { - data: &self.data[from..to], + data: &self.data[range], box_stable_deref: self.box_stable_deref.clone(), } } diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index ac0d7775d..fd4d95ae8 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -1,3 +1,5 @@ +use std::ops::Range; + use crate::fastfield::{FastFieldReader, FastValue}; use crate::DocId; @@ -28,24 +30,24 @@ impl MultiValuedFastFieldReader { /// Returns `(start, stop)`, such that the values associated /// to the given document are `start..stop`. - fn range(&self, doc: DocId) -> (u64, u64) { + fn range(&self, doc: DocId) -> Range { let start = self.idx_reader.get(doc); let stop = self.idx_reader.get(doc + 1); - (start, stop) + start..stop } /// Returns the array of values associated to the given `doc`. 
pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { - let (start, stop) = self.range(doc); - let len = (stop - start) as usize; + let range = self.range(doc); + let len = (range.end - range.start) as usize; vals.resize(len, Item::make_zero()); - self.vals_reader.get_range_u64(start, &mut vals[..]); + self.vals_reader.get_range_u64(range.start, &mut vals[..]); } /// Returns the number of values associated with the document `DocId`. pub fn num_vals(&self, doc: DocId) -> usize { - let (start, stop) = self.range(doc); - (stop - start) as usize + let range = self.range(doc); + (range.end - range.start) as usize } /// Returns the overall number of values in this field . diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 9caf116ed..e6e78a1ed 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -125,21 +125,18 @@ impl MultiValuedFastFieldWriter { 1, )?; - let last_interval = ( - self.doc_index.last().cloned().unwrap(), - self.vals.len() as u64, - ); + let last_interval = + self.doc_index.last().cloned().unwrap() as usize..self.vals.len(); let mut doc_vals: Vec = Vec::with_capacity(100); - for (start, stop) in self + for range in self .doc_index .windows(2) - .map(|interval| (interval[0], interval[1])) + .map(|interval| interval[0] as usize..interval[1] as usize) .chain(Some(last_interval).into_iter()) - .map(|(start, stop)| (start as usize, stop as usize)) { doc_vals.clear(); - let remapped_vals = self.vals[start..stop] + let remapped_vals = self.vals[range] .iter() .map(|val| *mapping.get(val).expect("Missing term ordinal")); doc_vals.extend(remapped_vals); diff --git a/src/postings/block_search.rs b/src/postings/block_search.rs index 08cd55379..1171f9262 100644 --- a/src/postings/block_search.rs +++ b/src/postings/block_search.rs @@ -1,3 +1,5 @@ +use std::ops::Range; + use crate::postings::compression::AlignedBuffer; /// This modules define the logic used to search for a doc in a given @@ 
-72,7 +74,7 @@ fn linear_search(arr: &[u32], target: u32) -> usize { arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum() } -fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) { +fn exponential_search(arr: &[u32], target: u32) -> Range { let end = arr.len(); let mut begin = 0; for &pivot in &[1, 3, 7, 15, 31, 63] { @@ -80,17 +82,17 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) { break; } if arr[pivot] > target { - return (begin, pivot); + return begin..pivot; } begin = pivot; } - (begin, end) + begin..end } #[inline(never)] fn galloping(block_docs: &[u32], target: u32) -> usize { - let (start, end) = exponential_search(&block_docs, target); - start + linear_search(&block_docs[start..end], target) + let range = exponential_search(&block_docs, target); + range.start + linear_search(&block_docs[range], target) } /// Tantivy may rely on SIMD instructions to search for a specific document within @@ -182,11 +184,11 @@ mod tests { #[test] fn test_exponentiel_search() { - assert_eq!(exponential_search(&[1, 2], 0), (0, 1)); - assert_eq!(exponential_search(&[1, 2], 1), (0, 1)); + assert_eq!(exponential_search(&[1, 2], 0), 0..1); + assert_eq!(exponential_search(&[1, 2], 1), 0..1); assert_eq!( exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7), - (3, 7) + 3..7 ); } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 49dde8237..681d9cb74 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -16,7 +16,7 @@ use fnv::FnvHashMap; use std::collections::HashMap; use std::io; use std::marker::PhantomData; -use std::ops::DerefMut; +use std::ops::{DerefMut, Range}; fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { match *field_entry.field_type() { @@ -52,7 +52,7 @@ pub struct MultiFieldPostingsWriter { fn make_field_partition( term_offsets: &[(&[u8], Addr, UnorderedTermId)], -) -> Vec<(Field, usize, usize)> { +) -> Vec<(Field, Range)> { let term_offsets_it 
= term_offsets .iter() .map(|(key, _, _)| Term::wrap(key).field()) @@ -70,7 +70,7 @@ fn make_field_partition( offsets.push(term_offsets.len()); let mut field_offsets = vec![]; for i in 0..fields.len() { - field_offsets.push((fields[i], offsets[i], offsets[i + 1])); + field_offsets.push((fields[i], offsets[i]..offsets[i + 1])); } field_offsets } @@ -138,14 +138,14 @@ impl MultiFieldPostingsWriter { let field_offsets = make_field_partition(&term_offsets); - for (field, start, stop) in field_offsets { + for (field, term_range) in field_offsets { let field_entry = self.schema.get_field_entry(field); match *field_entry.field_type() { FieldType::Str(_) | FieldType::HierarchicalFacet => { // populating the (unordered term ord) -> (ordered term ord) mapping // for the field. - let unordered_term_ids = term_offsets[start..stop] + let unordered_term_ids = term_offsets[term_range.clone()] .iter() .map(|&(_, _, bucket)| bucket); let mapping: FnvHashMap = unordered_term_ids @@ -169,7 +169,7 @@ impl MultiFieldPostingsWriter { fieldnorm_reader, )?; postings_writer.serialize( - &term_offsets[start..stop], + &term_offsets[term_range], &mut field_serializer, &self.term_index.heap, &self.heap, diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 9e8d9c39a..b326a017f 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -183,10 +183,10 @@ impl<'a> FieldSerializer<'a> { } else { 0u64 }; + let addr = self.postings_serializer.addr() as usize; TermInfo { doc_freq: 0, - postings_start_offset: self.postings_serializer.addr(), - postings_stop_offset: 0u64, + postings_range: addr..addr, positions_idx, } } @@ -242,7 +242,7 @@ impl<'a> FieldSerializer<'a> { if self.term_open { self.postings_serializer .close_term(self.current_term_info.doc_freq)?; - self.current_term_info.postings_stop_offset = self.postings_serializer.addr() as usize; + self.current_term_info.postings_range.end = self.postings_serializer.addr() as usize; self.term_dictionary_builder 
.insert_value(&self.current_term_info)?; self.term_open = false; diff --git a/src/postings/stacker/term_hashmap.rs b/src/postings/stacker/term_hashmap.rs index 6ae135c25..4533f2e2f 100644 --- a/src/postings/stacker/term_hashmap.rs +++ b/src/postings/stacker/term_hashmap.rs @@ -17,10 +17,6 @@ pub fn compute_table_size(num_bits: usize) -> usize { /// `KeyValue` is the item stored in the hash table. /// The key is actually a `BytesRef` object stored in an external heap. /// The `value_addr` also points to an address in the heap. -/// -/// The key and the value are actually stored contiguously. -/// For this reason, the (start, stop) information is actually redundant -/// and can be simplified in the future #[derive(Copy, Clone)] struct KeyValue { key_value_addr: Addr, diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 4e08f2e9f..2aad4e45d 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,25 +1,24 @@ use crate::common::{BinarySerializable, FixedSize}; use std::io; +use std::iter::ExactSizeIterator; +use std::ops::Range; /// `TermInfo` wraps the metadata associated to a Term. /// It is segment-local. -#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)] +#[derive(Debug, Default, Eq, PartialEq, Clone)] pub struct TermInfo { /// Number of documents in the segment containing the term pub doc_freq: u32, - /// Start offset of the posting list within the postings (`.idx`) file. - pub postings_start_offset: u64, - /// Stop offset of the posting list within the postings (`.idx`) file. - /// The byte range is `[start_offset..stop_offset)`. - pub postings_stop_offset: u64, + /// Byte range of the posting list within the postings (`.idx`) file. + pub postings_range: Range, /// Start offset of the first block within the position (`.pos`) file. 
pub positions_idx: u64, } impl TermInfo { pub(crate) fn posting_num_bytes(&self) -> u32 { - let num_bytes = self.postings_stop_offset - self.postings_start_offset; - assert!(num_bytes <= std::u32::MAX as u64); + let num_bytes = self.postings_range.len(); + assert!(num_bytes <= std::u32::MAX as usize); num_bytes as u32 } } @@ -35,7 +34,7 @@ impl FixedSize for TermInfo { impl BinarySerializable for TermInfo { fn serialize(&self, writer: &mut W) -> io::Result<()> { self.doc_freq.serialize(writer)?; - self.postings_start_offset.serialize(writer)?; + (self.postings_range.start as u64).serialize(writer)?; self.posting_num_bytes().serialize(writer)?; self.positions_idx.serialize(writer)?; Ok(()) @@ -43,14 +42,13 @@ impl BinarySerializable for TermInfo { fn deserialize(reader: &mut R) -> io::Result { let doc_freq = u32::deserialize(reader)?; - let postings_start_offset = u64::deserialize(reader)?; + let postings_start_offset = u64::deserialize(reader)? as usize; let postings_num_bytes = u32::deserialize(reader)?; - let postings_stop_offset = postings_start_offset + u64::from(postings_num_bytes); + let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes) as usize; let positions_idx = u64::deserialize(reader)?; Ok(TermInfo { doc_freq, - postings_start_offset, - postings_stop_offset, + postings_range: postings_start_offset..postings_end_offset, positions_idx, }) } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index f71de1d9d..a2df79488 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -8,33 +8,17 @@ use htmlescape::encode_minimal; use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::ops::Range; const DEFAULT_MAX_NUM_CHARS: usize = 150; -#[derive(Debug)] -pub struct HighlightSection { - start: usize, - stop: usize, -} - -impl HighlightSection { - fn new(start: usize, stop: usize) -> HighlightSection { - HighlightSection { start, stop } - } - - /// Returns the bounds of the `HighlightSection`. 
- pub fn bounds(&self) -> (usize, usize) { - (self.start, self.stop) - } -} - #[derive(Debug)] pub struct FragmentCandidate { score: Score, start_offset: usize, stop_offset: usize, num_chars: usize, - highlighted: Vec, + highlighted: Vec>, } impl FragmentCandidate { @@ -63,8 +47,7 @@ impl FragmentCandidate { if let Some(&score) = terms.get(&token.text.to_lowercase()) { self.score += score; - self.highlighted - .push(HighlightSection::new(token.offset_from, token.offset_to)); + self.highlighted.push(token.offset_from..token.offset_to); } } } @@ -74,7 +57,7 @@ impl FragmentCandidate { #[derive(Debug)] pub struct Snippet { fragments: String, - highlighted: Vec, + highlighted: Vec>, } const HIGHLIGHTEN_PREFIX: &str = ""; @@ -97,9 +80,9 @@ impl Snippet { for item in self.highlighted.iter() { html.push_str(&encode_minimal(&self.fragments[start_from..item.start])); html.push_str(HIGHLIGHTEN_PREFIX); - html.push_str(&encode_minimal(&self.fragments[item.start..item.stop])); + html.push_str(&encode_minimal(&self.fragments[item.clone()])); html.push_str(HIGHLIGHTEN_POSTFIX); - start_from = item.stop; + start_from = item.end; } html.push_str(&encode_minimal( &self.fragments[start_from..self.fragments.len()], @@ -113,7 +96,7 @@ impl Snippet { } /// Returns a list of higlighted positions from the `Snippet`. 
- pub fn highlighted(&self) -> &[HighlightSection] { + pub fn highlighted(&self) -> &[Range] { &self.highlighted } } @@ -185,12 +168,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) let highlighted = fragment .highlighted .iter() - .map(|item| { - HighlightSection::new( - item.start - fragment.start_offset, - item.stop - fragment.start_offset, - ) - }) + .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset) .collect(); Snippet { fragments: fragment_text.to_string(), diff --git a/src/store/index/block.rs b/src/store/index/block.rs index 33785748c..3b49905b5 100644 --- a/src/store/index/block.rs +++ b/src/store/index/block.rs @@ -2,6 +2,7 @@ use crate::common::VInt; use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD}; use crate::DocId; use std::io; +use std::ops::Range; /// Represents a block of checkpoints. /// @@ -24,19 +25,19 @@ impl Default for CheckpointBlock { impl CheckpointBlock { /// If non-empty returns [start_doc, end_doc) /// for the overall block. 
- pub fn doc_interval(&self) -> Option<(DocId, DocId)> { + pub fn doc_interval(&self) -> Option> { let start_doc_opt = self .checkpoints .first() .cloned() - .map(|checkpoint| checkpoint.start_doc); + .map(|checkpoint| checkpoint.doc_range.start); let end_doc_opt = self .checkpoints .last() .cloned() - .map(|checkpoint| checkpoint.end_doc); + .map(|checkpoint| checkpoint.doc_range.end); match (start_doc_opt, end_doc_opt) { - (Some(start_doc), Some(end_doc)) => Some((start_doc, end_doc)), + (Some(start_doc), Some(end_doc)) => Some(start_doc..end_doc), _ => None, } } @@ -55,7 +56,7 @@ impl CheckpointBlock { } pub fn get(&self, idx: usize) -> Checkpoint { - self.checkpoints[idx] + self.checkpoints[idx].clone() } pub fn clear(&mut self) { @@ -67,12 +68,13 @@ impl CheckpointBlock { if self.checkpoints.is_empty() { return; } - VInt(self.checkpoints[0].start_doc as u64).serialize_into_vec(buffer); - VInt(self.checkpoints[0].start_offset as u64).serialize_into_vec(buffer); + VInt(self.checkpoints[0].doc_range.start as u64).serialize_into_vec(buffer); + VInt(self.checkpoints[0].byte_range.start as u64).serialize_into_vec(buffer); for checkpoint in &self.checkpoints { - let delta_doc = checkpoint.end_doc - checkpoint.start_doc; + let delta_doc = checkpoint.doc_range.end - checkpoint.doc_range.start; VInt(delta_doc as u64).serialize_into_vec(buffer); - VInt(checkpoint.end_offset - checkpoint.start_offset).serialize_into_vec(buffer); + VInt((checkpoint.byte_range.end - checkpoint.byte_range.start) as u64) + .serialize_into_vec(buffer); } } @@ -86,15 +88,13 @@ impl CheckpointBlock { return Ok(()); } let mut doc = VInt::deserialize_u64(data)? as DocId; - let mut start_offset = VInt::deserialize_u64(data)?; + let mut start_offset = VInt::deserialize_u64(data)? as usize; for _ in 0..len { let num_docs = VInt::deserialize_u64(data)? as DocId; - let block_num_bytes = VInt::deserialize_u64(data)?; + let block_num_bytes = VInt::deserialize_u64(data)? 
as usize; self.checkpoints.push(Checkpoint { - start_doc: doc, - end_doc: doc + num_docs, - start_offset, - end_offset: start_offset + block_num_bytes, + doc_range: doc..doc + num_docs, + byte_range: start_offset..start_offset + block_num_bytes, }); doc += num_docs; start_offset += block_num_bytes; @@ -112,17 +112,15 @@ mod tests { fn test_aux_ser_deser(checkpoints: &[Checkpoint]) -> io::Result<()> { let mut block = CheckpointBlock::default(); - for &checkpoint in checkpoints { - block.push(checkpoint); + for checkpoint in checkpoints { + block.push(checkpoint.clone()); } let mut buffer = Vec::new(); block.serialize(&mut buffer); let mut block_deser = CheckpointBlock::default(); let checkpoint = Checkpoint { - start_doc: 0, - end_doc: 1, - start_offset: 2, - end_offset: 3, + doc_range: 0..1, + byte_range: 2..3, }; block_deser.push(checkpoint); // < check that value is erased before deser let mut data = &buffer[..]; @@ -140,26 +138,22 @@ mod tests { #[test] fn test_block_serialize_simple() -> io::Result<()> { let checkpoints = vec![Checkpoint { - start_doc: 10, - end_doc: 12, - start_offset: 100, - end_offset: 120, + doc_range: 10..12, + byte_range: 100..120, }]; test_aux_ser_deser(&checkpoints) } #[test] fn test_block_serialize() -> io::Result<()> { - let offsets: Vec = (0..11).map(|i| i * i * i).collect(); + let offsets: Vec = (0..11).map(|i| i * i * i).collect(); let mut checkpoints = vec![]; let mut start_doc = 0; for i in 0..10 { let end_doc = (i * i) as DocId; checkpoints.push(Checkpoint { - start_doc, - end_doc, - start_offset: offsets[i], - end_offset: offsets[i + 1], + doc_range: start_doc..end_doc, + byte_range: offsets[i]..offsets[i + 1], }); start_doc = end_doc; } diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index 4e9312876..cdf4167ff 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -1,6 +1,7 @@ const CHECKPOINT_PERIOD: usize = 8; use std::fmt; +use std::ops::Range; mod block; mod skip_index; mod skip_index_builder; @@ 
-15,30 +16,24 @@ pub use self::skip_index_builder::SkipIndexBuilder; /// of checkpoints. /// /// All of the intervals here defined are semi-open. -/// The checkpoint describes that the block within the bytes -/// `[start_offset..end_offset)` spans over the docs -/// `[start_doc..end_doc)`. -#[derive(Clone, Copy, Eq, PartialEq)] +/// The checkpoint describes that the block within the `byte_range` +/// spans over the `doc_range`. +#[derive(Clone, Eq, PartialEq)] pub struct Checkpoint { - pub start_doc: DocId, - pub end_doc: DocId, - pub start_offset: u64, - pub end_offset: u64, + pub doc_range: Range, + pub byte_range: Range, } impl Checkpoint { pub(crate) fn follows(&self, other: &Checkpoint) -> bool { - (self.start_doc == other.end_doc) && (self.start_offset == other.end_offset) + (self.doc_range.start == other.doc_range.end) + && (self.byte_range.start == other.byte_range.end) } } impl fmt::Debug for Checkpoint { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "(doc=[{}..{}), bytes=[{}..{}))", - self.start_doc, self.end_doc, self.start_offset, self.end_offset - ) + write!(f, "(doc={:?}, bytes={:?})", self.doc_range, self.byte_range) } } @@ -74,12 +69,10 @@ mod tests { let mut output: Vec = Vec::new(); let mut skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new(); let checkpoint = Checkpoint { - start_doc: 0, - end_doc: 2, - start_offset: 0, - end_offset: 3, + doc_range: 0..2, + byte_range: 0..3, }; - skip_index_builder.insert(checkpoint); + skip_index_builder.insert(checkpoint.clone()); skip_index_builder.write(&mut output)?; let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output)); let mut skip_cursor = skip_index.checkpoints(); @@ -93,40 +86,30 @@ mod tests { let mut output: Vec = Vec::new(); let checkpoints = vec![ Checkpoint { - start_doc: 0, - end_doc: 3, - start_offset: 0, - end_offset: 9, + doc_range: 0..3, + byte_range: 0..9, }, Checkpoint { - start_doc: 3, - end_doc: 4, - start_offset: 9, - end_offset: 25, + 
doc_range: 3..4, + byte_range: 9..25, }, Checkpoint { - start_doc: 4, - end_doc: 6, - start_offset: 25, - end_offset: 49, + doc_range: 4..6, + byte_range: 25..49, }, Checkpoint { - start_doc: 6, - end_doc: 8, - start_offset: 49, - end_offset: 81, + doc_range: 6..8, + byte_range: 49..81, }, Checkpoint { - start_doc: 8, - end_doc: 10, - start_offset: 81, - end_offset: 100, + doc_range: 8..10, + byte_range: 81..100, }, ]; let mut skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new(); - for &checkpoint in &checkpoints { - skip_index_builder.insert(checkpoint); + for checkpoint in &checkpoints { + skip_index_builder.insert(checkpoint.clone()); } skip_index_builder.write(&mut output)?; @@ -138,8 +121,8 @@ mod tests { Ok(()) } - fn offset_test(doc: DocId) -> u64 { - (doc as u64) * (doc as u64) + fn offset_test(doc: DocId) -> usize { + (doc as usize) * (doc as usize) } #[test] @@ -181,15 +164,13 @@ mod tests { let mut output: Vec = Vec::new(); let checkpoints: Vec = (0..1000) .map(|i| Checkpoint { - start_doc: i, - end_doc: i + 1, - start_offset: offset_test(i), - end_offset: offset_test(i + 1), + doc_range: i..(i + 1), + byte_range: offset_test(i)..offset_test(i + 1), }) .collect(); let mut skip_index_builder = SkipIndexBuilder::new(); for checkpoint in &checkpoints { - skip_index_builder.insert(*checkpoint); + skip_index_builder.insert(checkpoint.clone()); } skip_index_builder.write(&mut output)?; assert_eq!(output.len(), 4035); @@ -200,10 +181,10 @@ mod tests { Ok(()) } - fn integrate_delta(vals: Vec) -> Vec { + fn integrate_delta(vals: Vec) -> Vec { let mut output = Vec::with_capacity(vals.len() + 1); - output.push(0u64); - let mut prev = 0u64; + output.push(0); + let mut prev = 0; for val in vals { let new_val = val + prev; prev = new_val; @@ -217,16 +198,14 @@ mod tests { (0..max_len) .prop_flat_map(move |len: usize| { ( - proptest::collection::vec(1u64..20u64, len as usize).prop_map(integrate_delta), - proptest::collection::vec(1u64..26u64, len as 
usize).prop_map(integrate_delta), + proptest::collection::vec(1usize..20, len as usize).prop_map(integrate_delta), + proptest::collection::vec(1usize..26, len as usize).prop_map(integrate_delta), ) .prop_map(|(docs, offsets)| { (0..docs.len() - 1) .map(move |i| Checkpoint { - start_doc: docs[i] as DocId, - end_doc: docs[i + 1] as DocId, - start_offset: offsets[i], - end_offset: offsets[i + 1], + doc_range: docs[i] as DocId..docs[i + 1] as DocId, + byte_range: offsets[i]..offsets[i + 1], }) .collect::>() }) @@ -240,17 +219,17 @@ mod tests { ) -> Option { checkpoints .into_iter() - .filter(|checkpoint| checkpoint.end_doc > target) + .filter(|checkpoint| checkpoint.doc_range.end > target) .next() } fn test_skip_index_aux(skip_index: SkipIndex, checkpoints: &[Checkpoint]) { if let Some(last_checkpoint) = checkpoints.last() { - for doc in 0u32..last_checkpoint.end_doc { + for doc in 0u32..last_checkpoint.doc_range.end { let expected = seek_manual(skip_index.checkpoints(), doc); assert_eq!(expected, skip_index.seek(doc), "Doc {}", doc); } - assert!(skip_index.seek(last_checkpoint.end_doc).is_none()); + assert!(skip_index.seek(last_checkpoint.doc_range.end).is_none()); } } diff --git a/src/store/index/skip_index.rs b/src/store/index/skip_index.rs index f64dc5efd..306eb7ca1 100644 --- a/src/store/index/skip_index.rs +++ b/src/store/index/skip_index.rs @@ -36,21 +36,21 @@ struct Layer { impl Layer { fn cursor(&self) -> impl Iterator + '_ { - self.cursor_at_offset(0u64) + self.cursor_at_offset(0) } - fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator + '_ { + fn cursor_at_offset(&self, start_offset: usize) -> impl Iterator + '_ { let data = &self.data.as_slice(); LayerCursor { - remaining: &data[start_offset as usize..], + remaining: &data[start_offset..], block: CheckpointBlock::default(), cursor: 0, } } - fn seek_start_at_offset(&self, target: DocId, offset: u64) -> Option { + fn seek_start_at_offset(&self, target: DocId, offset: usize) -> Option { 
self.cursor_at_offset(offset) - .find(|checkpoint| checkpoint.end_doc > target) + .find(|checkpoint| checkpoint.doc_range.end > target) } } @@ -69,7 +69,7 @@ impl SkipIndex { let mut layers = Vec::new(); for end_offset in offsets { let layer = Layer { - data: data.slice(start_offset as usize, end_offset as usize), + data: data.slice(start_offset as usize..end_offset as usize), }; layers.push(layer); start_offset = end_offset; @@ -88,17 +88,15 @@ impl SkipIndex { let first_layer_len = self .layers .first() - .map(|layer| layer.data.len() as u64) - .unwrap_or(0u64); + .map(|layer| layer.data.len()) + .unwrap_or(0); let mut cur_checkpoint = Checkpoint { - start_doc: 0u32, - end_doc: 1u32, - start_offset: 0u64, - end_offset: first_layer_len, + doc_range: 0u32..1u32, + byte_range: 0..first_layer_len, }; for layer in &self.layers { if let Some(checkpoint) = - layer.seek_start_at_offset(target, cur_checkpoint.start_offset) + layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start) { cur_checkpoint = checkpoint; } else { diff --git a/src/store/index/skip_index_builder.rs b/src/store/index/skip_index_builder.rs index 6d46dabed..416f7bfa0 100644 --- a/src/store/index/skip_index_builder.rs +++ b/src/store/index/skip_index_builder.rs @@ -28,16 +28,14 @@ impl LayerBuilder { /// /// If the block was empty to begin with, simply return None. 
fn flush_block(&mut self) -> Option { - if let Some((start_doc, end_doc)) = self.block.doc_interval() { - let start_offset = self.buffer.len() as u64; + if let Some(doc_range) = self.block.doc_interval() { + let start_offset = self.buffer.len(); self.block.serialize(&mut self.buffer); - let end_offset = self.buffer.len() as u64; + let end_offset = self.buffer.len(); self.block.clear(); Some(Checkpoint { - start_doc, - end_doc, - start_offset, - end_offset, + doc_range, + byte_range: start_offset..end_offset, }) } else { None diff --git a/src/store/reader.rs b/src/store/reader.rs index adcf11430..0c7295305 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -17,7 +17,7 @@ const LRU_CACHE_CAPACITY: usize = 100; type Block = Arc>; -type BlockCache = Arc>>; +type BlockCache = Arc>>; /// Reads document off tantivy's [`Store`](./index.html) pub struct StoreReader { @@ -59,16 +59,11 @@ impl StoreReader { } fn compressed_block(&self, checkpoint: &Checkpoint) -> io::Result { - self.data - .slice( - checkpoint.start_offset as usize, - checkpoint.end_offset as usize, - ) - .read_bytes() + self.data.slice(checkpoint.byte_range.clone()).read_bytes() } fn read_block(&self, checkpoint: &Checkpoint) -> io::Result { - if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.start_offset) { + if let Some(block) = self.cache.lock().unwrap().get(&checkpoint.byte_range.start) { self.cache_hits.fetch_add(1, Ordering::SeqCst); return Ok(block.clone()); } @@ -83,7 +78,7 @@ impl StoreReader { self.cache .lock() .unwrap() - .put(checkpoint.start_offset, block.clone()); + .put(checkpoint.byte_range.start, block.clone()); Ok(block) } @@ -100,7 +95,7 @@ impl StoreReader { crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id)) })?; let mut cursor = &self.read_block(&checkpoint)?[..]; - for _ in checkpoint.start_doc..doc_id { + for _ in checkpoint.doc_range.start..doc_id { let doc_length = VInt::deserialize(&mut cursor)?.val() as usize; cursor = 
&cursor[doc_length..]; } diff --git a/src/store/writer.rs b/src/store/writer.rs index 3309f1a64..1728636e1 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -74,7 +74,7 @@ impl StoreWriter { } assert_eq!(self.first_doc_in_block, self.doc); let doc_shift = self.doc; - let start_shift = self.writer.written_bytes() as u64; + let start_shift = self.writer.written_bytes() as usize; // just bulk write all of the block of the given reader. self.writer @@ -83,34 +83,32 @@ impl StoreWriter { // concatenate the index of the `store_reader`, after translating // its start doc id and its start file offset. for mut checkpoint in store_reader.block_checkpoints() { - checkpoint.start_doc += doc_shift; - checkpoint.end_doc += doc_shift; - checkpoint.start_offset += start_shift; - checkpoint.end_offset += start_shift; + checkpoint.doc_range.start += doc_shift; + checkpoint.doc_range.end += doc_shift; + checkpoint.byte_range.start += start_shift; + checkpoint.byte_range.end += start_shift; self.register_checkpoint(checkpoint); } Ok(()) } fn register_checkpoint(&mut self, checkpoint: Checkpoint) { - self.offset_index_writer.insert(checkpoint); - self.first_doc_in_block = checkpoint.end_doc; - self.doc = checkpoint.end_doc; + self.offset_index_writer.insert(checkpoint.clone()); + self.first_doc_in_block = checkpoint.doc_range.end; + self.doc = checkpoint.doc_range.end; } fn write_and_compress_block(&mut self) -> io::Result<()> { assert!(self.doc > 0); self.intermediary_buffer.clear(); compress(&self.current_block[..], &mut self.intermediary_buffer)?; - let start_offset = self.writer.written_bytes(); + let start_offset = self.writer.written_bytes() as usize; self.writer.write_all(&self.intermediary_buffer)?; - let end_offset = self.writer.written_bytes(); + let end_offset = self.writer.written_bytes() as usize; let end_doc = self.doc; self.register_checkpoint(Checkpoint { - start_doc: self.first_doc_in_block, - end_doc, - start_offset, - end_offset, + doc_range: 
self.first_doc_in_block..end_doc, + byte_range: start_offset..end_offset, }); self.current_block.clear(); Ok(()) diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index 20b709a2f..73b69e7e4 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -68,18 +68,17 @@ impl TermInfoBlockMeta { let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize; let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize; - let postings_start_offset = self.ref_term_info.postings_start_offset - + extract_bits(data, posting_start_addr, self.postings_offset_nbits); - let postings_stop_offset = self.ref_term_info.postings_start_offset - + extract_bits(data, posting_stop_addr, self.postings_offset_nbits); + let postings_start_offset = self.ref_term_info.postings_range.start + + extract_bits(data, posting_start_addr, self.postings_offset_nbits) as usize; + let postings_end_offset = self.ref_term_info.postings_range.start + + extract_bits(data, posting_stop_addr, self.postings_offset_nbits) as usize; let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32; let positions_idx = self.ref_term_info.positions_idx + extract_bits(data, positions_idx_addr, self.positions_idx_nbits); TermInfo { doc_freq, - postings_start_offset, - postings_stop_offset, + postings_range: postings_start_offset..postings_end_offset, positions_idx, } } @@ -163,7 +162,7 @@ fn bitpack_serialize( term_info: &TermInfo, ) -> io::Result<()> { bit_packer.write( - term_info.postings_start_offset, + term_info.postings_range.start as u64, term_info_block_meta.postings_offset_nbits, write, )?; @@ -200,15 +199,15 @@ impl TermInfoStoreWriter { } else { return Ok(()); }; - let postings_stop_offset = - last_term_info.postings_stop_offset - ref_term_info.postings_start_offset; + let postings_end_offset = + last_term_info.postings_range.end - ref_term_info.postings_range.start; 
for term_info in &mut self.term_infos[1..] { - term_info.postings_start_offset -= ref_term_info.postings_start_offset; + term_info.postings_range.start -= ref_term_info.postings_range.start; term_info.positions_idx -= ref_term_info.positions_idx; } let mut max_doc_freq: u32 = 0u32; - let max_postings_offset: u64 = postings_stop_offset; + let max_postings_offset: usize = postings_end_offset; let max_positions_idx: u64 = last_term_info.positions_idx; for term_info in &self.term_infos[1..] { @@ -216,7 +215,7 @@ impl TermInfoStoreWriter { } let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq)); - let max_postings_offset_nbits = compute_num_bits(max_postings_offset); + let max_postings_offset_nbits = compute_num_bits(max_postings_offset as u64); let max_positions_idx_nbits = compute_num_bits(max_positions_idx); let term_info_block_meta = TermInfoBlockMeta { @@ -238,7 +237,7 @@ impl TermInfoStoreWriter { } bit_packer.write( - postings_stop_offset, + postings_end_offset as u64, term_info_block_meta.postings_offset_nbits, &mut self.buffer_term_infos, )?; @@ -251,7 +250,6 @@ impl TermInfoStoreWriter { } pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> { - assert!(term_info.postings_stop_offset >= term_info.postings_start_offset); self.num_terms += 1u64; self.term_infos.push(term_info.clone()); if self.term_infos.len() >= BLOCK_LEN { @@ -314,8 +312,7 @@ mod tests { offset: 2009u64, ref_term_info: TermInfo { doc_freq: 512, - postings_start_offset: 51, - postings_stop_offset: 57u64, + postings_range: 51..57, positions_idx: 3584, }, doc_freq_nbits: 10, @@ -333,12 +330,11 @@ mod tests { fn test_pack() -> crate::Result<()> { let mut store_writer = TermInfoStoreWriter::new(); let mut term_infos = vec![]; - let offset = |i| (i * 13 + i * i) as u64; - for i in 0..1000 { + let offset = |i| (i * 13 + i * i); + for i in 0usize..1000usize { let term_info = TermInfo { doc_freq: i as u32, - postings_start_offset: offset(i), - postings_stop_offset: 
offset(i + 1), + postings_range: offset(i)..offset(i + 1), positions_idx: (i * 7) as u64, }; store_writer.write_term_info(&term_info)?; diff --git a/src/termdict/tests.rs b/src/termdict/tests.rs index 9e0bde752..0dc7927cb 100644 --- a/src/termdict/tests.rs +++ b/src/termdict/tests.rs @@ -9,12 +9,11 @@ use std::str; const BLOCK_SIZE: usize = 1_500; fn make_term_info(term_ord: u64) -> TermInfo { - let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord; + let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize; TermInfo { doc_freq: term_ord as u32, - postings_start_offset: offset(term_ord), - postings_stop_offset: offset(term_ord + 1), - positions_idx: offset(term_ord) * 2u64, + postings_range: offset(term_ord)..offset(term_ord + 1), + positions_idx: offset(term_ord) as u64 * 2u64, } }