Mirror of https://github.com/quickwit-oss/tantivy.git
* sort index by field: add sort info to IndexSettings, generate doc id mapping for the sorted field (fast field only), remap single-value fast field
* support doc id mapping in multi-value fast field: move doc id mapping to the serialization step (less intermediate data for mapping)
* handle doc id map in bytes fast field
* forward doc id mapping, remap postings
* fix merge conflicts
* move test to index_sorter
* add doc id index mapping in both directions: old->new (used in postings) and new->old (used in fast fields); handle mapping in the postings recorder; warn instead of info for MAX_TOKEN_LEN
* remap doc ids in fieldnorm
* re-sort doc ids in the recorder, more extensive tests
* handle index sorting in the doc store by saving all docs in a temporary doc store file (SegmentComponent::TempStore); on serialization, use the doc id mapping to create a doc store in the correct order by reading the old doc store; add doc store sort tests; refactor tests
* refactor: rename docid to doc_id, docid_map to doc_id_map, DocidMapping to DocIdMapping; fix typo
* u32 to DocId
* better doc_id_map creation, remove unstable sort
* add non-mut method to FastFieldWriters, add _mut prefix to &mut methods
* remove sort_index
* fix clippy issues
* fix SegmentComponent iterator, use std::mem::replace
* fix test
* fmt
* handle IndexSettings deserialization
* add reading/writing bytes to the doc store: get bytes of a document, add store_bytes method, let the doc writer accept a serialized document; add index settings serialization test
* rename index_sorter to doc_id_mapping, use a buffer lender in the recorder
* fix compile issue, make sort_by_field optional
* fix test compile
* validate index settings on merge, forward merge info to SegmentSerializer (for TempStore)
* fix doctest
* add itertools, use kmerge (push because rustfmt fails)
* implement/test merge for fast fields; rename len to num_deleted in DeleteBitSet
* use precalculated doc id mapping in the merger for sorted indices instead of on-the-fly calculation; add an index creation macro benchmark, commented out for now since long runtimes and extreme fluctuations make it barely usable (may be better suited to criterion or an external bench binary)
* fix fast field reader docs: return Error instead of None; add u64s_lenient to the fast field reader; add doc id mapping creation benchmark
* add test for multi fast field merge, refactor test
* add num_bytes to BytesFastFieldReader (equivalent to num_vals in MultiValuedFastFieldReader)
* add MultiValueLength trait to unify index creation for BytesFastFieldReader and MultiValuedFastFieldReader in the merger
* add ReaderWithOrdinal to associate data with a reader in the merger; fix bytes offset index creation in the merger
* add test for merging bytes with sorted doc ids
* merge fieldnorms for sorted indices
* handle posting lists during merge of a sorted index by using the doc id mapping; reuse the SegmentOrdinal type
* handle doc store order during merge of a sorted index
* fix typo, cleanup
* make IndexSettings non-optional
* fix type, rename test file, add type
* remove SegmentReaderWithOrdinal accessors
* cargo fmt
* add index sort & merge test including deletes
* fix posting list merge issue: ensure the serializer always receives monotonically increasing doc ids; handle sorting and merging for facet fields
* performance: cache field readers, use bytes for doc store merge
* change facet merge test to cover index sorting
* add RawDocument abstraction to access bytes in the doc store
* fix deserialization, update changelog, forward error on failed merge
* cache store readers to utilize the LRU cache (4x faster, due to fewer decompress calls per block)
* add include_temp_doc_store flag to InnerSegmentMeta: unset the flag on deserialization and after finalizing a segment, set it when creating new instances
291 lines
8.5 KiB
Rust
use crate::directory::FileHandle;
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};

/// An `OwnedBytes` simply wraps an object that owns a slice of data and
/// exposes this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
    data: &'static [u8],
    box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}

impl FileHandle for OwnedBytes {
    fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
        Ok(self.slice(range))
    }
}

impl OwnedBytes {
    /// Creates an empty `OwnedBytes`.
    pub fn empty() -> OwnedBytes {
        OwnedBytes::new(&[][..])
    }

    /// Creates an `OwnedBytes` instance given a `StableDeref` object.
    pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
        data_holder: T,
    ) -> OwnedBytes {
        let box_stable_deref = Arc::new(data_holder);
        let bytes: &[u8] = box_stable_deref.as_ref();
        // The `StableDeref` bound guarantees that the pointed-to slice does
        // not move when the holder moves, so extending the lifetime is sound
        // as long as `box_stable_deref` is kept alive alongside `data`.
        let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
        OwnedBytes {
            data,
            box_stable_deref,
        }
    }

    /// Creates an `OwnedBytes` that is a view over a slice of this data.
    pub fn slice(&self, range: Range<usize>) -> Self {
        OwnedBytes {
            data: &self.data[range],
            box_stable_deref: self.box_stable_deref.clone(),
        }
    }

    /// Returns the underlying slice of data.
    /// `Deref` and `AsRef` are also available.
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        self.data
    }

    /// Returns the length of the underlying slice.
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Splits the `OwnedBytes` into two `OwnedBytes` `(left, right)`.
    ///
    /// Left will hold `split_len` bytes.
    ///
    /// This operation is cheap and does not require copying any memory.
    /// On the other hand, both `left` and `right` retain a handle over
    /// the entire slice of memory. In other words, the memory will only
    /// be released when both `left` and `right` are dropped.
    pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
        let right_box_stable_deref = self.box_stable_deref.clone();
        let left = OwnedBytes {
            data: &self.data[..split_len],
            box_stable_deref: self.box_stable_deref,
        };
        let right = OwnedBytes {
            data: &self.data[split_len..],
            box_stable_deref: right_box_stable_deref,
        };
        (left, right)
    }

    /// Returns true iff this `OwnedBytes` is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.as_slice().is_empty()
    }

    /// Drops the leftmost `advance_len` bytes.
    ///
    /// See also [.clip(clip_len: usize)](#method.clip).
    #[inline]
    pub fn advance(&mut self, advance_len: usize) {
        self.data = &self.data[advance_len..]
    }

    /// Reads a `u8` from the `OwnedBytes` and advances by one byte.
    pub fn read_u8(&mut self) -> u8 {
        assert!(!self.is_empty());

        let byte = self.as_slice()[0];
        self.advance(1);
        byte
    }

    /// Reads a `u64` encoded as little-endian from the `OwnedBytes` and advances by 8 bytes.
    pub fn read_u64(&mut self) -> u64 {
        assert!(self.len() > 7);

        let octet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
        self.advance(8);
        u64::from_le_bytes(octet)
    }
}

impl fmt::Debug for OwnedBytes {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We truncate the bytes in order to make sure the debug string
        // is not too long. The condition must be `> 10`: truncating to
        // 10 bytes when the length is only 9 or 10 would panic.
        let bytes_truncated: &[u8] = if self.len() > 10 {
            &self.as_slice()[..10]
        } else {
            self.as_slice()
        };
        write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
    }
}

impl Deref for OwnedBytes {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl io::Read for OwnedBytes {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let read_len = {
            let data = self.as_slice();
            if data.len() >= buf.len() {
                let buf_len = buf.len();
                buf.copy_from_slice(&data[..buf_len]);
                buf.len()
            } else {
                let data_len = data.len();
                buf[..data_len].copy_from_slice(data);
                data_len
            }
        };
        self.advance(read_len);
        Ok(read_len)
    }

    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        let read_len = {
            let data = self.as_slice();
            buf.extend(data);
            data.len()
        };
        self.advance(read_len);
        Ok(read_len)
    }

    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
        let read_len = self.read(buf)?;
        if read_len != buf.len() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            ));
        }
        Ok(())
    }
}

impl AsRef<[u8]> for OwnedBytes {
    fn as_ref(&self) -> &[u8] {
        self.as_slice()
    }
}

#[cfg(test)]
mod tests {
    use std::io::{self, Read};

    use super::OwnedBytes;

    #[test]
    fn test_owned_bytes_debug() {
        let short_bytes = OwnedBytes::new(b"abcd".as_ref());
        assert_eq!(
            format!("{:?}", short_bytes),
            "OwnedBytes([97, 98, 99, 100], len=4)"
        );
        let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
        assert_eq!(
            format!("{:?}", long_bytes),
            "OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
        );
    }

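    // A minimal usage sketch (not part of the upstream suite): `Vec<u8>`
    // implements `StableDeref`, so an owned buffer can back an `OwnedBytes`
    // directly, and the `Deref` impl lets it be indexed like a plain `&[u8]`.
    #[test]
    fn test_owned_bytes_from_vec_sketch() {
        let bytes = OwnedBytes::new(b"abcd".to_vec());
        assert_eq!(bytes.as_slice(), b"abcd");
        assert_eq!(&bytes[..2], b"ab");
    }
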
    #[test]
    fn test_owned_bytes_read() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
        {
            let mut buf = [0u8; 5];
            bytes.read_exact(&mut buf[..]).unwrap();
            assert_eq!(&buf, b"abcde");
            assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz");
        }
        {
            let mut buf = [0u8; 2];
            bytes.read_exact(&mut buf[..]).unwrap();
            assert_eq!(&buf, b"fg");
            assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz");
        }
        Ok(())
    }

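    // Usage sketch (not upstream): `slice` is a cheap view sharing the same
    // backing buffer, and the `FileHandle::read_bytes` impl above simply
    // delegates to it.
    #[test]
    fn test_owned_bytes_slice_sketch() {
        use crate::directory::FileHandle;
        let bytes = OwnedBytes::new(b"abcdefgh".as_ref());
        let sub = bytes.slice(2..5);
        assert_eq!(sub.as_slice(), b"cde");
        // The original handle is unaffected.
        assert_eq!(bytes.as_slice(), b"abcdefgh");
        let via_handle = bytes.read_bytes(1..4).unwrap();
        assert_eq!(via_handle.as_slice(), b"bcd");
    }
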
    #[test]
    fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = [0u8; 5];
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
        assert_eq!(&buf, b"abcde");
        assert_eq!(bytes.as_slice(), b"");
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
        assert_eq!(&buf, b"abcde");
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_incomplete() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = [0u8; 7];
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
        assert_eq!(&buf[..5], b"abcde");
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
        Ok(())
    }

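    // Sketch (not upstream): per the `io::Read` impl above, `read_exact`
    // on a too-short `OwnedBytes` fails with `UnexpectedEof`.
    #[test]
    fn test_owned_bytes_read_exact_eof_sketch() {
        let mut bytes = OwnedBytes::new(b"abc".as_ref());
        let mut buf = [0u8; 4];
        let err = bytes.read_exact(&mut buf[..]).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
    }
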
    #[test]
    fn test_owned_bytes_read_to_end() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = Vec::new();
        bytes.read_to_end(&mut buf)?;
        assert_eq!(buf.as_slice(), b"abcde".as_ref());
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_u8() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
        assert_eq!(bytes.read_u8(), 255);
        assert_eq!(bytes.len(), 0);
        Ok(())
    }

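    // Sketch (not upstream): `empty` yields a zero-length instance, and
    // `len`/`is_empty` report the bytes not yet advanced past.
    #[test]
    fn test_owned_bytes_empty_sketch() {
        let bytes = OwnedBytes::empty();
        assert!(bytes.is_empty());
        assert_eq!(bytes.len(), 0);
    }
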
    #[test]
    fn test_owned_bytes_read_u64() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
        assert_eq!(bytes.read_u64(), u64::MAX - 255);
        assert_eq!(bytes.len(), 0);
        Ok(())
    }

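    // Sketch (not upstream): `advance`, `read_u8`, and `read_u64` all move
    // the same internal cursor, so they can be freely interleaved.
    #[test]
    fn test_owned_bytes_advance_and_read_sketch() {
        let mut bytes = OwnedBytes::new(b"xy\x01\0\0\0\0\0\0\0z".as_ref());
        bytes.advance(1);
        assert_eq!(bytes.read_u8(), b'y');
        assert_eq!(bytes.read_u64(), 1u64);
        assert_eq!(bytes.as_slice(), b"z");
    }
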
    #[test]
    fn test_owned_bytes_split() {
        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        let (left, right) = bytes.split(3);
        assert_eq!(left.as_slice(), b"abc");
        assert_eq!(right.as_slice(), b"defghi");
    }

    #[test]
    fn test_owned_bytes_split_boundary() {
        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        {
            let (left, right) = bytes.clone().split(0);
            assert_eq!(left.as_slice(), b"");
            assert_eq!(right.as_slice(), b"abcdefghi");
        }
        {
            let (left, right) = bytes.split(9);
            assert_eq!(left.as_slice(), b"abcdefghi");
            assert_eq!(right.as_slice(), b"");
        }
    }

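    // Sketch (not upstream): both halves returned by `split` keep the
    // backing buffer alive, so one half stays valid after the other drops.
    #[test]
    fn test_owned_bytes_split_keeps_backing_alive_sketch() {
        let (left, right) = OwnedBytes::new(b"abcd".to_vec()).split(2);
        drop(left);
        assert_eq!(right.as_slice(), b"cd");
    }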
}