mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 10:00:41 +00:00
* sort index by field add sort info to IndexSettings generate docid mapping for sorted field (only fastfield) remap singlevalue fastfield * support docid mapping in multivalue fastfield move docid mapping to serialization step (less intermediate data for mapping) add support for docid mapping in multivalue fastfield * handle docid map in bytes fastfield * forward docid mapping, remap postings * fix merge conflicts * move test to index_sorter * add docid index mapping old->new add docid mapping for both directions old->new (used in postings) and new->old (used in fast field) handle mapping in postings recorder warn instead of info for MAX_TOKEN_LEN * remap docid in fielnorm * resort docids in recorder, more extensive tests * handle index sorting in docstore handle index sort in docstore, by saving all the docs in a temp docstore file (SegmentComponent::TempStore). On serialization the docid mapping is used to create a docstore in the correct order by reader the old docstore. add docstore sort tests refactor tests * refactor rename docid doc_id rename docid_map doc_id_map rename DocidMapping DocIdMapping fix typo * u32 to DocId * better doc_id_map creation remove unstable sort * add non mut method to FastFieldWriters add _mut prefix to &mut methods * remove sort_index * fix clippy issues * fix SegmentComponent iterator use std::mem::replace * fix test * fmt * handle indexsettings deserialize * add reading, writing bytes to doc store get bytes of document in doc store add store_bytes method doc writer to accept serialized document add serialization index settings test * rename index_sorter to doc_id_mapping use bufferlender in recorder * fix compile issue, make sort_by_field optional * fix test compile * validate index settings on merge validate index settings on merge forward merge info to SegmentSerializer (for TempStore) * fix doctest * add itertools, use kmerge add itertools, use kmerge push because rustfmt fails * implement/test merge for fastfield implement/test 
merge for fastfield rename len to num_deleted in DeleteBitSet * Use precalculated docid mapping in merger Use precalculated docid mapping in merger for sorted indices instead of on the fly calculation Add index creation macro benchmark, but commented out for now, since it is not really usable due to long runtimes, and extreme fluctuations. May be better suited in criterion or an external bench bin * fix fast field reader docs fix fast field reader docs, Error instead of None returned add u64s_lenient to fastreader add create docid mapping benchmark * add test for multifast field merge refactor test add test for multifast field merge * add num_bytes to BytesFastFieldReader equivalent to num_vals in MultiValuedFastFieldReader * add MultiValueLength trait add MultiValueLength trait in order to unify index creation for BytesFastFieldReader and MultiValuedFastFieldReader in merger * Add ReaderWithOrdinal, fix Add ReaderWithOrdinal to associate data to a reader in merger Fix bytes offset index creation in merger * add test for merging bytes with sorted docids * Merge fieldnorm for sorted index * handle posting list in merge in sorted index handle posting list in merge in sorted index by using doc id mapping for sorting reuse SegmentOrdinal type * handle doc store order in merge in sorted index * fix typo, cleanup * make IndexSetting non-optional * fix type, rename test file fix type rename test file add type * remove SegmentReaderWithOrdinal accessors * cargo fmt * add index sort & merge test to include deletes * Fix posting list merge issue Fix posting list merge issue - ensure serializer always gets monotonically increasing doc ids handle sorting and merging for facets field * performance: cache field readers, use bytes for doc store merge * change facet merge test to cover index sorting * add RawDocument abstraction to access bytes in doc store * fix deserialization, update changelog fix deserialization update changelog forward error on merge failed * cache store 
readers to utilize lru cache (4x performance) cache store readers, to utilize lru cache (4x faster performance, due to less decompress calls on the block) * add include_temp_doc_store flag in InnerSegmentMeta unset flag on deserialization and after finalize of a segment set flag when creating new instances
140 lines
4.3 KiB
Rust
140 lines
4.3 KiB
Rust
use std::{convert::TryInto, io};
|
|
|
|
/// Packs a sequence of fixed-width integers into a little-endian bit stream.
///
/// Bits are accumulated in a 64-bit staging word and handed to the output
/// eight bytes at a time, as soon as the staging word is full.
pub struct BitPacker {
    /// Staging word holding the bits that have not been written out yet.
    mini_buffer: u64,
    /// Number of low-order bits of `mini_buffer` currently in use.
    /// Stays below 64 between calls as long as `num_bits <= 64`.
    mini_buffer_written: usize,
}

impl Default for BitPacker {
    fn default() -> Self {
        BitPacker::new()
    }
}

impl BitPacker {
    /// Creates a packer with an empty staging word.
    pub fn new() -> BitPacker {
        BitPacker {
            mini_buffer: 0u64,
            mini_buffer_written: 0,
        }
    }

    /// Appends the `num_bits` low-order bits of `val` to the stream.
    ///
    /// `val` is not masked here: the caller must make sure it fits in
    /// `num_bits` bits, otherwise the excess bits are OR-ed into the values
    /// written afterwards.
    ///
    /// # Errors
    ///
    /// Returns any error reported by `output.write_all`.
    pub fn write<TWrite: io::Write>(
        &mut self,
        val: u64,
        num_bits: u8,
        output: &mut TWrite,
    ) -> io::Result<()> {
        let num_bits = num_bits as usize;
        if self.mini_buffer_written + num_bits > 64 {
            // The value straddles the staging word: the low part completes the
            // current word, the remaining high part seeds the next one.
            self.mini_buffer |= val.wrapping_shl(self.mini_buffer_written as u32);
            output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
            self.mini_buffer = val.wrapping_shr((64 - self.mini_buffer_written) as u32);
            self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
        } else {
            self.mini_buffer |= val << self.mini_buffer_written;
            self.mini_buffer_written += num_bits;
            if self.mini_buffer_written == 64 {
                // The word is exactly full: emit it and start over.
                output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
                self.mini_buffer_written = 0;
                self.mini_buffer = 0u64;
            }
        }
        Ok(())
    }

    /// Writes the pending bits, rounded up to a whole number of bytes, and
    /// resets the packer.
    ///
    /// Does nothing when no bits are pending.
    ///
    /// # Errors
    ///
    /// Returns any error reported by `output.write_all`.
    pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
        if self.mini_buffer_written > 0 {
            let num_bytes = (self.mini_buffer_written + 7) / 8;
            let bytes = self.mini_buffer.to_le_bytes();
            output.write_all(&bytes[..num_bytes])?;
            self.mini_buffer_written = 0;
            // Clear the flushed bits as well: `write` ORs new values into the
            // staging word, so stale bits would corrupt anything written after
            // this flush.
            self.mini_buffer = 0u64;
        }
        Ok(())
    }

    /// Flushes the pending bits and appends 7 zero bytes of padding.
    ///
    /// # Errors
    ///
    /// Returns any error reported by `output.write_all`.
    pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
        self.flush(output)?;
        // Padding the write file to simplify reads.
        output.write_all(&[0u8; 7])?;
        Ok(())
    }
}
|
|
|
|
/// Reads fixed-width integers back out of a bit-packed, little-endian byte
/// slice.
#[derive(Clone)]
pub struct BitUnpacker {
    /// Width of every packed value, in bits.
    num_bits: u64,
    /// Mask that keeps the `num_bits` low-order bits of a word.
    mask: u64,
}

impl BitUnpacker {
    /// Creates an unpacker for values that are `num_bits` bits wide.
    ///
    /// A width of 64 selects an all-ones mask; any other width uses
    /// `(1 << num_bits) - 1`.
    pub fn new(num_bits: u8) -> BitUnpacker {
        let mask: u64 = if num_bits == 64 {
            !0u64
        } else {
            (1u64 << num_bits) - 1u64
        };
        BitUnpacker {
            num_bits: u64::from(num_bits),
            mask,
        }
    }

    /// Returns the value stored at position `idx` in `data`.
    ///
    /// A zero-width unpacker always returns 0 without touching `data`.
    /// Otherwise the value is taken from a single 8-byte window starting at
    /// the byte containing its first bit, so `data` must extend at least
    /// 8 bytes past that byte.
    ///
    /// # Panics
    ///
    /// Panics if the 8-byte window does not fit inside `data`.
    pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
        if self.num_bits == 0 {
            return 0u64;
        }
        let addr_in_bits = idx * self.num_bits;
        // Split the bit address into a byte offset and a bit offset in that byte.
        let addr = addr_in_bits >> 3;
        let bit_shift = addr_in_bits & 7;
        debug_assert!(
            addr + 8 <= data.len() as u64,
            "The fast field field should have been padded with 7 bytes."
        );
        let start = addr as usize;
        let bytes: [u8; 8] = (&data[start..start + 8]).try_into().unwrap();
        let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
        (val_unshifted_unmasked >> bit_shift) & self.mask
    }
}
|
|
|
|
#[cfg(test)]
mod test {
    use super::{BitPacker, BitUnpacker};

    /// Packs `len` values of `num_bits` bits each and returns the matching
    /// unpacker, the values that were written, and the packed bytes.
    ///
    /// The values cycle through `0..max_val`, where `max_val` is the largest
    /// value representable on `num_bits` bits (all zeros when `num_bits` is 0).
    fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
        let mut data = Vec::new();
        let mut bitpacker = BitPacker::new();
        let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
        let vals: Vec<u64> = (0u64..len as u64)
            .map(|i| if max_val == 0 { 0 } else { i % max_val })
            .collect();
        for &val in &vals {
            bitpacker.write(val, num_bits, &mut data).unwrap();
        }
        bitpacker.close(&mut data).unwrap();
        // Payload rounded up to whole bytes, plus the 7 padding bytes from `close`.
        assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
        let bitunpacker = BitUnpacker::new(num_bits);
        (bitunpacker, vals, data)
    }

    /// Round-trips `len` values of `num_bits` bits through the packer and the
    /// unpacker and checks that every value is read back unchanged.
    fn test_bitpacker_util(len: usize, num_bits: u8) {
        let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
        for (i, val) in vals.iter().enumerate() {
            assert_eq!(bitunpacker.get(i as u64, &data), *val);
        }
    }

    #[test]
    fn test_bitpacker() {
        test_bitpacker_util(10, 3);
        test_bitpacker_util(10, 0);
        test_bitpacker_util(10, 1);
        test_bitpacker_util(6, 14);
        test_bitpacker_util(1000, 14);
    }
}
|