Compare commits

1 commit

Author: Pascal Seitz
SHA1: 6037cdfe7e
Message: remove dynamic dispatch in collect_segment
Date: 2023-03-02 20:16:45 +08:00
12 changed files with 59 additions and 55 deletions

View File

@@ -16,6 +16,7 @@ rust-version = "1.62"
[dependencies]
oneshot = "0.1.5"
base64 = "0.21.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
@@ -43,7 +44,7 @@ rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
murmurhash32 = "0.3.0"
murmurhash32 = "0.2.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"

View File

@@ -13,6 +13,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version= "0.5", path="../ownedbytes" }
async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] }

View File

@@ -2,6 +2,8 @@
use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod datetime;
pub mod file_slice;

View File

@@ -1,7 +1,9 @@
use std::io::{Read, Write};
use std::{fmt, io};
use crate::VInt;
use byteorder::{ReadBytesExt, WriteBytesExt};
use crate::{Endianness, VInt};
#[derive(Default)]
struct Counter(u64);
@@ -105,13 +107,11 @@ impl<Left: BinarySerializable + FixedSize, Right: BinarySerializable + FixedSize
impl BinarySerializable for u32 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_u32::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
let mut buf = [0u8; 4];
reader.read_exact(&mut buf)?;
Ok(u32::from_le_bytes(buf))
reader.read_u32::<Endianness>()
}
}
@@ -121,13 +121,11 @@ impl FixedSize for u32 {
impl BinarySerializable for u16 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_u16::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u16> {
let mut buf = [0u8; 2];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
reader.read_u16::<Endianness>()
}
}
@@ -137,12 +135,10 @@ impl FixedSize for u16 {
impl BinarySerializable for u64 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_u64::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut buf = [0u8; 8];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
reader.read_u64::<Endianness>()
}
}
@@ -152,12 +148,10 @@ impl FixedSize for u64 {
impl BinarySerializable for u128 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_u128::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut buf = [0u8; 16];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
reader.read_u128::<Endianness>()
}
}
@@ -167,12 +161,10 @@ impl FixedSize for u128 {
impl BinarySerializable for f32 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_f32::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut buf = [0u8; 4];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
reader.read_f32::<Endianness>()
}
}
@@ -182,12 +174,10 @@ impl FixedSize for f32 {
impl BinarySerializable for i64 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_i64::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut buf = [0u8; Self::SIZE_IN_BYTES];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
reader.read_i64::<Endianness>()
}
}
@@ -197,12 +187,10 @@ impl FixedSize for i64 {
impl BinarySerializable for f64 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_f64::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut buf = [0u8; Self::SIZE_IN_BYTES];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
reader.read_f64::<Endianness>()
}
}
@@ -212,12 +200,10 @@ impl FixedSize for f64 {
impl BinarySerializable for u8 {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
writer.write_all(&self.to_le_bytes())
writer.write_u8(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut buf = [0u8; Self::SIZE_IN_BYTES];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u8> {
reader.read_u8()
}
}
@@ -227,10 +213,10 @@ impl FixedSize for u8 {
impl BinarySerializable for bool {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
(*self as u8).serialize(writer)
writer.write_u8(u8::from(*self))
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
let val = u8::deserialize(reader)?;
let val = reader.read_u8()?;
match val {
0 => Ok(false),
1 => Ok(true),

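Every hunk in this file follows the same pattern: the manual to_le_bytes / read_exact / from_le_bytes round trip becomes a single call on byteorder's WriteBytesExt / ReadBytesExt extension traits, parameterised on the crate-wide Endianness alias, and both forms produce identical little-endian bytes. Below is a minimal sketch of that pattern for one type; the BinarySerializable trait here is a simplified stand-in for tantivy's real trait, not its exact definition.

    use std::io::{self, Read, Write};

    use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

    // Stand-in for the crate-wide alias `pub use byteorder::LittleEndian as Endianness;`.
    type Endianness = LittleEndian;

    // Simplified stand-in for tantivy's BinarySerializable trait.
    trait BinarySerializable: Sized {
        fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()>;
        fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
    }

    impl BinarySerializable for u32 {
        fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
            // One call instead of writer.write_all(&self.to_le_bytes()).
            writer.write_u32::<Endianness>(*self)
        }
        fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
            // One call instead of read_exact into a stack buffer plus from_le_bytes.
            reader.read_u32::<Endianness>()
        }
    }

    fn main() -> io::Result<()> {
        let mut buf = Vec::new();
        42u32.serialize(&mut buf)?;
        assert_eq!(buf, 42u32.to_le_bytes()); // same wire format either way
        assert_eq!(u32::deserialize(&mut &buf[..])?, 42);
        Ok(())
    }

The same shape repeats for u16, u64, u128, f32, f64 and i64; u8 and bool go through write_u8 / read_u8, which take no endianness parameter.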
View File

@@ -1,6 +1,8 @@
use std::io;
use std::io::{Read, Write};
use byteorder::{ByteOrder, LittleEndian};
use super::BinarySerializable;
/// Variable int serializes a u128 number
@@ -125,7 +127,7 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
5,
),
};
buf.copy_from_slice(&res.to_le_bytes());
LittleEndian::write_u64(&mut buf[..], res);
&buf[0..num_bytes]
}

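The functional change in this hunk is how the packed word is written: LittleEndian::write_u64 into the 8-byte scratch buffer instead of copy_from_slice(&res.to_le_bytes()). The two are byte-for-byte equivalent, as this small sketch shows (the vint packing that produces res is omitted; the value below is just an example):

    use byteorder::{ByteOrder, LittleEndian};

    fn main() {
        // Pretend `res` is the already-packed word produced by serialize_vint_u32.
        let res: u64 = 0x0102_0304_0506_0708;
        let mut buf = [0u8; 8];

        // ByteOrder::write_u64 fills the slice in little-endian order...
        LittleEndian::write_u64(&mut buf[..], res);

        // ...which matches what copy_from_slice(&res.to_le_bytes()) would produce.
        assert_eq!(buf, res.to_le_bytes());
    }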
View File

@@ -106,7 +106,7 @@ mod tweak_score_top_collector;
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::{FacetCollector, FacetCounts};
use crate::query::Weight;
use crate::query::{for_each_docset, for_each_scorer, Weight};
mod docset_collector;
pub use self::docset_collector::DocSetCollector;
@@ -173,28 +173,32 @@ pub trait Collector: Sync + Send {
match (reader.alive_bitset(), self.requires_scoring()) {
(Some(alive_bitset), true) => {
weight.for_each(reader, &mut |doc, score| {
let mut scorer = weight.scorer(reader, 1.0)?;
for_each_scorer(scorer.as_mut(), |doc, score| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;
});
}
(Some(alive_bitset), false) => {
weight.for_each_no_score(reader, &mut |doc| {
let mut docset = weight.scorer(reader, 1.0)?;
for_each_docset(docset.as_mut(), |doc| {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, 0.0);
}
})?;
});
}
(None, true) => {
weight.for_each(reader, &mut |doc, score| {
let mut scorer = weight.scorer(reader, 1.0)?;
for_each_scorer(scorer.as_mut(), |doc, score| {
segment_collector.collect(doc, score);
})?;
});
}
(None, false) => {
weight.for_each_no_score(reader, &mut |doc| {
let mut docset = weight.scorer(reader, 1.0)?;
for_each_docset(docset.as_mut(), |doc| {
segment_collector.collect(doc, 0.0);
})?;
});
}
}

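This file is what the commit message refers to: instead of passing Weight::for_each a &mut dyn FnMut callback, collect_segment now obtains the scorer once and drives it through the generic for_each_scorer / for_each_docset helpers, so the per-document callback is monomorphised rather than invoked through a function-trait object. A rough, self-contained sketch of that flow follows; Scorer, VecScorer and the constants are toy stand-ins, and the real code also handles the AliveBitSet and error plumbing shown above.

    type DocId = u32;
    type Score = f32;
    const TERMINATED: DocId = u32::MAX;

    // Simplified stand-in for tantivy's Scorer trait.
    trait Scorer {
        fn doc(&self) -> DocId;
        fn advance(&mut self) -> DocId;
        fn score(&mut self) -> Score;
    }

    // The generic helper: `callback` is an impl FnMut, so every call site gets its
    // own monomorphised copy and the per-document callback needs no virtual dispatch.
    fn for_each_scorer<S: Scorer + ?Sized>(scorer: &mut S, mut callback: impl FnMut(DocId, Score)) {
        let mut doc = scorer.doc();
        while doc != TERMINATED {
            callback(doc, scorer.score());
            doc = scorer.advance();
        }
    }

    // Toy scorer over a fixed doc id list, only here to drive the helper.
    struct VecScorer {
        docs: Vec<DocId>,
        cursor: usize,
    }

    impl Scorer for VecScorer {
        fn doc(&self) -> DocId {
            self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
        }
        fn advance(&mut self) -> DocId {
            self.cursor += 1;
            self.doc()
        }
        fn score(&mut self) -> Score {
            1.0
        }
    }

    fn main() {
        // Mirrors `let mut scorer = weight.scorer(reader, 1.0)?;`: the scorer itself
        // stays behind a Box<dyn Scorer>; only the callback loses its indirection.
        let mut scorer: Box<dyn Scorer> = Box::new(VecScorer { docs: vec![1, 4, 7], cursor: 0 });
        let mut collected = Vec::new();
        for_each_scorer(scorer.as_mut(), |doc, score| collected.push((doc, score)));
        assert_eq!(collected, vec![(1, 1.0), (4, 1.0), (7, 1.0)]);
    }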
View File

@@ -65,6 +65,7 @@ pub use self::union::Union;
#[cfg(test)]
pub use self::vec_docset::VecDocSet;
pub use self::weight::Weight;
pub(crate) use self::weight::{for_each_docset, for_each_pruning_scorer, for_each_scorer};
#[cfg(test)]
mod tests {

View File

@@ -5,9 +5,10 @@ use crate::{DocId, DocSet, Score, TERMINATED};
/// Iterates through all of the documents and scores matched by the DocSet
/// `DocSet`.
#[inline]
pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
scorer: &mut TScorer,
callback: &mut dyn FnMut(DocId, Score),
mut callback: impl FnMut(DocId, Score),
) {
let mut doc = scorer.doc();
while doc != TERMINATED {
@@ -18,7 +19,8 @@ pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
/// Iterates through all of the documents matched by the DocSet
/// `DocSet`.
pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, callback: &mut dyn FnMut(DocId)) {
#[inline]
pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, mut callback: impl FnMut(DocId)) {
let mut doc = docset.doc();
while doc != TERMINATED {
callback(doc);
@@ -36,6 +38,7 @@ pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, callback: &mut
///
/// More importantly, it makes it possible for scorers to implement
/// important optimization (e.g. BlockWAND for union).
#[inline]
pub(crate) fn for_each_pruning_scorer<TScorer: Scorer + ?Sized>(
scorer: &mut TScorer,
mut threshold: Score,

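The signature changes in these helpers are where the dynamic dispatch actually disappears: the callback parameter moves from a &mut dyn FnMut(..) trait object to a generic impl FnMut(..) (with #[inline] added), so each caller gets its own monomorphised copy. A minimal illustration of the difference, using assumed toy functions rather than tantivy's:

    // A &mut dyn FnMut callback is an indirect call through a vtable on every document.
    fn drive_dyn(docs: &[u32], callback: &mut dyn FnMut(u32)) {
        for &doc in docs {
            callback(doc); // indirect call through the function-trait object
        }
    }

    // An impl FnMut callback is monomorphised per call site and can be inlined.
    #[inline]
    fn drive_generic(docs: &[u32], mut callback: impl FnMut(u32)) {
        for &doc in docs {
            callback(doc); // statically dispatched
        }
    }

    fn main() {
        let docs = [1u32, 2, 3];
        let (mut a, mut b) = (Vec::new(), Vec::new());
        drive_dyn(&docs, &mut |d| a.push(d));
        drive_generic(&docs, |d| b.push(d));
        assert_eq!(a, b); // same behaviour, different dispatch
    }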
View File

@@ -364,7 +364,8 @@ where B: AsRef<[u8]>
/// (this does not include the field.)
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u64, its value is encoded in big endian.
/// If the term is a u64, its value is encoded according
/// to `byteorder::BigEndian`.
pub fn value_bytes(&self) -> &[u8] {
&self.0.as_ref()[TERM_METADATA_LENGTH..]
}

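The updated doc comment names byteorder::BigEndian explicitly. Presumably big-endian is used here because term bytes are compared lexicographically, and big-endian encoding makes that byte order agree with numeric order for unsigned integers; the following generic Rust snippet (not tantivy code) demonstrates the property:

    use byteorder::{BigEndian, ByteOrder};

    fn main() {
        // With big-endian encoding, comparing the raw bytes lexicographically gives
        // the same ordering as comparing the u64 values numerically.
        let (a, b) = (255u64, 256u64);
        let (mut abuf, mut bbuf) = ([0u8; 8], [0u8; 8]);
        BigEndian::write_u64(&mut abuf, a);
        BigEndian::write_u64(&mut bbuf, b);
        assert!(abuf < bbuf);
        // The little-endian encodings would compare the other way around here.
        assert!(a.to_le_bytes() > b.to_le_bytes());
    }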
View File

@@ -1,6 +1,7 @@
use std::cmp;
use std::io::{self, Read, Write};
use byteorder::{ByteOrder, LittleEndian};
use common::{BinarySerializable, FixedSize};
use tantivy_bitpacker::{compute_num_bits, BitPacker};
@@ -103,7 +104,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
let addr_byte = addr_bits / 8;
let bit_shift = (addr_bits % 8) as u64;
let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 {
u64::from_le_bytes(data[addr_byte..][..8].try_into().unwrap())
LittleEndian::read_u64(&data[addr_byte..][..8])
} else {
// the buffer is not large enough.
// Let's copy the few remaining bytes to a 8 byte buffer
@@ -112,7 +113,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
let data_to_copy = &data[addr_byte..];
let nbytes = data_to_copy.len();
buf[..nbytes].copy_from_slice(data_to_copy);
u64::from_le_bytes(buf)
LittleEndian::read_u64(&buf)
};
let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift;
let mask = (1u64 << u64::from(num_bits)) - 1;

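Both branches of extract_bits now decode the 8-byte window with ByteOrder::read_u64 instead of u64::from_le_bytes over a try_into()-converted slice; the decoded value is the same. A standalone sketch of the function as shown above, trimmed and with an added bounds assertion, so not the crate's exact code:

    use byteorder::{ByteOrder, LittleEndian};

    // Read up to 63 bits starting at an arbitrary bit address in a little-endian
    // packed buffer.
    fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
        assert!(num_bits < 64);
        let addr_byte = addr_bits / 8;
        let bit_shift = (addr_bits % 8) as u64;
        let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 {
            // Fast path: a full 8-byte window is available.
            LittleEndian::read_u64(&data[addr_byte..][..8])
        } else {
            // Near the end of the buffer: copy the remaining bytes into a
            // zero-padded 8-byte word.
            let mut buf = [0u8; 8];
            let data_to_copy = &data[addr_byte..];
            buf[..data_to_copy.len()].copy_from_slice(data_to_copy);
            LittleEndian::read_u64(&buf)
        };
        let mask = (1u64 << u64::from(num_bits)) - 1;
        (val_unshifted_unmasked >> bit_shift) & mask
    }

    fn main() {
        // 0b1011 stored starting at bit offset 4 of the first byte.
        let data = [0b1011_0000u8, 0b0000_0000];
        assert_eq!(extract_bits(&data, 4, 4), 0b1011);
    }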
View File

@@ -5,5 +5,6 @@ edition = "2021"
license = "MIT"
[dependencies]
murmurhash32 = "0.3"
murmurhash32 = "0.2"
byteorder = "1"
common = { version = "0.5", path = "../common/", package = "tantivy-common" }

View File

@@ -1,5 +1,6 @@
use std::{iter, mem, slice};
use byteorder::{ByteOrder, NativeEndian};
use murmurhash32::murmurhash2;
use super::{Addr, MemoryArena};
@@ -154,7 +155,7 @@ impl ArenaHashMap {
#[inline]
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let data = self.memory_arena.slice_from(addr);
let key_bytes_len = u16::from_ne_bytes(data[..2].try_into().unwrap()) as usize;
let key_bytes_len = NativeEndian::read_u16(data) as usize;
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
}
@@ -272,7 +273,7 @@ impl ArenaHashMap {
let key_addr = self.memory_arena.allocate_space(num_bytes);
{
let data = self.memory_arena.slice_mut(key_addr, num_bytes);
data[..2].copy_from_slice(&u16::to_ne_bytes(key.len() as u16));
NativeEndian::write_u16(data, key.len() as u16);
let stop = 2 + key.len();
data[2..stop].copy_from_slice(key);
store(&mut data[stop..], val);
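Same substitution here: the native-endian u16 key-length prefix is written and read through byteorder::NativeEndian instead of to_ne_bytes / from_ne_bytes. A small sketch of the simplified record layout these two lines operate on (assumed layout, illustration only):

    use byteorder::{ByteOrder, NativeEndian};

    fn main() {
        // A native-endian u16 length prefix followed by the key bytes.
        let key = b"title";
        let mut data = vec![0u8; 2 + key.len()];

        // Write side: length prefix, then the key bytes.
        NativeEndian::write_u16(&mut data, key.len() as u16);
        data[2..].copy_from_slice(key);

        // Read side, mirroring get_key_value in the hunk above.
        let key_bytes_len = NativeEndian::read_u16(&data) as usize;
        let key_bytes = &data[2..][..key_bytes_len];
        assert_eq!(key_bytes, &key[..]);
    }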