mirror of https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 21:12:54 +00:00

Compare commits: remove-byt...remove_dyn (1 commit)
Commit 6037cdfe7e
@@ -16,6 +16,7 @@ rust-version = "1.62"
 [dependencies]
 oneshot = "0.1.5"
 base64 = "0.21.0"
+byteorder = "1.4.3"
 crc32fast = "1.3.2"
 once_cell = "1.10.0"
 regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }

@@ -43,7 +44,7 @@ rustc-hash = "1.1.0"
 thiserror = "1.0.30"
 htmlescape = "0.3.1"
 fail = "0.5.0"
-murmurhash32 = "0.3.0"
+murmurhash32 = "0.2.0"
 time = { version = "0.3.10", features = ["serde-well-known"] }
 smallvec = "1.8.0"
 rayon = "1.5.2"

@@ -13,6 +13,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
+byteorder = "1.4.3"
 ownedbytes = { version= "0.5", path="../ownedbytes" }
 async-trait = "0.1"
 time = { version = "0.3.10", features = ["serde-well-known"] }

@@ -2,6 +2,8 @@

 use std::ops::Deref;

+pub use byteorder::LittleEndian as Endianness;
+
 mod bitset;
 mod datetime;
 pub mod file_slice;

@@ -1,7 +1,9 @@
 use std::io::{Read, Write};
 use std::{fmt, io};

-use crate::VInt;
+use byteorder::{ReadBytesExt, WriteBytesExt};
+
+use crate::{Endianness, VInt};

 #[derive(Default)]
 struct Counter(u64);

@@ -105,13 +107,11 @@ impl<Left: BinarySerializable + FixedSize, Right: BinarySerializable + FixedSize

 impl BinarySerializable for u32 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_u32::<Endianness>(*self)
     }

     fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
-        let mut buf = [0u8; 4];
-        reader.read_exact(&mut buf)?;
-        Ok(u32::from_le_bytes(buf))
+        reader.read_u32::<Endianness>()
     }
 }

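Note: the swap above (and in the sibling impls below) is behavior-preserving. `byteorder`'s `write_u32::<LittleEndian>` / `read_u32::<LittleEndian>` and the std-only `to_le_bytes` / `from_le_bytes` produce the same little-endian bytes, and `Endianness` is just the re-exported `LittleEndian` from the lib.rs hunk above. A minimal standalone sketch of that equivalence (not part of the diff; assumes the `byteorder` crate is available):

use byteorder::{LittleEndian, WriteBytesExt};
use std::io::Write;

fn main() -> std::io::Result<()> {
    let value: u32 = 0xDEAD_BEEF;

    // byteorder path, as on the `+` side of the hunk above.
    let mut via_byteorder = Vec::new();
    via_byteorder.write_u32::<LittleEndian>(value)?;

    // std-only path, as on the `-` side of the hunk above.
    let mut via_std = Vec::new();
    via_std.write_all(&value.to_le_bytes())?;

    assert_eq!(via_byteorder, via_std); // both are [0xEF, 0xBE, 0xAD, 0xDE]
    Ok(())
}
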
@@ -121,13 +121,11 @@ impl FixedSize for u32 {

 impl BinarySerializable for u16 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_u16::<Endianness>(*self)
     }

     fn deserialize<R: Read>(reader: &mut R) -> io::Result<u16> {
-        let mut buf = [0u8; 2];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+        reader.read_u16::<Endianness>()
     }
 }

@@ -137,12 +135,10 @@ impl FixedSize for u16 {

 impl BinarySerializable for u64 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_u64::<Endianness>(*self)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let mut buf = [0u8; 8];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+        reader.read_u64::<Endianness>()
     }
 }

@@ -152,12 +148,10 @@ impl FixedSize for u64 {

 impl BinarySerializable for u128 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_u128::<Endianness>(*self)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let mut buf = [0u8; 16];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+        reader.read_u128::<Endianness>()
     }
 }

@@ -167,12 +161,10 @@ impl FixedSize for u128 {

 impl BinarySerializable for f32 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_f32::<Endianness>(*self)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let mut buf = [0u8; 4];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+        reader.read_f32::<Endianness>()
     }
 }

@@ -182,12 +174,10 @@ impl FixedSize for f32 {

 impl BinarySerializable for i64 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_i64::<Endianness>(*self)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let mut buf = [0u8; Self::SIZE_IN_BYTES];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+        reader.read_i64::<Endianness>()
     }
 }

@@ -197,12 +187,10 @@ impl FixedSize for i64 {

 impl BinarySerializable for f64 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_f64::<Endianness>(*self)
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let mut buf = [0u8; Self::SIZE_IN_BYTES];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+        reader.read_f64::<Endianness>()
     }
 }

@@ -212,12 +200,10 @@ impl FixedSize for f64 {

 impl BinarySerializable for u8 {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        writer.write_all(&self.to_le_bytes())
+        writer.write_u8(*self)
     }
-    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let mut buf = [0u8; Self::SIZE_IN_BYTES];
-        reader.read_exact(&mut buf)?;
-        Ok(Self::from_le_bytes(buf))
+    fn deserialize<R: Read>(reader: &mut R) -> io::Result<u8> {
+        reader.read_u8()
     }
 }

@@ -227,10 +213,10 @@ impl FixedSize for u8 {

 impl BinarySerializable for bool {
     fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
-        (*self as u8).serialize(writer)
+        writer.write_u8(u8::from(*self))
     }
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
-        let val = u8::deserialize(reader)?;
+        let val = reader.read_u8()?;
         match val {
             0 => Ok(false),
             1 => Ok(true),

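In both versions a bool is stored as a single byte, 0 or 1, and the match presumably rejects any other value (the hunk is cut off before the remaining arm). A small, self-contained round-trip sketch of that convention (illustrative only, not tantivy code):

use std::io::{self, Read, Write};

fn write_bool<W: Write>(writer: &mut W, value: bool) -> io::Result<()> {
    writer.write_all(&[u8::from(value)])
}

fn read_bool<R: Read>(reader: &mut R) -> io::Result<bool> {
    let mut buf = [0u8; 1];
    reader.read_exact(&mut buf)?;
    match buf[0] {
        0 => Ok(false),
        1 => Ok(true),
        _ => Err(io::Error::new(io::ErrorKind::InvalidData, "invalid bool byte")),
    }
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    write_bool(&mut buf, true)?;
    assert!(read_bool(&mut buf.as_slice())?);
    Ok(())
}
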
@@ -1,6 +1,8 @@
 use std::io;
 use std::io::{Read, Write};

+use byteorder::{ByteOrder, LittleEndian};
+
 use super::BinarySerializable;

 /// Variable int serializes a u128 number

@@ -125,7 +127,7 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
             5,
         ),
     };
-    buf.copy_from_slice(&res.to_le_bytes());
+    LittleEndian::write_u64(&mut buf[..], res);
     &buf[0..num_bytes]
 }

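The two buffer writes swapped here are byte-for-byte identical for an 8-byte buffer: `LittleEndian::write_u64` and `u64::to_le_bytes` encode the same way. A standalone sketch of the equivalence (the surrounding vint encoding is not reproduced):

use byteorder::{ByteOrder, LittleEndian};

fn main() {
    let res: u64 = 0x0123_4567_89AB_CDEF;

    // byteorder helper, as on the `+` side above.
    let mut a = [0u8; 8];
    LittleEndian::write_u64(&mut a[..], res);

    // std-only form, as on the `-` side above.
    let mut b = [0u8; 8];
    b.copy_from_slice(&res.to_le_bytes());

    assert_eq!(a, b);
}
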
@@ -106,7 +106,7 @@ mod tweak_score_top_collector;
 pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
 mod facet_collector;
 pub use self::facet_collector::{FacetCollector, FacetCounts};
-use crate::query::Weight;
+use crate::query::{for_each_docset, for_each_scorer, Weight};

 mod docset_collector;
 pub use self::docset_collector::DocSetCollector;

@@ -173,28 +173,32 @@ pub trait Collector: Sync + Send {

         match (reader.alive_bitset(), self.requires_scoring()) {
             (Some(alive_bitset), true) => {
-                weight.for_each(reader, &mut |doc, score| {
+                let mut scorer = weight.scorer(reader, 1.0)?;
+                for_each_scorer(scorer.as_mut(), |doc, score| {
                     if alive_bitset.is_alive(doc) {
                         segment_collector.collect(doc, score);
                     }
-                })?;
+                });
             }
             (Some(alive_bitset), false) => {
-                weight.for_each_no_score(reader, &mut |doc| {
+                let mut docset = weight.scorer(reader, 1.0)?;
+                for_each_docset(docset.as_mut(), |doc| {
                     if alive_bitset.is_alive(doc) {
                         segment_collector.collect(doc, 0.0);
                     }
-                })?;
+                });
             }
             (None, true) => {
-                weight.for_each(reader, &mut |doc, score| {
+                let mut scorer = weight.scorer(reader, 1.0)?;
+                for_each_scorer(scorer.as_mut(), |doc, score| {
                     segment_collector.collect(doc, score);
-                })?;
+                });
             }
             (None, false) => {
-                weight.for_each_no_score(reader, &mut |doc| {
+                let mut docset = weight.scorer(reader, 1.0)?;
+                for_each_docset(docset.as_mut(), |doc| {
                     segment_collector.collect(doc, 0.0);
-                })?;
+                });
             }
         }

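The change above also shifts the callback style at every call site: the old code handed `Weight::for_each` a closure by `&mut` reference (a `dyn FnMut` per the `weight` module hunks further down), while the new code obtains a scorer and drives it through generic helpers whose callbacks are `impl FnMut`. A minimal, self-contained sketch of the difference between the two dispatch styles (illustrative only, not tantivy code):

// Dynamic dispatch: the closure is passed as a trait object behind a &mut reference.
fn run_dyn(callback: &mut dyn FnMut(u32)) {
    for doc in 0..3 {
        callback(doc);
    }
}

// Static dispatch: the function is monomorphized per closure type,
// so the callback can be inlined into the loop.
fn run_generic(mut callback: impl FnMut(u32)) {
    for doc in 0..3 {
        callback(doc);
    }
}

fn main() {
    let mut sum = 0u32;
    run_dyn(&mut |doc| sum += doc);
    run_generic(|doc| sum += doc);
    assert_eq!(sum, 6);
}
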
@@ -65,6 +65,7 @@ pub use self::union::Union;
 #[cfg(test)]
 pub use self::vec_docset::VecDocSet;
 pub use self::weight::Weight;
+pub(crate) use self::weight::{for_each_docset, for_each_pruning_scorer, for_each_scorer};

 #[cfg(test)]
 mod tests {

@@ -5,9 +5,10 @@ use crate::{DocId, DocSet, Score, TERMINATED};

 /// Iterates through all of the documents and scores matched by the DocSet
 /// `DocSet`.
+#[inline]
 pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(
     scorer: &mut TScorer,
-    callback: &mut dyn FnMut(DocId, Score),
+    mut callback: impl FnMut(DocId, Score),
 ) {
     let mut doc = scorer.doc();
     while doc != TERMINATED {

@@ -18,7 +19,8 @@ pub(crate) fn for_each_scorer<TScorer: Scorer + ?Sized>(

 /// Iterates through all of the documents matched by the DocSet
 /// `DocSet`.
-pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, callback: &mut dyn FnMut(DocId)) {
+#[inline]
+pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, mut callback: impl FnMut(DocId)) {
     let mut doc = docset.doc();
     while doc != TERMINATED {
         callback(doc);

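Only part of these helpers' bodies is visible in the hunks. A hedged sketch of the overall iteration shape they implement, using simplified stand-in types rather than tantivy's real `Scorer`/`DocSet` traits (the names and the toy scorer are illustrative assumptions):

// Hypothetical stand-ins for tantivy's DocId, Score and TERMINATED.
type DocId = u32;
type Score = f32;
const TERMINATED: DocId = u32::MAX;

trait SimpleScorer {
    fn doc(&self) -> DocId;
    fn advance(&mut self) -> DocId;
    fn score(&mut self) -> Score;
}

// Generic over the callback (`impl FnMut`), mirroring the signature change above.
fn for_each_scorer_sketch<S: SimpleScorer + ?Sized>(
    scorer: &mut S,
    mut callback: impl FnMut(DocId, Score),
) {
    let mut doc = scorer.doc();
    while doc != TERMINATED {
        callback(doc, scorer.score());
        doc = scorer.advance();
    }
}

// A toy scorer over a fixed doc list, scoring every doc 1.0.
struct VecScorer {
    docs: Vec<DocId>,
    cursor: usize,
}

impl SimpleScorer for VecScorer {
    fn doc(&self) -> DocId {
        self.docs.get(self.cursor).copied().unwrap_or(TERMINATED)
    }
    fn advance(&mut self) -> DocId {
        self.cursor += 1;
        self.doc()
    }
    fn score(&mut self) -> Score {
        1.0
    }
}

fn main() {
    let mut scorer = VecScorer { docs: vec![1, 4, 7], cursor: 0 };
    let mut collected = Vec::new();
    for_each_scorer_sketch(&mut scorer, |doc, score| collected.push((doc, score)));
    assert_eq!(collected, vec![(1, 1.0), (4, 1.0), (7, 1.0)]);
}
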
@@ -36,6 +38,7 @@ pub(crate) fn for_each_docset<T: DocSet + ?Sized>(docset: &mut T, callback: &mut
 ///
 /// More importantly, it makes it possible for scorers to implement
 /// important optimization (e.g. BlockWAND for union).
+#[inline]
 pub(crate) fn for_each_pruning_scorer<TScorer: Scorer + ?Sized>(
     scorer: &mut TScorer,
     mut threshold: Score,

@@ -364,7 +364,8 @@ where B: AsRef<[u8]>
     /// (this does not include the field.)
     ///
     /// If the term is a string, its value is utf-8 encoded.
-    /// If the term is a u64, its value is encoded in big endian.
+    /// If the term is a u64, its value is encoded according
+    /// to `byteorder::BigEndian`.
     pub fn value_bytes(&self) -> &[u8] {
         &self.0.as_ref()[TERM_METADATA_LENGTH..]
     }

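The doc-comment change above only rewords how the u64 value encoding is described; in both versions the bytes are big-endian. Big-endian is the natural choice for term bytes because byte-wise lexicographic order then matches numeric order, which is what sorted term dictionaries and range queries generally rely on. A tiny illustration:

fn main() {
    let smaller: u64 = 1;
    let larger: u64 = 256;

    // Big-endian bytes compare lexicographically in the same order as the numbers.
    assert!(smaller.to_be_bytes() < larger.to_be_bytes());

    // Little-endian bytes would not: 1 -> [0x01, 0x00, ...], 256 -> [0x00, 0x01, ...].
    assert!(smaller.to_le_bytes() > larger.to_le_bytes());
}
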
@@ -1,6 +1,7 @@
 use std::cmp;
 use std::io::{self, Read, Write};

+use byteorder::{ByteOrder, LittleEndian};
 use common::{BinarySerializable, FixedSize};
 use tantivy_bitpacker::{compute_num_bits, BitPacker};

@@ -103,7 +104,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
     let addr_byte = addr_bits / 8;
     let bit_shift = (addr_bits % 8) as u64;
     let val_unshifted_unmasked: u64 = if data.len() >= addr_byte + 8 {
-        u64::from_le_bytes(data[addr_byte..][..8].try_into().unwrap())
+        LittleEndian::read_u64(&data[addr_byte..][..8])
     } else {
         // the buffer is not large enough.
         // Let's copy the few remaining bytes to a 8 byte buffer

@@ -112,7 +113,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
         let data_to_copy = &data[addr_byte..];
         let nbytes = data_to_copy.len();
         buf[..nbytes].copy_from_slice(data_to_copy);
-        u64::from_le_bytes(buf)
+        LittleEndian::read_u64(&buf)
     };
     let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift;
     let mask = (1u64 << u64::from(num_bits)) - 1;

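Both sides of the two `extract_bits` hunks load 8 little-endian bytes into a u64 and then shift and mask; only the helper doing the load changes. A hedged, self-contained sketch of the same shift-and-mask idea (simplified, not the exact tantivy function; assumes `num_bits` is less than 64):

// Reads `num_bits` bits starting at bit offset `addr_bits` from a little-endian
// bit-packed buffer, zero-padding when fewer than 8 bytes remain.
fn extract_bits_sketch(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 {
    let addr_byte = addr_bits / 8;
    let bit_shift = (addr_bits % 8) as u64;

    let mut buf = [0u8; 8];
    let available = (data.len() - addr_byte).min(8);
    buf[..available].copy_from_slice(&data[addr_byte..addr_byte + available]);
    let word = u64::from_le_bytes(buf);

    let mask = (1u64 << u64::from(num_bits)) - 1;
    (word >> bit_shift) & mask
}

fn main() {
    // The value 0b1011 stored starting at bit 4 of the first byte.
    let data = [0b1011_0000u8, 0b0000_0000];
    assert_eq!(extract_bits_sketch(&data, 4, 4), 0b1011);
}
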
@@ -5,5 +5,6 @@ edition = "2021"
 license = "MIT"

 [dependencies]
-murmurhash32 = "0.3"
+murmurhash32 = "0.2"
+byteorder = "1"
 common = { version = "0.5", path = "../common/", package = "tantivy-common" }

@@ -1,5 +1,6 @@
 use std::{iter, mem, slice};

+use byteorder::{ByteOrder, NativeEndian};
 use murmurhash32::murmurhash2;

 use super::{Addr, MemoryArena};

@@ -154,7 +155,7 @@ impl ArenaHashMap {
     #[inline]
     fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
        let data = self.memory_arena.slice_from(addr);
-        let key_bytes_len = u16::from_ne_bytes(data[..2].try_into().unwrap()) as usize;
+        let key_bytes_len = NativeEndian::read_u16(data) as usize;
        let key_bytes: &[u8] = &data[2..][..key_bytes_len];
        (key_bytes, addr.offset(2u32 + key_bytes_len as u32))
     }

@@ -272,7 +273,7 @@ impl ArenaHashMap {
         let key_addr = self.memory_arena.allocate_space(num_bytes);
         {
             let data = self.memory_arena.slice_mut(key_addr, num_bytes);
-            data[..2].copy_from_slice(&u16::to_ne_bytes(key.len() as u16));
+            NativeEndian::write_u16(data, key.len() as u16);
             let stop = 2 + key.len();
             data[2..stop].copy_from_slice(key);
             store(&mut data[stop..], val);

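In both versions a key lives in the arena as a 2-byte native-endian length prefix followed by the key bytes; these two hunks only change which helper reads and writes that prefix. A small standalone sketch of the layout, showing that the two prefix reads swapped above decode the same value (assumes the `byteorder` crate):

use byteorder::{ByteOrder, NativeEndian};

fn main() {
    let key = b"tantivy";

    // Layout: [len: u16 native-endian][key bytes ...]
    let mut slot = vec![0u8; 2 + key.len()];
    NativeEndian::write_u16(&mut slot, key.len() as u16);
    slot[2..].copy_from_slice(key);

    // byteorder read vs. std-only read of the same prefix.
    let len_a = NativeEndian::read_u16(&slot) as usize;
    let len_b = u16::from_ne_bytes(slot[..2].try_into().unwrap()) as usize;
    assert_eq!(len_a, len_b);
    assert_eq!(&slot[2..][..len_a], key);
}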