Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-27 20:42:54 +00:00)

Compare commits: sparse_cod...common-cra (2 commits)
Commits:

- 1658be3792
- 23fad88b35
@@ -58,10 +58,9 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
+    let table_size_limit: usize = per_thread_memory_budget / 5;
     let compute_table_size = |num_bits: usize| {
-        let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
-        table_size * mem::size_of::<KeyValue>()
+        (1 << num_bits) * mem::size_of::<KeyValue>()
     };
     let table_num_bits: usize = (1..)
         .into_iter()
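Two things change here: the hash table's share of the per-thread budget drops from a third to a fifth, and the closure's removed lines had multiplied by `mem::size_of::<KeyValue>()` twice, overestimating the table's footprint. The rest of the function is truncated by the diff viewer, so the following self-contained sketch completes it with an assumed `take_while`-style search and a stand-in `KeyValue` type; it is an illustration of the patched logic, not tantivy's exact code:

```rust
use std::mem;

// Stand-in for the hash table's entry type; the real KeyValue lives in
// tantivy's hash-map module and may differ in size.
#[allow(dead_code)]
struct KeyValue {
    key_addr: u32,
    value_addr: u32,
}

/// Mirrors the patched logic: cap the table at 1/5 of the budget, pick the
/// largest power-of-two table that fits, and give the remainder to the heap.
fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
    let table_size_limit: usize = per_thread_memory_budget / 5;
    let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
    let table_num_bits: usize = (1..)
        .take_while(|num_bits| compute_table_size(*num_bits) < table_size_limit)
        .last()
        .expect("memory budget too small for even a 2-entry table");
    let heap_size = per_thread_memory_budget - compute_table_size(table_num_bits);
    (heap_size, table_num_bits)
}

fn main() {
    // With a 10 MB budget, at most 2 MB goes to the table.
    let (heap, bits) = split_memory(10_000_000);
    println!("heap: {} bytes, table: 2^{} entries", heap, bits);
}
```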
@@ -105,6 +105,7 @@ impl IndexMerger {
         })
     }
 
+    #[inline(never)]
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fieldnorm_fastfields: Vec<Field> = self.schema
             .fields()
@@ -120,6 +121,7 @@ impl IndexMerger {
         )
     }
 
+    #[inline(never)]
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fast_fields: Vec<Field> = self.schema
             .fields()
@@ -198,6 +200,7 @@ impl IndexMerger {
         Ok(())
     }
 
+    #[inline(never)]
     fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
         let mut delta_computer = DeltaComputer::new();
 
@@ -332,6 +335,7 @@ impl IndexMerger {
         Ok(())
     }
 
+    #[inline(never)]
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
         for reader in &self.readers {
             let store_reader = reader.get_store_reader();
@@ -56,6 +56,7 @@ impl SegmentSerializer {
     }
 
     /// Finalize the segment serialization.
+    #[inline(never)]
    pub fn close(self) -> Result<()> {
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;
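Every change in the five hunks above is the same one-line addition of `#[inline(never)]`. The attribute does not alter behavior; the usual motivation (presumably the one here) is profiling: it keeps each merge/serialization phase as a distinct symbol instead of letting the optimizer fold it into its caller. A minimal sketch of the effect, with a placeholder body rather than tantivy code:

```rust
// `#[inline(never)]` prevents the compiler from inlining this function into
// its caller, so it shows up as its own frame in `perf` / flamegraph output.
#[inline(never)]
fn write_phase(data: &[u64]) -> u64 {
    data.iter().sum() // placeholder workload
}

fn main() {
    let data: Vec<u64> = (0..1_000_000).collect();
    println!("{}", write_phase(&data));
}
```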
@@ -100,13 +100,13 @@ impl TermInfoDeltaEncoder {
     pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo {
         let mut delta_term_info = DeltaTermInfo {
             doc_freq: term_info.doc_freq,
-            delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset,
+            delta_postings_offset: (term_info.postings_offset - self.term_info.postings_offset) as u32,
             delta_positions_offset: 0,
             positions_inner_offset: 0,
         };
         if self.has_positions {
             delta_term_info.delta_positions_offset =
-                term_info.positions_offset - self.term_info.positions_offset;
+                (term_info.positions_offset - self.term_info.positions_offset) as u32;
             delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
         }
         mem::replace(&mut self.term_info, term_info);
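The casts appear because `TermInfo`'s offsets are widened from u32 to u64 elsewhere in this change, while the encoded delta stays a u32: offsets grow monotonically term after term, so the gap between consecutive terms stays small even once the absolute offset passes 4 GiB. A stripped-down sketch of that invariant (the real `TermInfo` also carries doc_freq and position fields):

```rust
/// Absolute position in the postings data; u64 so files > 4 GiB work.
#[derive(Default, Clone, Copy)]
struct TermInfo {
    postings_offset: u64,
}

struct DeltaEncoder {
    prev: TermInfo,
}

impl DeltaEncoder {
    /// Offsets are monotonically increasing, so the per-term gap fits in a
    /// u32 even when the absolute offset does not.
    fn encode(&mut self, term_info: TermInfo) -> u32 {
        let delta = (term_info.postings_offset - self.prev.postings_offset) as u32;
        self.prev = term_info;
        delta
    }
}

fn main() {
    // A run of terms deep inside a > 4 GiB postings file: absolute offsets
    // need u64, the deltas stay tiny.
    let mut encoder = DeltaEncoder { prev: TermInfo { postings_offset: 5_000_000_000 } };
    for offset in [5_000_000_100u64, 5_000_001_000, 5_000_001_024] {
        println!("delta: {}", encoder.encode(TermInfo { postings_offset: offset }));
    }
}
```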
@@ -155,12 +155,12 @@ impl TermInfoDeltaDecoder {
         let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset);
         cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..];
         self.term_info.doc_freq = doc_freq;
-        self.term_info.postings_offset += delta_postings_offset;
+        self.term_info.postings_offset += delta_postings_offset as u64;
         if self.has_positions {
             let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
             let delta_positions_offset: u32 =
                 unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset);
-            self.term_info.positions_offset += delta_positions_offset;
+            self.term_info.positions_offset += delta_positions_offset as u64;
             self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
             &cursor[num_bytes_positions_offset + 1..]
         } else {
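The decoder mirrors the encoder: each delta was written using only `num_bytes` bytes, so the code reads four bytes unconditionally (the unsafe unaligned pointer read above), masks off the bytes that belong to the following field, and widens the u32 delta into the running u64 offset. A safe, self-contained sketch of that read-and-mask step; `make_mask`'s definition is not shown in the diff, so the version here is an assumption (keep the low `num_bytes` bytes):

```rust
// Assumed behavior of make_mask: keep only the low `num_bytes` bytes.
// Computed in u64 so that num_bytes == 4 yields 0xFFFF_FFFF without overflow.
fn make_mask(num_bytes: usize) -> u32 {
    ((1u64 << (num_bytes * 8)) - 1) as u32
}

// Safe equivalent of the decoder's unaligned pointer read: copy up to four
// bytes into a little-endian u32, then mask down to the encoded width.
fn read_delta(cursor: &[u8], num_bytes: usize) -> u32 {
    let n = cursor.len().min(4);
    let mut buf = [0u8; 4];
    buf[..n].copy_from_slice(&cursor[..n]);
    u32::from_le_bytes(buf) & make_mask(num_bytes)
}

fn main() {
    // Delta 0x0301 encoded over 2 bytes, followed by unrelated bytes that
    // the mask must discard.
    let bytes = [0x01u8, 0x03, 0xFF, 0xFF, 0xAA];
    assert_eq!(read_delta(&bytes, 2), 0x0301);
    println!("ok");
}
```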
@@ -1,10 +1,11 @@
 use std::io::{self, Read, Write};
-use common::BinarySerializable;
+use common::{VInt, BinarySerializable};
 
 mod termdict;
 mod streamer;
 mod delta_encoder;
 
 pub use self::delta_encoder::{TermDeltaDecoder, TermDeltaEncoder};
+pub use self::delta_encoder::{DeltaTermInfo, TermInfoDeltaDecoder, TermInfoDeltaEncoder};
@@ -15,23 +16,23 @@ pub use self::streamer::TermStreamerBuilderImpl;
 
 #[derive(Debug)]
 pub struct CheckPoint {
-    pub stream_offset: u32,
-    pub postings_offset: u32,
-    pub positions_offset: u32,
+    pub stream_offset: u64,
+    pub postings_offset: u64,
+    pub positions_offset: u64,
 }
 
 impl BinarySerializable for CheckPoint {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        self.stream_offset.serialize(writer)?;
-        self.postings_offset.serialize(writer)?;
-        self.positions_offset.serialize(writer)?;
+        VInt(self.stream_offset).serialize(writer)?;
+        VInt(self.postings_offset).serialize(writer)?;
+        VInt(self.positions_offset).serialize(writer)?;
         Ok(())
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let stream_offset = u32::deserialize(reader)?;
-        let postings_offset = u32::deserialize(reader)?;
-        let positions_offset = u32::deserialize(reader)?;
+        let stream_offset = VInt::deserialize(reader)?.0;
+        let postings_offset = VInt::deserialize(reader)?.0;
+        let positions_offset = VInt::deserialize(reader)?.0;
         Ok(CheckPoint {
             stream_offset,
             postings_offset,
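Widening the checkpoint offsets to u64 would double their fixed-width footprint on disk, so the serialization switches to `VInt` in the same change. Assuming tantivy's `VInt` is the usual little-endian base-128 varint (7 payload bits per byte, high bit as a continuation flag), a self-contained sketch of the trade-off:

```rust
use std::io::{self, Read, Write};

// Sketch of a standard little-endian base-128 varint: small u64 values,
// like most checkpoint offsets, take 1-2 bytes instead of a fixed 8.
fn serialize_vint<W: Write>(mut val: u64, writer: &mut W) -> io::Result<()> {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            writer.write_all(&[byte])?;
            return Ok(());
        }
        writer.write_all(&[byte | 0x80])?; // high bit set: more bytes follow
    }
}

fn deserialize_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
    let mut result = 0u64;
    let mut shift = 0;
    loop {
        let mut byte = [0u8; 1];
        reader.read_exact(&mut byte)?;
        result |= u64::from(byte[0] & 0x7F) << shift;
        if byte[0] & 0x80 == 0 {
            return Ok(result);
        }
        shift += 7;
    }
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    serialize_vint(300, &mut buf)?;
    assert_eq!(buf.len(), 2); // vs. 8 bytes for a fixed-width u64
    assert_eq!(deserialize_vint(&mut buf.as_slice())?, 300);
    println!("ok");
    Ok(())
}
```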
@@ -28,11 +28,10 @@ fn has_positions(field_type: &FieldType) -> bool {
     match *field_type {
         FieldType::Str(ref text_options) => {
             let indexing_options = text_options.get_indexing_options();
-            if indexing_options.is_position_enabled() {
-                true
-            } else {
-                false
+            if let Some(text_field_indexing) = indexing_options {
+                return text_field_indexing.index_option().has_positions()
             }
+            return false;
         }
         _ => false,
     }
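The rewritten arm reflects `get_indexing_options` now returning an `Option`. The same logic can also be written without early returns using `Option` combinators; a sketch with hypothetical stub types standing in for tantivy's schema types:

```rust
// Stub types, just to make the combinator form compile on its own.
#[derive(Clone, Copy)]
struct IndexRecordOption { positions: bool }
impl IndexRecordOption {
    fn has_positions(&self) -> bool { self.positions }
}
struct TextFieldIndexing { option: IndexRecordOption }
impl TextFieldIndexing {
    fn index_option(&self) -> IndexRecordOption { self.option }
}

// Equivalent to the refactored arm: None means "no positions".
fn has_positions(indexing_options: Option<&TextFieldIndexing>) -> bool {
    indexing_options
        .map(|indexing| indexing.index_option().has_positions())
        .unwrap_or(false)
}

fn main() {
    assert!(!has_positions(None));
    assert!(has_positions(Some(&TextFieldIndexing {
        option: IndexRecordOption { positions: true },
    })));
    println!("ok");
}
```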
@@ -60,10 +59,10 @@ where
     W: Write,
 {
     fn add_index_entry(&mut self) {
-        let stream_offset = self.write.written_bytes() as u32;
+        let stream_offset: u64 = self.write.written_bytes() as u64;
         let term_info = self.term_info_encoder.term_info();
-        let postings_offset = term_info.postings_offset as u32;
-        let positions_offset = term_info.positions_offset as u32;
+        let postings_offset: u64 = term_info.postings_offset;
+        let positions_offset: u64 = term_info.positions_offset;
         let checkpoint = CheckPoint {
             stream_offset,
             postings_offset,
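The `as u32` truncations this hunk removes are exactly the failure mode the widening addresses: once a segment's postings data crosses 4 GiB, a truncated checkpoint silently points at the wrong byte. A two-line illustration:

```rust
fn main() {
    // An offset past the 4 GiB mark wraps when narrowed to u32.
    let postings_offset: u64 = 5_000_000_000;
    let truncated = postings_offset as u32; // 705_032_704: wrong but valid-looking
    assert_ne!(postings_offset, u64::from(truncated));
    println!("u64: {postings_offset}, as u32: {truncated}");
}
```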
src/tokenizer/alphanum_only.rs (new file, 65 lines)
@@ -0,0 +1,65 @@
+use super::{Token, TokenFilter, TokenStream};
+
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    tail: TailTokenStream,
+}
+
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+
+    fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream { tail }
+    }
+}
+
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+where
+    TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
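The filter's `advance` loop keeps pulling tokens from the wrapped stream until one passes the predicate, and the predicate is strict ASCII: it drops not only punctuation-bearing tokens like "C++" or "won't" but any token containing a non-ASCII letter. A quick standalone check of the predicate's behavior:

```rust
fn main() {
    // The filter's predicate: keep a token only if every char is ASCII
    // alphanumeric.
    let keep = |text: &str| text.chars().all(|c| c.is_ascii_alphanumeric());
    for token in ["hello", "42", "C++", "won't", "naïve"] {
        println!("{:>6} -> {}", token, if keep(token) { "kept" } else { "dropped" });
    }
}
```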
@@ -137,7 +137,10 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;
 
+
+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;
@@ -7,6 +7,7 @@ use tokenizer::RawTokenizer;
 use tokenizer::SimpleTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::RemoveLongFilter;
+use tokenizer::AlphaNumOnlyFilter;
 use tokenizer::LowerCaser;
 use tokenizer::Stemmer;
@@ -70,6 +71,7 @@ impl Default for TokenizerManager {
             SimpleTokenizer
                 .filter(RemoveLongFilter::limit(40))
                 .filter(LowerCaser)
+                .filter(AlphaNumOnlyFilter)
                 .filter(Stemmer::new()),
         );
         manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
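With this registration the default pipeline becomes SimpleTokenizer → RemoveLongFilter(40) → LowerCaser → AlphaNumOnlyFilter → Stemmer. Since SimpleTokenizer already splits on non-alphanumeric characters, what the new stage mainly catches are tokens containing non-ASCII alphanumerics, shielding the English stemmer from them. A toy approximation with iterator adapters (not tantivy's streaming API) makes the ordering visible:

```rust
fn main() {
    let text = "Naïve TOKENIZERS, circa 2017";
    let tokens: Vec<String> = text
        .split(|c: char| !c.is_alphanumeric()) // SimpleTokenizer-ish split
        .filter(|t| !t.is_empty())
        .filter(|t| t.len() <= 40)              // RemoveLongFilter::limit(40)
        .map(str::to_lowercase)                 // LowerCaser
        // The newly inserted stage: drops "naïve" (non-ASCII) before the
        // English stemmer would see it.
        .filter(|t| t.chars().all(|c| c.is_ascii_alphanumeric()))
        .collect();                             // (Stemmer stage omitted)
    println!("{:?}", tokens); // ["tokenizers", "circa", "2017"]
}
```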