mirror of https://github.com/quickwit-oss/tantivy.git
synced 2025-12-29 21:42:55 +00:00

Compare commits
2 commits (python-bin ... common-cra)

| Author | SHA1 | Date |
|---|---|---|
|  | 1658be3792 |  |
|  | 23fad88b35 |  |
```diff
@@ -58,10 +58,9 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
+    let table_size_limit: usize = per_thread_memory_budget / 5;
     let compute_table_size = |num_bits: usize| {
-        let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
-        table_size * mem::size_of::<KeyValue>()
+        (1 << num_bits) * mem::size_of::<KeyValue>()
     };
     let table_num_bits: usize = (1..)
         .into_iter()
```
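Two fixes land in this hunk: the old closure computed `(1 << num_bits) * mem::size_of::<KeyValue>()` and then multiplied by `mem::size_of::<KeyValue>()` a second time, overstating the table's footprint, and the table's share of the per-thread budget shrinks from a third to a fifth. A self-contained sketch of the corrected arithmetic; the `KeyValue` layout, the search loop, and the budget value are illustrative assumptions, not the crate's actual code:

```rust
use std::mem;

// Stand-in for the hash table's entry type (hypothetical 8-byte layout).
#[allow(dead_code)]
#[derive(Copy, Clone)]
struct KeyValue {
    key_addr: u32,
    value_addr: u32,
}

/// Returns (the heap size in bytes, the hash table size in number of bits).
fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
    // The table may now claim at most a fifth of the budget (was a third).
    let table_size_limit: usize = per_thread_memory_budget / 5;
    // Fixed closure: size_of::<KeyValue>() is applied exactly once.
    let compute_table_size = |num_bits: usize| (1usize << num_bits) * mem::size_of::<KeyValue>();
    // Largest power-of-two table that still fits under the limit.
    let table_num_bits: usize = (1..)
        .take_while(|&num_bits| compute_table_size(num_bits) <= table_size_limit)
        .last()
        .expect("budget too small for a 2-slot table");
    let heap_size = per_thread_memory_budget - compute_table_size(table_num_bits);
    (heap_size, table_num_bits)
}

fn main() {
    let (heap, bits) = split_memory(60_000_000); // hypothetical 60 MB budget
    println!("heap = {} bytes, table = 2^{} entries", heap, bits);
}
```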
```diff
@@ -105,6 +105,7 @@ impl IndexMerger {
         })
     }
 
+    #[inline(never)]
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fieldnorm_fastfields: Vec<Field> = self.schema
             .fields()
@@ -120,6 +121,7 @@ impl IndexMerger {
         )
     }
 
+    #[inline(never)]
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fast_fields: Vec<Field> = self.schema
             .fields()
@@ -198,6 +200,7 @@ impl IndexMerger {
         Ok(())
     }
 
+    #[inline(never)]
     fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
         let mut delta_computer = DeltaComputer::new();
 
@@ -332,6 +335,7 @@ impl IndexMerger {
         Ok(())
     }
 
+    #[inline(never)]
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
         for reader in &self.readers {
             let store_reader = reader.get_store_reader();
```
```diff
@@ -56,6 +56,7 @@ impl SegmentSerializer {
     }
 
     /// Finalize the segment serialization.
+    #[inline(never)]
    pub fn close(self) -> Result<()> {
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;
```
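The `#[inline(never)]` hints added across `IndexMerger` and on `SegmentSerializer::close` look like profiling aids: they stop the optimizer from folding each merge/serialization phase into its caller, so each phase stays a distinct symbol that a sampling profiler can attribute time to. A toy illustration of the effect (function names and workload invented):

```rust
// Without the attribute, the optimizer may inline both phases into main,
// and a sampling profiler (e.g. perf) would attribute all time to one symbol.
#[inline(never)]
fn phase_a(data: &[u64]) -> u64 {
    data.iter().copied().sum()
}

#[inline(never)]
fn phase_b(data: &[u64]) -> u64 {
    data.iter().copied().max().unwrap_or(0)
}

fn main() {
    let data: Vec<u64> = (0..1_000_000).collect();
    // Each phase now shows up as its own stack frame in profiles.
    println!("{} {}", phase_a(&data), phase_b(&data));
}
```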
```diff
@@ -100,13 +100,13 @@ impl TermInfoDeltaEncoder {
     pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo {
         let mut delta_term_info = DeltaTermInfo {
             doc_freq: term_info.doc_freq,
-            delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset,
+            delta_postings_offset: (term_info.postings_offset - self.term_info.postings_offset) as u32,
             delta_positions_offset: 0,
             positions_inner_offset: 0,
         };
         if self.has_positions {
             delta_term_info.delta_positions_offset =
-                term_info.positions_offset - self.term_info.positions_offset;
+                (term_info.positions_offset - self.term_info.positions_offset) as u32;
             delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
         }
         mem::replace(&mut self.term_info, term_info);
```
```diff
@@ -155,12 +155,12 @@ impl TermInfoDeltaDecoder {
         let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset);
         cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..];
         self.term_info.doc_freq = doc_freq;
-        self.term_info.postings_offset += delta_postings_offset;
+        self.term_info.postings_offset += delta_postings_offset as u64;
         if self.has_positions {
             let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
             let delta_positions_offset: u32 =
                 unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset);
-            self.term_info.positions_offset += delta_positions_offset;
+            self.term_info.positions_offset += delta_positions_offset as u64;
             self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
             &cursor[num_bytes_positions_offset + 1..]
         } else {
```
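The encoder/decoder pair keeps the per-term deltas at `u32` on disk while the running absolute offsets widen to `u64`: consecutive terms sit close together, so their deltas fit in 32 bits even once the absolute postings/positions offsets pass 4 GiB. A stripped-down sketch of that accumulation (types and names are illustrative, not the crate's):

```rust
// Absolute offsets are u64; per-term deltas stay u32 on disk.
struct TermInfoState {
    postings_offset: u64,
}

fn encode_delta(prev: &TermInfoState, next_postings_offset: u64) -> u32 {
    // Sound as long as two consecutive terms are < 4 GiB apart.
    (next_postings_offset - prev.postings_offset) as u32
}

fn decode_delta(state: &mut TermInfoState, delta: u32) {
    // Widen before adding so the running offset can grow past 4 GiB.
    state.postings_offset += delta as u64;
}

fn main() {
    let mut state = TermInfoState { postings_offset: u32::MAX as u64 - 10 };
    let delta = encode_delta(&state, u32::MAX as u64 + 100); // crosses 4 GiB
    decode_delta(&mut state, delta);
    assert_eq!(state.postings_offset, u32::MAX as u64 + 100);
    println!("offset = {}", state.postings_offset);
}
```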
```diff
@@ -1,10 +1,11 @@
 use std::io::{self, Read, Write};
-use common::BinarySerializable;
+use common::{VInt, BinarySerializable};
 
 mod termdict;
 mod streamer;
 mod delta_encoder;
 
+
 pub use self::delta_encoder::{TermDeltaDecoder, TermDeltaEncoder};
 pub use self::delta_encoder::{DeltaTermInfo, TermInfoDeltaDecoder, TermInfoDeltaEncoder};
 
```
```diff
@@ -15,23 +16,23 @@ pub use self::streamer::TermStreamerBuilderImpl;
 
 #[derive(Debug)]
 pub struct CheckPoint {
-    pub stream_offset: u32,
-    pub postings_offset: u32,
-    pub positions_offset: u32,
+    pub stream_offset: u64,
+    pub postings_offset: u64,
+    pub positions_offset: u64,
 }
 
 impl BinarySerializable for CheckPoint {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        self.stream_offset.serialize(writer)?;
-        self.postings_offset.serialize(writer)?;
-        self.positions_offset.serialize(writer)?;
+        VInt(self.stream_offset).serialize(writer)?;
+        VInt(self.postings_offset).serialize(writer)?;
+        VInt(self.positions_offset).serialize(writer)?;
         Ok(())
     }
 
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let stream_offset = u32::deserialize(reader)?;
-        let postings_offset = u32::deserialize(reader)?;
-        let positions_offset = u32::deserialize(reader)?;
+        let stream_offset = VInt::deserialize(reader)?.0;
+        let postings_offset = VInt::deserialize(reader)?.0;
+        let positions_offset = VInt::deserialize(reader)?.0;
         Ok(CheckPoint {
             stream_offset,
             postings_offset,
```
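Widening `CheckPoint`'s offsets to `u64` would double their fixed-width cost on disk, so the patch serializes them as `VInt`, a variable-length integer where small values stay small. The real `VInt` lives in the crate's `common` module; the sketch below assumes a LEB128-style encoding, which is the usual shape of such a type:

```rust
use std::io::{self, Read, Write};

// LEB128-style varint sketch, in the spirit of common::VInt (assumed layout).
struct VInt(u64);

impl VInt {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        let mut v = self.0;
        loop {
            let byte = (v & 0x7F) as u8;
            v >>= 7;
            if v == 0 {
                return writer.write_all(&[byte]); // high bit clear: last byte
            }
            writer.write_all(&[byte | 0x80])?; // high bit set: more follow
        }
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<VInt> {
        let mut result = 0u64;
        let mut shift = 0;
        loop {
            let mut buf = [0u8; 1];
            reader.read_exact(&mut buf)?;
            result |= ((buf[0] & 0x7F) as u64) << shift;
            if buf[0] & 0x80 == 0 {
                return Ok(VInt(result));
            }
            shift += 7;
        }
    }
}

fn main() -> io::Result<()> {
    // Small offsets cost one or two bytes instead of a fixed 4 (u32) or 8 (u64).
    let mut buf = Vec::new();
    VInt(300).serialize(&mut buf)?;
    assert_eq!(buf.len(), 2);
    assert_eq!(VInt::deserialize(&mut buf.as_slice())?.0, 300);
    Ok(())
}
```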
```diff
@@ -28,11 +28,10 @@ fn has_positions(field_type: &FieldType) -> bool {
     match *field_type {
         FieldType::Str(ref text_options) => {
             let indexing_options = text_options.get_indexing_options();
-            if indexing_options.is_position_enabled() {
-                true
-            } else {
-                false
-            }
+            if let Some(text_field_indexing) = indexing_options {
+                return text_field_indexing.index_option().has_positions()
+            }
+            return false;
         }
         _ => false,
     }
```
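The rewrite makes the `Option` explicit: `get_indexing_options()` evidently returns an `Option` in the new API, and position tracking is derived from the field's index record option rather than a dedicated boolean. A reduced sketch of that control flow with stand-in types (the real ones are tantivy's `FieldType`, `TextFieldIndexing`, and its index option enum):

```rust
// Stand-in for the index record option enum.
#[derive(Clone, Copy)]
enum IndexRecordOption {
    Basic,
    WithFreqs,
    WithFreqsAndPositions,
}

impl IndexRecordOption {
    fn has_positions(self) -> bool {
        matches!(self, IndexRecordOption::WithFreqsAndPositions)
    }
}

// Stand-in for TextFieldIndexing.
struct TextFieldIndexing {
    record: IndexRecordOption,
}

impl TextFieldIndexing {
    fn index_option(&self) -> IndexRecordOption {
        self.record
    }
}

fn has_positions(indexing_options: Option<&TextFieldIndexing>) -> bool {
    // Unindexed text fields carry no indexing options at all, hence Option.
    if let Some(text_field_indexing) = indexing_options {
        return text_field_indexing.index_option().has_positions();
    }
    false
}

fn main() {
    let indexed = TextFieldIndexing { record: IndexRecordOption::WithFreqsAndPositions };
    assert!(has_positions(Some(&indexed)));
    assert!(!has_positions(None));
}
```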
```diff
@@ -60,10 +59,10 @@
     W: Write,
 {
     fn add_index_entry(&mut self) {
-        let stream_offset = self.write.written_bytes() as u32;
+        let stream_offset: u64 = self.write.written_bytes() as u64;
         let term_info = self.term_info_encoder.term_info();
-        let postings_offset = term_info.postings_offset as u32;
-        let positions_offset = term_info.positions_offset as u32;
+        let postings_offset: u64 = term_info.postings_offset;
+        let positions_offset: u64 = term_info.positions_offset;
         let checkpoint = CheckPoint {
             stream_offset,
             postings_offset,
```
src/tokenizer/alphanum_only.rs (new file, 65 lines)

```diff
@@ -0,0 +1,65 @@
+use super::{Token, TokenFilter, TokenStream};
+
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+
+
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    tail: TailTokenStream,
+}
+
+
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+
+    fn wrap(
+        tail: TailTokenStream,
+    ) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream {
+            tail
+        }
+    }
+}
+
+
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+where
+    TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
```
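`AlphaNumOnlyFilter` drops every token containing a character outside `[a-zA-Z0-9]`; `advance()` simply keeps pulling from the wrapped stream until the predicate passes or the input ends. A usage sketch built from the trait surface visible in this diff (`filter`, `advance`, `token`); the `token_stream` entry point and the sample text are assumptions about this tantivy version's API:

```rust
use tantivy::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, Tokenizer};

fn main() {
    let tokenizer = SimpleTokenizer.filter(AlphaNumOnlyFilter);
    // Assuming SimpleTokenizer's Unicode-aware split keeps "héllo" as one
    // token, the filter then drops it: 'é' is not ASCII alphanumeric.
    let mut stream = tokenizer.token_stream("hello héllo world");
    while stream.advance() {
        println!("{}", stream.token().text); // prints "hello", then "world"
    }
}
```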
```diff
@@ -137,7 +137,10 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;
 
+
+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;
```
```diff
@@ -7,6 +7,7 @@ use tokenizer::RawTokenizer;
 use tokenizer::SimpleTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::RemoveLongFilter;
+use tokenizer::AlphaNumOnlyFilter;
 use tokenizer::LowerCaser;
 use tokenizer::Stemmer;
 
```
```diff
@@ -70,6 +71,7 @@ impl Default for TokenizerManager {
         SimpleTokenizer
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
+            .filter(AlphaNumOnlyFilter)
             .filter(Stemmer::new()),
     );
     manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));
```
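Note the filter order in the default `en` pipeline: tokens are length-capped, lowercased, stripped of non-ASCII-alphanumeric tokens, and only then stemmed, so the stemmer never sees tokens the new filter would discard. Registering an equivalent pipeline under a custom name follows the same pattern (the name `en_ascii` is illustrative):

```rust
use tantivy::tokenizer::{
    AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
    Tokenizer, TokenizerManager,
};

fn main() {
    let manager = TokenizerManager::default();
    // Same shape as the default "en" pipeline in the diff above.
    manager.register(
        "en_ascii",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(AlphaNumOnlyFilter)
            .filter(Stemmer::new()),
    );
}
```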