Compare commits


2 Commits

Author        SHA1        Message                                                                      Date
Paul Masurel  1658be3792  Various changes. Need to cherrypick some of them and put them into master    2017-12-25 10:35:10 +09:00
Paul Masurel  23fad88b35  NOBUG common crawl, streamdict works with 64 bits (hopefully)                2017-12-21 22:44:50 +09:00
9 changed files with 98 additions and 24 deletions

View File

@@ -58,10 +58,9 @@ mod murmurhash2 {
 ///
 /// Returns (the heap size in bytes, the hash table size in number of bits)
 pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
-    let table_size_limit: usize = per_thread_memory_budget / 3;
+    let table_size_limit: usize = per_thread_memory_budget / 5;
     let compute_table_size = |num_bits: usize| {
-        let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
-        table_size * mem::size_of::<KeyValue>()
+        (1 << num_bits) * mem::size_of::<KeyValue>()
     };
     let table_num_bits: usize = (1..)
         .into_iter()
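
The hunk above does two things: it lowers the hash table's share of the per-thread memory budget from a third to a fifth, and it fixes the size closure, which previously multiplied by mem::size_of::<KeyValue>() twice. A minimal sketch of the fixed computation; the search over num_bits is an assumption (the hunk is truncated after .into_iter()), and KeyValue here is a stand-in for the real type:

    use std::mem;

    #[allow(dead_code)]
    struct KeyValue { key: u32, value_addr: u32 } // stand-in for the hashmap's entry type

    // Pick the largest power-of-two table that fits in a fifth of the budget,
    // and give the remainder to the heap.
    fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
        let table_size_limit: usize = per_thread_memory_budget / 5;
        let compute_table_size = |num_bits: usize| (1usize << num_bits) * mem::size_of::<KeyValue>();
        let table_num_bits: usize = (1usize..)
            .take_while(|&num_bits| compute_table_size(num_bits) < table_size_limit)
            .last()
            .unwrap_or(1);
        let heap_size = per_thread_memory_budget - compute_table_size(table_num_bits);
        (heap_size, table_num_bits)
    }

    fn main() {
        let (heap_size, table_num_bits) = split_memory(10_000_000);
        println!("heap: {} bytes, table: 2^{} slots", heap_size, table_num_bits);
    }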

View File

@@ -105,6 +105,7 @@ impl IndexMerger {
         })
     }
+    #[inline(never)]
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fieldnorm_fastfields: Vec<Field> = self.schema
             .fields()
@@ -120,6 +121,7 @@ impl IndexMerger {
         )
     }
+    #[inline(never)]
     fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
         let fast_fields: Vec<Field> = self.schema
             .fields()
@@ -198,6 +200,7 @@ impl IndexMerger {
         Ok(())
     }
+    #[inline(never)]
     fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
         let mut delta_computer = DeltaComputer::new();
@@ -332,6 +335,7 @@ impl IndexMerger {
         Ok(())
     }
+    #[inline(never)]
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
         for reader in &self.readers {
            let store_reader = reader.get_store_reader();
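
The only change in this file (and the next one) is sprinkling #[inline(never)] on the merge's write_* phases. The likely intent, stated as an assumption, is profiling: the attribute keeps each phase as a distinct symbol so it appears as its own frame in stack traces. A standalone sketch of the effect:

    // Without the attribute, the compiler may inline this into its caller
    // and the function disappears from profiler stack traces.
    #[inline(never)]
    fn expensive_phase(data: &[u64]) -> u64 {
        data.iter().copied().sum()
    }

    fn main() {
        let v: Vec<u64> = (0..1_000_000).collect();
        println!("{}", expensive_phase(&v));
    }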

View File

@@ -56,6 +56,7 @@ impl SegmentSerializer {
     }

     /// Finalize the segment serialization.
+    #[inline(never)]
     pub fn close(self) -> Result<()> {
         self.fast_field_serializer.close()?;
         self.postings_serializer.close()?;

View File

@@ -100,13 +100,13 @@ impl TermInfoDeltaEncoder {
     pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo {
         let mut delta_term_info = DeltaTermInfo {
             doc_freq: term_info.doc_freq,
-            delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset,
+            delta_postings_offset: (term_info.postings_offset - self.term_info.postings_offset) as u32,
             delta_positions_offset: 0,
             positions_inner_offset: 0,
         };
         if self.has_positions {
             delta_term_info.delta_positions_offset =
-                term_info.positions_offset - self.term_info.positions_offset;
+                (term_info.positions_offset - self.term_info.positions_offset) as u32;
             delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
         }
         mem::replace(&mut self.term_info, term_info);
@@ -155,12 +155,12 @@ impl TermInfoDeltaDecoder {
         let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset);
         cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..];
         self.term_info.doc_freq = doc_freq;
-        self.term_info.postings_offset += delta_postings_offset;
+        self.term_info.postings_offset += delta_postings_offset as u64;
         if self.has_positions {
             let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
             let delta_positions_offset: u32 =
                 unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset);
-            self.term_info.positions_offset += delta_positions_offset;
+            self.term_info.positions_offset += delta_positions_offset as u64;
             self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
             &cursor[num_bytes_positions_offset + 1..]
         } else {
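
Here the absolute postings_offset and positions_offset become u64 (the widening of the fields lands in the next file), while the per-term deltas are narrowed back to u32 for the compact encoding. A hypothetical round trip illustrating why that is safe, under the assumption that two consecutive terms' offsets never differ by 4 GB or more:

    // Absolute offsets are u64 (the 64-bit support this commit is about);
    // deltas between consecutive terms stay small, so they fit in a u32.
    fn encode_delta(prev: u64, current: u64) -> u32 {
        (current - prev) as u32
    }

    fn decode_delta(prev: u64, delta: u32) -> u64 {
        prev + delta as u64
    }

    fn main() {
        let prev: u64 = 5_000_000_000; // already beyond u32::MAX
        let current: u64 = 5_000_001_234;
        let delta = encode_delta(prev, current);
        assert_eq!(decode_delta(prev, delta), current); // the decoder reverses it
        println!("delta = {}", delta);
    }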

View File

@@ -1,10 +1,11 @@
 use std::io::{self, Read, Write};
-use common::BinarySerializable;
+use common::{VInt, BinarySerializable};

 mod termdict;
 mod streamer;
 mod delta_encoder;

 pub use self::delta_encoder::{TermDeltaDecoder, TermDeltaEncoder};
 pub use self::delta_encoder::{DeltaTermInfo, TermInfoDeltaDecoder, TermInfoDeltaEncoder};
@@ -15,23 +16,23 @@ pub use self::streamer::TermStreamerBuilderImpl;
 #[derive(Debug)]
 pub struct CheckPoint {
-    pub stream_offset: u32,
-    pub postings_offset: u32,
-    pub positions_offset: u32,
+    pub stream_offset: u64,
+    pub postings_offset: u64,
+    pub positions_offset: u64,
 }

 impl BinarySerializable for CheckPoint {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        self.stream_offset.serialize(writer)?;
-        self.postings_offset.serialize(writer)?;
-        self.positions_offset.serialize(writer)?;
+        VInt(self.stream_offset).serialize(writer)?;
+        VInt(self.postings_offset).serialize(writer)?;
+        VInt(self.positions_offset).serialize(writer)?;
         Ok(())
     }

     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
-        let stream_offset = u32::deserialize(reader)?;
-        let postings_offset = u32::deserialize(reader)?;
-        let positions_offset = u32::deserialize(reader)?;
+        let stream_offset = VInt::deserialize(reader)?.0;
+        let postings_offset = VInt::deserialize(reader)?.0;
+        let positions_offset = VInt::deserialize(reader)?.0;
         Ok(CheckPoint {
             stream_offset,
             postings_offset,
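
Widening CheckPoint's three offsets to u64 would double the fixed-width footprint from 12 to 24 bytes, so the serialization switches to VInt at the same time and small offsets stay small on disk. A sketch of the usual variable-length scheme (7 payload bits per byte, high bit as a continuation flag); this is an assumption about VInt's wire format, not a copy of it:

    fn vint_write(mut v: u64, out: &mut Vec<u8>) {
        loop {
            let byte = (v & 0x7f) as u8;
            v >>= 7;
            if v == 0 {
                out.push(byte); // last byte: high bit clear
                return;
            }
            out.push(byte | 0x80); // more bytes follow
        }
    }

    fn main() {
        let mut buf = Vec::new();
        vint_write(127, &mut buf);           // 1 byte
        vint_write(5_000_000_000, &mut buf); // 5 bytes instead of a fixed 8
        println!("{:?}", buf);
    }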

View File

@@ -28,11 +28,10 @@ fn has_positions(field_type: &FieldType) -> bool {
     match *field_type {
         FieldType::Str(ref text_options) => {
             let indexing_options = text_options.get_indexing_options();
-            if indexing_options.is_position_enabled() {
-                true
-            } else {
-                false
-            }
+            if let Some(text_field_indexing) = indexing_options {
+                return text_field_indexing.index_option().has_positions()
+            }
+            return false;
         }
         _ => false,
     }
@@ -60,10 +59,10 @@ where
     W: Write,
 {
     fn add_index_entry(&mut self) {
-        let stream_offset = self.write.written_bytes() as u32;
+        let stream_offset: u64 = self.write.written_bytes() as u64;
         let term_info = self.term_info_encoder.term_info();
-        let postings_offset = term_info.postings_offset as u32;
-        let positions_offset = term_info.positions_offset as u32;
+        let postings_offset: u64 = term_info.postings_offset;
+        let positions_offset: u64 = term_info.positions_offset;
         let checkpoint = CheckPoint {
             stream_offset,
             postings_offset,

View File

@@ -0,0 +1,65 @@
+use super::{Token, TokenFilter, TokenStream};
+
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    tail: TailTokenStream,
+}
+
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+
+    fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream { tail }
+    }
+}
+
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+where
+    TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
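
The whole filter reduces to its predicate: a token survives only if every character is ASCII alphanumeric, which drops punctuation and, notably, any accented or non-Latin token as well. The predicate in isolation (same expression as above):

    fn keep(token_text: &str) -> bool {
        token_text.chars().all(|c| c.is_ascii_alphanumeric())
    }

    fn main() {
        assert!(keep("hello42"));
        assert!(!keep("a-b"));  // punctuation rejected
        assert!(!keep("löwe")); // non-ASCII rejected too
        println!("predicate behaves as expected");
    }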

View File

@@ -137,7 +137,10 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;

+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;

View File

@@ -7,6 +7,7 @@ use tokenizer::RawTokenizer;
 use tokenizer::SimpleTokenizer;
 use tokenizer::JapaneseTokenizer;
 use tokenizer::RemoveLongFilter;
+use tokenizer::AlphaNumOnlyFilter;
 use tokenizer::LowerCaser;
 use tokenizer::Stemmer;
@@ -70,6 +71,7 @@ impl Default for TokenizerManager {
         SimpleTokenizer
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
+            .filter(AlphaNumOnlyFilter)
             .filter(Stemmer::new()),
     );
     manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40)));