Removing Term Vec allocation (#881)

This commit is contained in:
Paul Masurel
2020-09-08 23:11:00 +09:00
committed by GitHub
parent ac2a7273e6
commit 70bae7ce4c
3 changed files with 69 additions and 58 deletions

View File

@@ -17,7 +17,6 @@ use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
use crate::{DocId, SegmentComponent};
use std::io;
use std::str;
/// Computes the initial size of the hash table.
///
@@ -48,6 +47,7 @@ pub struct SegmentWriter {
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<TextAnalyzer>>,
term_buffer: Term,
}
impl SegmentWriter {
@@ -91,6 +91,7 @@ impl SegmentWriter {
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
tokenizers,
term_buffer: Term::new(),
})
}
@@ -128,24 +129,26 @@ impl SegmentWriter {
if !field_options.is_indexed() {
continue;
}
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
let mut term = Term::for_field(field); // we set the Term
term_buffer.set_field(field);
let facets =
field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => {
panic!("Expected hierarchical facet");
}
});
for fake_str in facets {
let mut unordered_term_id_opt = None;
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
term_buffer.set_text(&token.text);
let unordered_term_id =
self.multifield_postings.subscribe(doc_id, &term);
multifield_postings.subscribe(doc_id, &term_buffer);
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
@@ -168,7 +171,6 @@ impl SegmentWriter {
if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to;
}
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
}
@@ -178,7 +180,6 @@ impl SegmentWriter {
{
offsets.push(total_offset);
total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
}
}
@@ -190,8 +191,12 @@ impl SegmentWriter {
0
} else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
multifield_postings.index_text(
doc_id,
field,
&mut token_stream,
term_buffer,
)
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
@@ -199,44 +204,36 @@ impl SegmentWriter {
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(
field_value.field(),
field_value.value().u64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_u64(field_value.value().u64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_i64(field_value.value().date_value().timestamp());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().i64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_i64(field_value.value().i64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}
FieldType::F64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_f64(
field_value.field(),
field_value.value().f64_value(),
);
self.multifield_postings.subscribe(doc_id, &term);
term_buffer.set_field(field_value.field());
term_buffer.set_f64(field_value.value().f64_value());
multifield_postings.subscribe(doc_id, &term_buffer);
}
}
}

View File

@@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter {
doc: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
term_buffer: &mut Term,
) -> u32 {
let postings_writer =
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
@@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter {
field,
token_stream,
&mut self.heap,
term_buffer,
)
}
@@ -220,13 +222,20 @@ pub trait PostingsWriter {
field: Field,
token_stream: &mut dyn TokenStream,
heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 {
let mut term = Term::for_field(field);
term_buffer.set_field(field);
let mut sink = |token: &Token| {
// We skip all tokens whose length exceeds MAX_TOKEN_LEN.
if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
term_buffer.set_text(token.text.as_str());
self.subscribe(
term_index,
doc_id,
token.position as u32,
&term_buffer,
heap,
);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \

View File

@@ -4,7 +4,6 @@ use super::Field;
use crate::common;
use crate::schema::Facet;
use crate::DateTime;
use byteorder::{BigEndian, ByteOrder};
use std::str;
/// Size (in bytes) of the buffer of an int field.
@@ -19,6 +18,10 @@ where
B: AsRef<[u8]>;
impl Term {
/// Creates an empty term whose backing buffer has room for at least 100 bytes.
///
/// No field id is written here, so the term holds zero bytes until
/// `set_field` is called on it.
pub(crate) fn new() -> Term {
Term(Vec::with_capacity(100))
}
/// Builds a term given a field, and an i64-value
///
/// Assuming the term has a field id of 1, and a i64 value of 3234,
@@ -93,6 +96,12 @@ impl Term {
term
}
/// Builds a term for `field` whose value is the raw `bytes`.
///
/// The term is first created with `Term::for_field(field)`, then `bytes`
/// is stored through `set_bytes`.
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Creates a new Term for a given field.
pub(crate) fn for_field(field: Field) -> Term {
let mut term = Term(Vec::with_capacity(100));
@@ -100,12 +109,10 @@ impl Term {
term
}
/// Sets the field of the term.
///
/// NOTE(review): this span shows two bodies back to back. The first one
/// overwrites the 4-byte field id prefix in place (growing the buffer to
/// 4 bytes when it is shorter). The second one clears the whole buffer and
/// then appends the big-endian field id, so any value previously stored in
/// the term is discarded.
pub fn set_field(&mut self, field: Field) {
if self.0.len() < 4 {
self.0.resize(4, 0u8);
}
BigEndian::write_u32(&mut self.0[0..4], field.field_id());
pub(crate) fn set_field(&mut self, field: Field) {
// Start from an empty buffer so that only the field id remains afterwards.
self.0.clear();
self.0
.extend_from_slice(&field.field_id().to_be_bytes()[..]);
}
/// Sets a u64 value in the term.
@@ -116,7 +123,7 @@ impl Term {
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
// Force the buffer to exactly INT_TERM_LEN bytes, zero-filling any bytes that get added.
self.0.resize(INT_TERM_LEN, 0u8);
BigEndian::write_u64(&mut self.0[4..], val);
// Store `val` in big-endian order in bytes 4..12, leaving bytes 0..4 untouched.
// NOTE(review): the `[4..12]` slice panics unless INT_TERM_LEN >= 12 — confirm the constant.
self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref());
}
/// Sets an `i64` value in the term.
@@ -134,12 +141,6 @@ impl Term {
self.0.extend(bytes);
}
/// Builds a term for `field` whose value is the raw `bytes`.
///
/// The term is first created with `Term::for_field(field)`, then `bytes`
/// is stored through `set_bytes`.
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
let mut term = Term::for_field(field);
term.set_bytes(bytes);
term
}
/// Sets the text only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.set_bytes(text.as_bytes());
@@ -157,7 +158,9 @@ where
/// Returns the field.
pub fn field(&self) -> Field {
Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
// Copy the first four bytes of the term and decode them as a big-endian u32 field id.
// The `[..4]` slice panics if the term holds fewer than four bytes.
let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}
/// Returns the `u64` value stored in a term.
@@ -166,7 +169,9 @@ where
/// ... or returns an invalid value
/// if the term is not a `u64` field.
pub fn get_u64(&self) -> u64 {
BigEndian::read_u64(&self.0.as_ref()[4..])
// NOTE(review): despite its name, `field_id_bytes` receives the bytes returned by
// `value_bytes()`, not a field id.
// `copy_from_slice` panics when source and destination differ in length, so this
// requires `value_bytes()` to be exactly 8 bytes long — TODO confirm this is
// compatible with the "returns an invalid value" wording of the doc comment.
let mut field_id_bytes = [0u8; 8];
field_id_bytes.copy_from_slice(self.value_bytes());
// Interpret the 8 copied bytes as a big-endian u64.
u64::from_be_bytes(field_id_bytes)
}
/// Returns the `i64` value stored in a term.
@@ -175,7 +180,7 @@ where
/// ... or returns an invalid value
/// if the term is not a `i64` field.
pub fn get_i64(&self) -> i64 {
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
// Read the stored value through `get_u64`, then convert it with `common::u64_to_i64`.
common::u64_to_i64(self.get_u64())
}
/// Returns the `f64` value stored in a term.
@@ -184,7 +189,7 @@ where
/// ... or returns an invalid value
/// if the term is not a `f64` field.
pub fn get_f64(&self) -> f64 {
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
// Read the stored value through `get_u64`, then convert it with `common::u64_to_f64`.
common::u64_to_f64(self.get_u64())
}
/// Returns the text associated with the term.