mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-19 17:50:42 +00:00
Removing Term Vec allocation (#881)
This commit is contained in:
@@ -17,7 +17,6 @@ use crate::tokenizer::{TokenStreamChain, Tokenizer};
|
||||
use crate::Opstamp;
|
||||
use crate::{DocId, SegmentComponent};
|
||||
use std::io;
|
||||
use std::str;
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
@@ -48,6 +47,7 @@ pub struct SegmentWriter {
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<TextAnalyzer>>,
|
||||
term_buffer: Term,
|
||||
}
|
||||
|
||||
impl SegmentWriter {
|
||||
@@ -91,6 +91,7 @@ impl SegmentWriter {
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers,
|
||||
term_buffer: Term::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -128,24 +129,26 @@ impl SegmentWriter {
|
||||
if !field_options.is_indexed() {
|
||||
continue;
|
||||
}
|
||||
let (term_buffer, multifield_postings) =
|
||||
(&mut self.term_buffer, &mut self.multifield_postings);
|
||||
match *field_options.field_type() {
|
||||
FieldType::HierarchicalFacet => {
|
||||
let facets: Vec<&str> = field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Facet(ref facet) => Some(facet.encoded_str()),
|
||||
_ => {
|
||||
panic!("Expected hierarchical facet");
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut term = Term::for_field(field); // we set the Term
|
||||
term_buffer.set_field(field);
|
||||
let facets =
|
||||
field_values
|
||||
.iter()
|
||||
.flat_map(|field_value| match *field_value.value() {
|
||||
Value::Facet(ref facet) => Some(facet.encoded_str()),
|
||||
_ => {
|
||||
panic!("Expected hierarchical facet");
|
||||
}
|
||||
});
|
||||
for fake_str in facets {
|
||||
let mut unordered_term_id_opt = None;
|
||||
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
|
||||
term.set_text(&token.text);
|
||||
term_buffer.set_text(&token.text);
|
||||
let unordered_term_id =
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
unordered_term_id_opt = Some(unordered_term_id);
|
||||
});
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
@@ -168,7 +171,6 @@ impl SegmentWriter {
|
||||
if let Some(last_token) = tok_str.tokens.last() {
|
||||
total_offset += last_token.offset_to;
|
||||
}
|
||||
|
||||
token_streams
|
||||
.push(PreTokenizedStream::from(tok_str.clone()).into());
|
||||
}
|
||||
@@ -178,7 +180,6 @@ impl SegmentWriter {
|
||||
{
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
|
||||
token_streams.push(tokenizer.token_stream(text));
|
||||
}
|
||||
}
|
||||
@@ -190,8 +191,12 @@ impl SegmentWriter {
|
||||
0
|
||||
} else {
|
||||
let mut token_stream = TokenStreamChain::new(offsets, token_streams);
|
||||
self.multifield_postings
|
||||
.index_text(doc_id, field, &mut token_stream)
|
||||
multifield_postings.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&mut token_stream,
|
||||
term_buffer,
|
||||
)
|
||||
};
|
||||
|
||||
self.fieldnorms_writer.record(doc_id, field, num_tokens);
|
||||
@@ -199,44 +204,36 @@ impl SegmentWriter {
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_u64(
|
||||
field_value.field(),
|
||||
field_value.value().u64_value(),
|
||||
);
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_u64(field_value.value().u64_value());
|
||||
multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Date(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(
|
||||
field_value.field(),
|
||||
field_value.value().date_value().timestamp(),
|
||||
);
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_i64(field_value.value().date_value().timestamp());
|
||||
multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(
|
||||
field_value.field(),
|
||||
field_value.value().i64_value(),
|
||||
);
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_i64(field_value.value().i64_value());
|
||||
multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::F64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_f64(
|
||||
field_value.field(),
|
||||
field_value.value().f64_value(),
|
||||
);
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
term_buffer.set_field(field_value.field());
|
||||
term_buffer.set_f64(field_value.value().f64_value());
|
||||
multifield_postings.subscribe(doc_id, &term_buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter {
|
||||
doc: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut Term,
|
||||
) -> u32 {
|
||||
let postings_writer =
|
||||
self.per_field_postings_writers[field.field_id() as usize].deref_mut();
|
||||
@@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter {
|
||||
field,
|
||||
token_stream,
|
||||
&mut self.heap,
|
||||
term_buffer,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -220,13 +222,20 @@ pub trait PostingsWriter {
|
||||
field: Field,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
heap: &mut MemoryArena,
|
||||
term_buffer: &mut Term,
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
term_buffer.set_field(field);
|
||||
let mut sink = |token: &Token| {
|
||||
// We skip all tokens with a len greater than u16.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
term_buffer.set_text(token.text.as_str());
|
||||
self.subscribe(
|
||||
term_index,
|
||||
doc_id,
|
||||
token.position as u32,
|
||||
&term_buffer,
|
||||
heap,
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
|
||||
|
||||
@@ -4,7 +4,6 @@ use super::Field;
|
||||
use crate::common;
|
||||
use crate::schema::Facet;
|
||||
use crate::DateTime;
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use std::str;
|
||||
|
||||
/// Size (in bytes) of the buffer of a int field.
|
||||
@@ -19,6 +18,10 @@ where
|
||||
B: AsRef<[u8]>;
|
||||
|
||||
impl Term {
|
||||
pub(crate) fn new() -> Term {
|
||||
Term(Vec::with_capacity(100))
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a i64-value
|
||||
///
|
||||
/// Assuming the term has a field id of 1, and a i64 value of 3234,
|
||||
@@ -93,6 +96,12 @@ impl Term {
|
||||
term
|
||||
}
|
||||
|
||||
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
|
||||
let mut term = Term::for_field(field);
|
||||
term.set_bytes(bytes);
|
||||
term
|
||||
}
|
||||
|
||||
/// Creates a new Term for a given field.
|
||||
pub(crate) fn for_field(field: Field) -> Term {
|
||||
let mut term = Term(Vec::with_capacity(100));
|
||||
@@ -100,12 +109,10 @@ impl Term {
|
||||
term
|
||||
}
|
||||
|
||||
/// Returns the field.
|
||||
pub fn set_field(&mut self, field: Field) {
|
||||
if self.0.len() < 4 {
|
||||
self.0.resize(4, 0u8);
|
||||
}
|
||||
BigEndian::write_u32(&mut self.0[0..4], field.field_id());
|
||||
pub(crate) fn set_field(&mut self, field: Field) {
|
||||
self.0.clear();
|
||||
self.0
|
||||
.extend_from_slice(&field.field_id().to_be_bytes()[..]);
|
||||
}
|
||||
|
||||
/// Sets a u64 value in the term.
|
||||
@@ -116,7 +123,7 @@ impl Term {
|
||||
/// the natural order of the values.
|
||||
pub fn set_u64(&mut self, val: u64) {
|
||||
self.0.resize(INT_TERM_LEN, 0u8);
|
||||
BigEndian::write_u64(&mut self.0[4..], val);
|
||||
self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref());
|
||||
}
|
||||
|
||||
/// Sets a `i64` value in the term.
|
||||
@@ -134,12 +141,6 @@ impl Term {
|
||||
self.0.extend(bytes);
|
||||
}
|
||||
|
||||
pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
|
||||
let mut term = Term::for_field(field);
|
||||
term.set_bytes(bytes);
|
||||
term
|
||||
}
|
||||
|
||||
/// Set the texts only, keeping the field untouched.
|
||||
pub fn set_text(&mut self, text: &str) {
|
||||
self.set_bytes(text.as_bytes());
|
||||
@@ -157,7 +158,9 @@ where
|
||||
|
||||
/// Returns the field.
|
||||
pub fn field(&self) -> Field {
|
||||
Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
|
||||
let mut field_id_bytes = [0u8; 4];
|
||||
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
|
||||
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
|
||||
}
|
||||
|
||||
/// Returns the `u64` value stored in a term.
|
||||
@@ -166,7 +169,9 @@ where
|
||||
/// ... or returns an invalid value
|
||||
/// if the term is not a `u64` field.
|
||||
pub fn get_u64(&self) -> u64 {
|
||||
BigEndian::read_u64(&self.0.as_ref()[4..])
|
||||
let mut field_id_bytes = [0u8; 8];
|
||||
field_id_bytes.copy_from_slice(self.value_bytes());
|
||||
u64::from_be_bytes(field_id_bytes)
|
||||
}
|
||||
|
||||
/// Returns the `i64` value stored in a term.
|
||||
@@ -175,7 +180,7 @@ where
|
||||
/// ... or returns an invalid value
|
||||
/// if the term is not a `i64` field.
|
||||
pub fn get_i64(&self) -> i64 {
|
||||
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
|
||||
common::u64_to_i64(self.get_u64())
|
||||
}
|
||||
|
||||
/// Returns the `f64` value stored in a term.
|
||||
@@ -184,7 +189,7 @@ where
|
||||
/// ... or returns an invalid value
|
||||
/// if the term is not a `f64` field.
|
||||
pub fn get_f64(&self) -> f64 {
|
||||
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
|
||||
common::u64_to_f64(self.get_u64())
|
||||
}
|
||||
|
||||
/// Returns the text associated with the term.
|
||||
|
||||
Reference in New Issue
Block a user