Compare commits


2 Commits

Author         SHA1         Message                               Date
Pascal Seitz   dbd3aed24a   improve log levels                    2022-09-29 13:25:14 +08:00
Pascal Seitz   7707b8a6e1   add debug_time for ff serialization   2022-09-24 19:48:07 +08:00
5 changed files with 62 additions and 9 deletions

View File

@@ -17,7 +17,7 @@ rand = {version="0.8.3", optional= true}
 fastdivide = "0.4"
 log = "0.4"
 itertools = { version = "0.10.3" }
-measure_time = { version="0.8.2", optional=true}
+measure_time = { version="0.8.2" }

 [dev-dependencies]
 more-asserts = "0.3.0"
@@ -25,7 +25,7 @@ proptest = "1.0.0"
 rand = "0.8.3"

 [features]
-bin = ["prettytable-rs", "rand", "measure_time"]
+bin = ["prettytable-rs", "rand"]
 default = ["bin"]
 unstable = []
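This Cargo.toml change promotes measure_time from an optional dependency (previously pulled in only through the "bin" feature) to a regular one, since the library code in the files below now uses its timing macros directly. For readers unfamiliar with the crate, here is a minimal sketch of how those macros behave; the function and scope names are made up for illustration. Each *_time! macro starts a timer when invoked and, when the enclosing scope ends, logs the elapsed time through the log crate at the matching level:

    use measure_time::{debug_time, trace_time};

    fn serialize_all_fields() {
        // Emits one DEBUG-level line with the elapsed time when
        // serialize_all_fields returns.
        debug_time!("serialize-all-fields");
        for field_id in 0..3 {
            // The loop body is its own scope, so this emits one
            // TRACE-level line per iteration; format arguments
            // work like in println!.
            trace_time!("serialize field, field_id {:?}", field_id);
            // ... per-field work ...
        }
    }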

View File

@@ -23,7 +23,8 @@ use std::sync::Arc;
 use common::{BinarySerializable, VInt};
 use fastdivide::DividerU64;
-use log::warn;
+use log::{trace, warn};
+use measure_time::trace_time;
 use ownedbytes::OwnedBytes;

 use crate::bitpacked::BitpackedCodec;
@@ -183,6 +184,7 @@ fn detect_codec(
 ) -> Option<FastFieldCodecType> {
     let mut estimations = Vec::new();
     for &codec in codecs {
+        trace_time!("estimate time for codec: {:?}", codec);
         let estimation_opt = match codec {
             FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&column),
             FastFieldCodecType::Linear => LinearCodec::estimate(&column),
@@ -202,6 +204,7 @@ fn detect_codec(
     // codecs
     estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
     estimations.sort_by(|(score_left, _), (score_right, _)| score_left.total_cmp(score_right));
+    trace!("Chosen Codec {:?}", estimations.first()?.1);
     Some(estimations.first()?.1)
 }
@@ -210,6 +213,12 @@ fn serialize_given_codec(
     codec_type: FastFieldCodecType,
     output: &mut impl io::Write,
 ) -> io::Result<()> {
+    trace_time!(
+        "Serialize time for codec: {:?}, num_vals {}",
+        codec_type,
+        column.num_vals()
+    );
     match codec_type {
         FastFieldCodecType::Bitpacked => {
             BitpackedCodec::serialize(&column, output)?;
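The trace_time! and trace! calls added above only produce output if a log backend is installed and the trace level is enabled for these modules. A small sketch of one way to do that from a test or binary; env_logger is an assumption here, not something this diff depends on, and any log-compatible backend works:

    fn main() {
        // Equivalent to running the process with RUST_LOG=trace.
        env_logger::Builder::new()
            .filter_level(log::LevelFilter::Trace)
            .init();
        // ... indexing/serialization now logs per-codec estimate
        // timings and the "Chosen Codec ..." line from detect_codec.
    }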

View File

@@ -3,6 +3,7 @@ use std::sync::Mutex;
 use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn};
 use fnv::FnvHashMap;
+use measure_time::{debug_time, trace_time};

 use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType};
 use crate::indexer::doc_id_mapping::DocIdMapping;
@@ -146,6 +147,13 @@ impl MultiValuedFastFieldWriter {
         {
             self.doc_index.push(self.vals.len() as u64);
             let col = VecColumn::from(&self.doc_index[..]);
+            trace_time!(
+                "segment-serialize-multi-fast-field-idx, num_vals {}, field_id {:?}",
+                col.num_vals(),
+                self.field()
+            );
             if let Some(doc_id_map) = doc_id_map {
                 let multi_value_start_index = MultivalueStartIndex::new(&col, doc_id_map);
                 serializer.create_auto_detect_u64_fast_field_with_idx(
@@ -158,6 +166,12 @@ impl MultiValuedFastFieldWriter {
             }
         }
         {
+            trace_time!(
+                "segment-serialize-multi-fast-field-values, num_vals {}, field_id {:?}",
+                self.vals.len(),
+                self.field()
+            );
             // Writing the values themselves.
             // TODO FIXME: Use less memory.
             let mut values: Vec<u64> = Vec::new();

View File

@@ -4,6 +4,7 @@ use std::io;
 use common;
 use fastfield_codecs::{Column, MonotonicallyMappableToU64};
 use fnv::FnvHashMap;
+use measure_time::{debug_time, trace_time};
 use tantivy_bitpacker::BlockedBitpacker;

 use super::multivalued::MultiValuedFastFieldWriter;
@@ -215,6 +216,7 @@ impl FastFieldsWriter {
         mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
+        debug_time!("segment-serialize-all-fast-fields",);
         for field_writer in self.term_id_writers {
             let field = field_writer.field();
             field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
@@ -367,6 +369,11 @@ impl IntFastFieldWriter {
             num_vals: self.val_count as u64,
         };
+        trace_time!(
+            "segment-serialize-single-value-field, field_id {:?}",
+            self.field()
+        );
         serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
         Ok(())

View File

@@ -4,7 +4,7 @@ use std::sync::Arc;
 use fastfield_codecs::VecColumn;
 use itertools::Itertools;
-use measure_time::debug_time;
+use measure_time::{debug_time, trace_time};

 use crate::core::{Segment, SegmentReader};
 use crate::docset::{DocSet, TERMINATED};
@@ -250,7 +250,11 @@ impl IndexMerger {
         mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<()> {
-        debug_time!("write-fast-fields");
+        debug_time!(
+            "merge-all-fast-fields, num_segments {}, num docs new segment {}",
+            self.readers.len(),
+            doc_id_mapping.len()
+        );
         for (field, field_entry) in self.schema.fields() {
             let field_type = field_entry.field_type();
@@ -311,6 +315,12 @@ impl IndexMerger {
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<()> {
         let fast_field_accessor = SortedDocIdColumn::new(&self.readers, doc_id_mapping, field);
+        trace_time!(
+            "merge-single-fast-field, num_vals {}, num_segments {}, field_id {:?}",
+            fast_field_accessor.num_vals(),
+            self.readers.len(),
+            field
+        );
         fast_field_serializer.create_auto_detect_u64_fast_field(field, fast_field_accessor)?;
         Ok(())
@@ -458,6 +468,12 @@ impl IndexMerger {
         fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<Vec<u64>> {
+        trace_time!(
+            "merge-multi-fast-field-idx, num_segments {}, field_id {:?}",
+            self.readers.len(),
+            field
+        );
         let reader_ordinal_and_field_accessors = self
             .readers
             .iter()
@@ -488,7 +504,7 @@ impl IndexMerger {
         fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<()> {
-        debug_time!("write-term-id-fast-field");
+        trace_time!("write-term-id-fast-field");

         // Multifastfield consists of 2 fastfields.
         // The first serves as an index into the second one and is strictly increasing.
@@ -571,6 +587,13 @@ impl IndexMerger {
         let fastfield_accessor =
             SortedDocIdMultiValueColumn::new(&self.readers, doc_id_mapping, &offsets, field);
+        trace_time!(
+            "merge-multi-fast-field-values, num_vals {}, num_segments {}, field_id {:?}",
+            fastfield_accessor.num_vals(),
+            self.readers.len(),
+            field
+        );
         fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
             field,
             fastfield_accessor,
@@ -624,7 +647,7 @@ impl IndexMerger {
         fieldnorm_reader: Option<FieldNormReader>,
         doc_id_mapping: &SegmentDocIdMapping,
     ) -> crate::Result<Option<TermOrdinalMapping>> {
-        debug_time!("write-postings-for-field");
+        debug_time!("write-postings-for-field {:?}", indexed_field);

         let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
         let mut delta_computer = DeltaComputer::new();
@@ -827,7 +850,7 @@ impl IndexMerger {
         debug!("write-storable-field");
         if !doc_id_mapping.is_trivial() {
-            debug!("non-trivial-doc-id-mapping");
+            debug!("non-trivial-doc-id-mapping (index is sorted)");
             let store_readers: Vec<_> = self
                 .readers
@@ -855,7 +878,7 @@ impl IndexMerger {
                 }
             }
         } else {
-            debug!("trivial-doc-id-mapping");
+            debug!("trivial-doc-id-mapping (index is not sorted)");
             for reader in &self.readers {
                 let store_reader = reader.get_store_reader(1)?;
                 if reader.has_deletes()
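For context on the multi-value hunks above: as the comment in this file notes, a multi-value fast field is stored as two fast fields, an index column and a flat values column, and the new timers are attached to the serialization of each. A purely illustrative sketch of that layout, with made-up numbers:

    fn main() {
        // Values of all documents, concatenated.
        let values: Vec<u64> = vec![10, 11, 20, 30, 31, 32];
        // Per-document start offsets into `values` (with one closing
        // entry at the end); doc i owns values[idx[i]..idx[i + 1]].
        let idx: Vec<u64> = vec![0, 2, 3, 6];

        let doc = 2;
        let (start, end) = (idx[doc] as usize, idx[doc + 1] as usize);
        assert_eq!(&values[start..end], &[30, 31, 32]);
    }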