Disconnected facet / fast field merges / examples

This commit is contained in:
Paul Masurel
2023-01-17 23:55:57 +09:00
parent 007168ff4c
commit e435b6fdd1
45 changed files with 1715 additions and 2124 deletions

View File

@@ -15,7 +15,7 @@ use super::metric::{
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
use crate::schema::{Cardinality, Type};
use crate::schema::Type;
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
#[derive(Clone, Default)]

View File

@@ -105,8 +105,8 @@ pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
mod tweak_score_top_collector;
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::{FacetCollector, FacetCounts};
// mod facet_collector;
// pub use self::facet_collector::{FacetCollector, FacetCounts};
use crate::query::Weight;
mod docset_collector;

View File

@@ -5,7 +5,6 @@ use fastfield_codecs::Column;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -164,8 +163,8 @@ pub struct FastFieldSegmentCollector {
}
impl FastFieldTestCollector {
pub fn for_field(field: String) -> FastFieldTestCollector {
FastFieldTestCollector { field }
pub fn for_field(field: impl ToString) -> FastFieldTestCollector {
FastFieldTestCollector { field: field.to_string() }
}
}
@@ -210,64 +209,62 @@ impl SegmentCollector for FastFieldSegmentCollector {
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
field: Field,
}
// /// Collects in order all of the fast field bytes for all of the
// /// docs in the `DocSet`
// ///
// /// This collector is mainly useful for tests.
// pub struct BytesFastFieldTestCollector {
// field: Field,
// }
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
}
// pub struct BytesFastFieldSegmentCollector {
// vals: Vec<u8>,
// reader: BytesFastFieldReader,
// }
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector { field }
}
}
// impl BytesFastFieldTestCollector {
// pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
// BytesFastFieldTestCollector { field }
// }
// }
impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>;
type Child = BytesFastFieldSegmentCollector;
// impl Collector for BytesFastFieldTestCollector {
// type Fruit = Vec<u8>;
// type Child = BytesFastFieldSegmentCollector;
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<BytesFastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
.bytes(segment_reader.schema().get_field_name(self.field))?;
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader,
})
}
// fn for_segment(
// &self,
// _segment_local_id: u32,
// segment_reader: &SegmentReader,
// ) -> crate::Result<BytesFastFieldSegmentCollector> {
// let reader = segment_reader.fast_fields().bytes(self.field)?;
// Ok(BytesFastFieldSegmentCollector {
// vals: Vec::new(),
// reader,
// })
// }
fn requires_scoring(&self) -> bool {
false
}
// fn requires_scoring(&self) -> bool {
// false
// }
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
}
}
// fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
// Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
// }
// }
impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>;
// impl SegmentCollector for BytesFastFieldSegmentCollector {
// type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: Score) {
let data = self.reader.get_bytes(doc);
self.vals.extend(data);
}
// fn collect(&mut self, doc: u32, _score: Score) {
// let data = self.reader.get_bytes(doc);
// self.vals.extend(data);
// }
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.vals
}
}
// fn harvest(self) -> <Self as SegmentCollector>::Fruit {
// self.vals
// }
// }
fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();

View File

@@ -19,7 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
@@ -245,12 +245,6 @@ impl IndexBuilder {
sort_by_field.field
)));
}
if entry.field_type().fastfield_cardinality() != Some(Cardinality::SingleValue) {
return Err(TantivyError::InvalidArgument(format!(
"Only single value fast field Cardinality supported for sorting index {}",
sort_by_field.field
)));
}
}
Ok(())
} else {

View File

@@ -7,7 +7,7 @@ use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
use crate::space_usage::SegmentSpaceUsage;
@@ -90,25 +90,8 @@ impl SegmentReader {
}
/// Accessor to the `FacetReader` associated with a given `Field`.
pub fn facet_reader(&self, field: Field) -> crate::Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
FieldType::Facet(_) => {
let term_ords_reader =
self.fast_fields().u64s(self.schema.get_field_name(field))?;
let termdict = self
.termdict_composite
.open_read(field)
.map(TermDictionary::open)
.unwrap_or_else(|| Ok(TermDictionary::empty()))?;
Ok(FacetReader::new(term_ords_reader, termdict))
}
_ => Err(crate::TantivyError::InvalidArgument(format!(
"Field {:?} is not a facet field.",
field_entry.name()
))),
}
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
todo!();
}
/// Accessor to the segment's `Field norms`'s reader.

View File

@@ -24,15 +24,10 @@ use std::net::Ipv6Addr;
use fastfield_codecs::MonotonicallyMappableToU64;
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
// pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex};
pub use self::multivalued::{
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader,
MultiValuedFastFieldWriter,
};
pub(crate) use self::readers::type_and_cardinality;
// pub use self::facet_reader::FacetReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::{Column, CompositeFastFieldSerializer};
use self::writer::unexpected_value;
@@ -41,10 +36,10 @@ use crate::schema::{Type, Value};
use crate::DateTime;
mod alive_bitset;
mod bytes;
// mod bytes;
mod error;
mod facet_reader;
mod multivalued;
// mod facet_reader;
// mod multivalued;
mod readers;
mod serializer;
mod writer;
@@ -166,7 +161,7 @@ mod tests {
use std::path::Path;
use std::sync::Arc;
use common::HasLen;
use common::{HasLen, TerminatingWrite};
use fastfield_codecs::{open, FastFieldCodecType};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
@@ -189,16 +184,9 @@ mod tests {
});
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
#[test]
pub fn test_fastfield() {
let test_fastfield = fastfield_codecs::serialize_and_load(&[100u64, 200u64, 300u64][..]);
assert_eq!(test_fastfield.get_val(0), 100);
assert_eq!(test_fastfield.get_val(1), 200);
assert_eq!(test_fastfield.get_val(2), 300);
}
#[test]
pub fn test_fastfield_i64_u64() {
pub fn test_convert_i64_u64() {
let datetime = DateTime::from_utc(OffsetDateTime::UNIX_EPOCH);
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
}
@@ -208,22 +196,21 @@ mod tests {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers
.add_document(0, &doc!(*FIELD=>13u64))
.add_document(&doc!(*FIELD=>13u64))
.unwrap();
fast_field_writers
.add_document(1,&doc!(*FIELD=>14u64))
.add_document(&doc!(*FIELD=>14u64))
.unwrap();
fast_field_writers
.add_document(2,&doc!(*FIELD=>2u64))
.add_document(&doc!(*FIELD=>2u64))
.unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 34);
@@ -241,38 +228,37 @@ mod tests {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut write: WritePtr = directory.open_write(Path::new("test"))?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers
.add_document(0, &doc!(*FIELD=>4u64))
.add_document(&doc!(*FIELD=>4u64))
.unwrap();
fast_field_writers
.add_document(1, &doc!(*FIELD=>14_082_001u64))
.add_document(&doc!(*FIELD=>14_082_001u64))
.unwrap();
fast_field_writers
.add_document(2, &doc!(*FIELD=>3_052u64))
.add_document(&doc!(*FIELD=>3_052u64))
.unwrap();
fast_field_writers
.add_document(3, &doc!(*FIELD=>9_002u64))
.add_document(&doc!(*FIELD=>9_002u64))
.unwrap();
fast_field_writers
.add_document(4, &doc!(*FIELD=>15_001u64))
.add_document(&doc!(*FIELD=>15_001u64))
.unwrap();
fast_field_writers
.add_document(5, &doc!(*FIELD=>777u64))
.add_document(&doc!(*FIELD=>777u64))
.unwrap();
fast_field_writers
.add_document(6, &doc!(*FIELD=>1_002u64))
.add_document(&doc!(*FIELD=>1_002u64))
.unwrap();
fast_field_writers
.add_document(7, &doc!(*FIELD=>1_501u64))
.add_document(&doc!(*FIELD=>1_501u64))
.unwrap();
fast_field_writers
.add_document(8, &doc!(*FIELD=>215u64))
.add_document(&doc!(*FIELD=>215u64))
.unwrap();
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
fast_field_writers.serialize(&mut write, None)?;
write.terminate()?;
}
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
@@ -302,18 +288,17 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for doc_id in 0..10_000 {
for _ in 0..10_000 {
fast_field_writers
.add_document(doc_id, &doc!(*FIELD=>100_000u64))
.add_document(&doc!(*FIELD=>100_000u64))
.unwrap();
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
@@ -337,22 +322,21 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
fast_field_writers
.add_document(0, &doc!(*FIELD=>0u64))
.add_document(&doc!(*FIELD=>0u64))
.unwrap();
for doc_id in 1u64..10_001u64 {
fast_field_writers
.add_document(doc_id as u32, &doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id as u64))
.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id as u64))
.unwrap();
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80049);
@@ -383,20 +367,17 @@ mod tests {
let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let mut doc_id = 0;
for i in -100i64..10_000i64 {
let mut doc = Document::default();
doc.add_i64(i64_field, i);
fast_field_writers.add_document(doc_id, &doc).unwrap();
doc_id += 1;
fast_field_writers.add_document(&doc).unwrap();
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 49_usize);
@@ -477,14 +458,13 @@ mod tests {
let n = permutation.len();
let directory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut write: WritePtr = directory.open_write(Path::new("test"))?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for (doc_id, &x) in permutation.iter().enumerate() {
fast_field_writers.add_document(doc_id as u32, &doc!(*FIELD=>x)).unwrap();
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
}
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
fast_field_writers.serialize(&mut write, None)?;
write.terminate()?;
}
let file = directory.open_read(path)?;
{
@@ -543,17 +523,18 @@ mod tests {
Ok(())
}
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
let mut all = vec![];
// fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
// let mut all = vec![];
for doc in docs {
let mut out: Vec<u64> = vec![];
ff.get_vals(doc, &mut out);
all.extend(out);
}
all
}
// for doc in docs {
// let mut out: Vec<u64> = vec![];
// ff.get_vals(doc, &mut out);
// all.extend(out);
// }
// all
// }
/*
#[test]
fn test_text_fastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -651,156 +632,159 @@ mod tests {
Ok(())
}
*/
#[test]
fn test_string_fastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// #[test]
// fn test_string_fastfield() -> crate::Result<()> {
// let mut schema_builder = Schema::builder();
// let text_field = schema_builder.add_text_field("text", STRING | FAST);
// let schema = schema_builder.build();
// let index = Index::create_in_ram(schema);
{
// first segment
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "BBBBB", // term_ord 1
))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
))?;
// {
// // first segment
// let mut index_writer = index.writer_for_tests()?;
// index_writer.set_merge_policy(Box::new(NoMergePolicy));
// index_writer.add_document(doc!(
// text_field => "BBBBB", // term_ord 1
// ))?;
// index_writer.add_document(doc!())?;
// index_writer.add_document(doc!(
// text_field => "AAAAA", // term_ord 0
// ))?;
// index_writer.add_document(doc!(
// text_field => "AAAAA", // term_ord 0
// ))?;
// index_writer.add_document(doc!(
// text_field => "zumberthree", // term_ord 2, after merge term_ord 3
// ))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
// index_writer.add_document(doc!())?;
// index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s("text").unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
// let reader = index.reader()?;
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let text_fast_field = fast_fields.u64s(text_field).unwrap();
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 3);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(bytes, "AAAAA".as_bytes());
}
// assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
{
// second segment
let mut index_writer = index.writer_for_tests()?;
// let inverted_index = segment_reader.inverted_index(text_field)?;
// assert_eq!(inverted_index.terms().num_terms(), 3);
// let mut bytes = vec![];
// assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
// assert_eq!(bytes, "AAAAA".as_bytes());
// }
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
// {
// // second segment
// let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
text_field => "CCCCC", // term_ord 1, after merge 2
))?;
// index_writer.add_document(doc!(
// text_field => "AAAAA", // term_ord 0
// ))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
// index_writer.add_document(doc!(
// text_field => "CCCCC", // term_ord 1, after merge 2
// ))?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let segment_reader = searcher.segment_reader(1);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s("text").unwrap();
// index_writer.add_document(doc!())?;
// index_writer.commit()?;
assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s("text").unwrap();
// let reader = index.reader()?;
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 2);
// let segment_reader = searcher.segment_reader(1);
// let fast_fields = segment_reader.fast_fields();
// let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..9),
vec![1, 0, 0, 3 /* next segment */, 0, 2]
);
// assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
// }
// // Merging the segments
// {
// let segment_ids = index.searchable_segment_ids()?;
// let mut index_writer = index.writer_for_tests()?;
// index_writer.merge(&segment_ids).wait()?;
// index_writer.wait_merging_threads()?;
// }
Ok(())
}
// let reader = index.reader()?;
// let searcher = reader.searcher();
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let text_fast_field = fast_fields.u64s(text_field).unwrap();
#[test]
fn test_datefastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"date",
DateOptions::from(FAST).set_precision(DatePrecision::Microseconds),
);
let multi_date_field = schema_builder.add_date_field(
"multi_date",
DateOptions::default()
.set_precision(DatePrecision::Microseconds)
.set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => DateTime::from_u64(1i64.to_u64()),
multi_date_field => DateTime::from_u64(2i64.to_u64()),
multi_date_field => DateTime::from_u64(3i64.to_u64())
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_u64(4i64.to_u64())
))?;
index_writer.add_document(doc!(
multi_date_field => DateTime::from_u64(5i64.to_u64()),
multi_date_field => DateTime::from_u64(6i64.to_u64())
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let date_fast_field = fast_fields.date("date").unwrap();
let dates_fast_field = fast_fields.dates("multi_date").unwrap();
let mut dates = vec![];
{
assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
}
{
assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
assert_eq!(dates[1].into_timestamp_micros(), 6i64);
}
Ok(())
}
// assert_eq!(
// get_vals_for_docs(&text_fast_field, 0..9),
// vec![1, 0, 0, 3 /* next segment */, 0, 2]
// );
// Ok(())
// }
// #[test]
// fn test_datefastfield() -> crate::Result<()> {
// let mut schema_builder = Schema::builder();
// let date_field = schema_builder.add_date_field(
// "date",
// DateOptions::from(FAST).set_precision(DatePrecision::Microseconds),
// );
// let multi_date_field = schema_builder.add_date_field(
// "multi_date",
// DateOptions::default()
// .set_precision(DatePrecision::Microseconds)
// .set_fast(),
// );
// let schema = schema_builder.build();
// let index = Index::create_in_ram(schema);
// let mut index_writer = index.writer_for_tests()?;
// index_writer.set_merge_policy(Box::new(NoMergePolicy));
// index_writer.add_document(doc!(
// date_field => DateTime::from_u64(1i64.to_u64()),
// multi_date_field => DateTime::from_u64(2i64.to_u64()),
// multi_date_field => DateTime::from_u64(3i64.to_u64())
// ))?;
// index_writer.add_document(doc!(
// date_field => DateTime::from_u64(4i64.to_u64())
// ))?;
// index_writer.add_document(doc!(
// multi_date_field => DateTime::from_u64(5i64.to_u64()),
// multi_date_field => DateTime::from_u64(6i64.to_u64())
// ))?;
// index_writer.commit()?;
// let reader = index.reader()?;
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let date_fast_field = fast_fields.date(date_field).unwrap();
// let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
// let mut dates = vec![];
// {
// assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
// dates_fast_field.get_vals(0u32, &mut dates);
// assert_eq!(dates.len(), 2);
// assert_eq!(dates[0].into_timestamp_micros(), 2i64);
// assert_eq!(dates[1].into_timestamp_micros(), 3i64);
// }
// {
// assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
// dates_fast_field.get_vals(1u32, &mut dates);
// assert!(dates.is_empty());
// }
// {
// assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
// dates_fast_field.get_vals(2u32, &mut dates);
// assert_eq!(dates.len(), 2);
// assert_eq!(dates[0].into_timestamp_micros(), 5i64);
// assert_eq!(dates[1].into_timestamp_micros(), 6i64);
// }
// Ok(())
// }
#[test]
pub fn test_fastfield_bool() {
@@ -823,21 +807,20 @@ mod tests {
let field = schema.get_field("field_bool").unwrap();
{
let write: WritePtr = directory.open_write(path).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
fast_field_writers.add_document(0u32, &doc!(field=>true)).unwrap();
fast_field_writers.add_document(&doc!(field=>true)).unwrap();
fast_field_writers
.add_document(1u32, &doc!(field=>false))
.add_document(&doc!(field=>false))
.unwrap();
fast_field_writers.add_document(2u32, &doc!(field=>true)).unwrap();
fast_field_writers.add_document(&doc!(field=>true)).unwrap();
fast_field_writers
.add_document(3u32, &doc!(field=>false))
.add_document(&doc!(field=>false))
.unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 33);
@@ -863,19 +846,18 @@ mod tests {
let field = schema.get_field("field_bool").unwrap();
{
let write: WritePtr = directory.open_write(path).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for doc_id in 0..50 {
fast_field_writers.add_document(doc_id * 2, &doc!(field=>true)).unwrap();
fast_field_writers.add_document(&doc!(field=>true)).unwrap();
fast_field_writers
.add_document(doc_id * 2 + 1, &doc!(field=>false))
.add_document(&doc!(field=>false))
.unwrap();
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 45);
@@ -900,13 +882,12 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(path).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut write: WritePtr = directory.open_write(path).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(0, &doc).unwrap();
fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
serializer.close()?;
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None)?;
write.terminate()?;
}
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;
@@ -921,21 +902,18 @@ mod tests {
fn get_index(
docs: &[crate::Document],
schema: &Schema,
codec_types: &[FastFieldCodecType],
) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer =
CompositeFastFieldSerializer::from_write_with_codec(write, codec_types).unwrap();
let mut write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
for (doc_id, doc) in docs.into_iter().enumerate() {
fast_field_writers.add_document(doc_id as u32, doc).unwrap();
fast_field_writers.add_document(doc).unwrap();
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.serialize(&mut write, None)
.unwrap();
serializer.close().unwrap();
write.terminate().unwrap();
}
Ok(directory)
}
@@ -974,7 +952,7 @@ mod tests {
let docs: Vec<Document> = times.iter().map(|time| doc!(field=>*time)).collect();
let directory = get_index(&docs[..], &schema, &[codec_type])?;
let directory = get_index(&docs[..], &schema)?;
let path = Path::new("test");
let file = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&file)?;

View File

@@ -3,10 +3,9 @@ use std::sync::Arc;
use fastfield_codecs::{open, open_u128, Column};
use super::multivalued::MultiValuedFastFieldReader;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{BytesFastFieldReader, FastFieldNotAvailableError, FastValue};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::{Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError};
@@ -29,7 +28,7 @@ pub(crate) enum FastType {
Date,
}
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<FastType> {
todo!();
// match field_type {
// FieldType::U64(options) => options
@@ -80,39 +79,6 @@ impl FastFieldReaders {
})
}
fn check_type(
&self,
field: Field,
expected_fast_type: FastType,
expected_cardinality: Cardinality,
) -> crate::Result<()> {
let field_entry = self.schema.get_field_entry(field);
let (fast_type, cardinality) =
type_and_cardinality(field_entry.field_type()).ok_or_else(|| {
crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
))
})?;
if fast_type != expected_fast_type {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}, expected {:?}.",
field_entry.name(),
fast_type,
expected_fast_type
)));
}
if cardinality != expected_cardinality {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of cardinality {:?}, expected {:?}.",
field_entry.name(),
cardinality,
expected_cardinality
)));
}
Ok(())
}
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field_name: &str,
@@ -130,80 +96,31 @@ impl FastFieldReaders {
&self,
field_name: &str,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
self.typed_fast_field_reader_with_idx(field_name, 0)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field_name)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field_name, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
todo!();
}
/// Returns the `u64` fast field reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.check_type(
self.schema.get_field(field_name)?,
FastType::U64,
Cardinality::SingleValue,
)?;
self.typed_fast_field_reader(field_name)
pub fn u64(&self, field: &str) -> crate::Result<Arc<dyn Column<u64>>> {
todo!();
}
/// Returns the `ip` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addr(&self, field_name: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<Ipv6Addr>(bytes)?)
}
/// Returns the multi-valued `ip` fast field reader associated to `field`.
///
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
pub fn ip_addrs(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<Ipv6Addr>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field_name)?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
pub fn ip_addr(&self, field: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
todo!();
// self.check_type(field, FastType::U128)?;
// let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
// Ok(open_u128::<Ipv6Addr>(bytes)?)
}
/// Returns the `u128` fast field reader associated to `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub(crate) fn u128(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u128>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<u128>(bytes)?)
}
/// Returns the `u128` multi-valued fast field reader associated to `field`.
///
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
pub fn u128s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u128>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> =
self.typed_fast_field_reader(self.schema.get_field_name(field))?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<u128>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
pub(crate) fn u128(&self, field: &str) -> crate::Result<Arc<dyn Column<u128>>> {
todo!();
}
/// Returns the `u64` fast field reader associated with `field`, regardless of whether
@@ -219,113 +136,49 @@ impl FastFieldReaders {
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn i64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<i64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(self.schema.get_field_name(field))
todo!()
}
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field_name: &str) -> crate::Result<Arc<dyn Column<DateTime>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
todo!()
}
/// Returns the `f64` fast field reader associated with `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<f64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
todo!();
}
/// Returns the `bool` fast field reader associated with `field`.
///
/// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field_name: &str) -> crate::Result<Arc<dyn Column<bool>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
todo!()
}
/// Returns the multi-valued `u64` fast field reader for the field named `field_name`.
///
/// # Errors
///
/// Returns an error if the schema lookup for `field_name` fails, or if `check_type`
/// rejects the field (it is called with `FastType::U64` and `Cardinality::MultiValues`).
pub fn u64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field_name)
}
/// Returns a multi-valued `u64` fast field reader for the field named `field_name`,
/// regardless of whether the given field is effectively of type `u64` or not.
///
/// Unlike `u64s`, this method does not call `check_type`: the field's declared type and
/// cardinality are not validated here.
///
/// # Errors
///
/// Any error comes from `typed_fast_field_multi_reader`.
pub fn u64s_lenient(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
self.typed_fast_field_multi_reader(field_name)
}
/// Returns the multi-valued `i64` fast field reader for the field named `field_name`.
///
/// # Errors
///
/// Returns an error if the schema lookup for `field_name` fails, or if `check_type`
/// rejects the field (it is called with `FastType::I64` and `Cardinality::MultiValues`).
pub fn i64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<i64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
// NOTE(review): the name is re-derived from `field` here, whereas `u64s` passes
// `field_name` straight through; presumably both yield the same string — confirm.
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns the multi-valued `f64` fast field reader for the field named `field_name`.
///
/// # Errors
///
/// Returns an error if the schema lookup for `field_name` fails, or if `check_type`
/// rejects the field (it is called with `FastType::F64` and `Cardinality::MultiValues`).
pub fn f64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<f64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
// NOTE(review): the name is re-derived from `field` here, whereas `u64s` passes
// `field_name` straight through; presumably both yield the same string — confirm.
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns the multi-valued `bool` fast field reader for the field named `field_name`.
///
/// # Errors
///
/// Returns an error if the schema lookup for `field_name` fails, or if `check_type`
/// rejects the field (it is called with `FastType::Bool` and `Cardinality::MultiValues`).
pub fn bools(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<bool>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::MultiValues)?;
// NOTE(review): the name is re-derived from `field` here, whereas `u64s` passes
// `field_name` straight through; presumably both yield the same string — confirm.
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns the multi-valued date fast field reader for the field named `field_name`.
/// Values are exposed as `DateTime`.
///
/// # Errors
///
/// Returns an error if the schema lookup for `field_name` fails, or if `check_type`
/// rejects the field (it is called with `FastType::Date` and `Cardinality::MultiValues`).
pub fn dates(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
// NOTE(review): the name is re-derived from `field` here, whereas `u64s` passes
// `field_name` straight through; presumably both yield the same string — confirm.
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns the `bytes` fast field reader for the field named `field_name`.
///
/// The reader is assembled from two pieces of fast field data for the same field:
/// index `0` is opened as a column and handed over as the index reader, index `1` is
/// passed on as the raw data.
///
/// # Errors
///
/// - The schema lookup for `field_name` fails.
/// - The field is a bytes field but is not flagged as fast (`SchemaError`).
/// - The field is not a bytes field at all (`FastFieldNotAvailableError`).
/// - Reading or opening the underlying fast field data fails.
pub fn bytes(&self, field_name: &str) -> crate::Result<BytesFastFieldReader> {
let field = self.schema.get_field(field_name)?;
let field_entry = self.schema.get_field_entry(field);
if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
if !bytes_option.is_fast() {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
// Index 0: the column used as the index reader.
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
let idx_reader = open(fast_field_idx_bytes)?;
// Index 1: the data handed to the bytes reader as-is.
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {
Err(FastFieldNotAvailableError::new(field_entry).into())
}
}
// Returns the `bytes` fast field reader associated with `field`.
//
// If `field` is not a bytes fast field, returns an Error.
// pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
// let field_entry = self.schema.get_field_entry(field);
// if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
// if !bytes_option.is_fast() {
// return Err(crate::TantivyError::SchemaError(format!(
// "Field {:?} is not a fast field.",
// field_entry.name()
// )));
// }
// let fast_field_idx_file = self.fast_field_data(field, 0)?;
// let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
// let idx_reader = open(fast_field_idx_bytes)?;
// let data = self.fast_field_data(field, 1)?;
// BytesFastFieldReader::open(idx_reader, data)
// } else {
// Err(FastFieldNotAvailableError::new(field_entry).into())
// }
// }
}

View File

@@ -1,18 +1,17 @@
use std::collections::HashMap;
use std::io;
use super::FastFieldType;
use crate::fastfield::{CompositeFastFieldSerializer};
use columnar::{ColumnarWriter, NumericalType, NumericalValue};
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
use super::FastFieldType;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::termdict::TermOrdinal;
use crate::{DatePrecision, DocId};
@@ -20,6 +19,7 @@ use crate::{DatePrecision, DocId};
pub struct FastFieldsWriter {
columnar_writer: ColumnarWriter,
fast_fields: Vec<Option<String>>, //< TODO see if we can cache the field name hash too.
num_docs: DocId,
// term_id_writers: Vec<MultiValuedFastFieldWriter>,
// single_value_writers: Vec<IntFastFieldWriter>,
// u128_value_writers: Vec<U128FastFieldWriter>,
@@ -122,6 +122,7 @@ impl FastFieldsWriter {
FastFieldsWriter {
columnar_writer,
fast_fields,
num_docs: 0u32,
}
}
@@ -131,7 +132,8 @@ impl FastFieldsWriter {
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc_id: DocId, doc: &Document) -> crate::Result<()> {
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
let doc_id = self.num_docs;
for field_value in doc.field_values() {
if let Some(field_name) = self.fast_fields[field_value.field().field_id() as usize].as_ref() {
match &field_value.value {
@@ -155,40 +157,20 @@ impl FastFieldsWriter {
}
}
}
self.num_docs += 1;
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
self,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
mut self,
wrt: &mut dyn io::Write,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
todo!();
// for field_writer in self.term_id_writers {
// let field = field_writer.field();
// field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
// }
// for field_writer in &self.single_value_writers {
// field_writer.serialize(serializer, doc_id_map)?;
// }
// for field_writer in self.multi_values_writers {
// let field = field_writer.field();
// field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
// }
// for field_writer in self.bytes_value_writers {
// field_writer.serialize(serializer, doc_id_map)?;
// }
// for field_writer in self.u128_value_writers {
// field_writer.serialize(serializer, doc_id_map)?;
// }
// for field_writer in self.u128_multi_value_writers {
// field_writer.serialize(serializer, doc_id_map)?;
// }
assert!(doc_id_map.is_none()); // TODO handle doc id map
let num_docs = self.num_docs;
self.columnar_writer.serialize(num_docs, wrt)?;
Ok(())
}
}

View File

@@ -442,47 +442,49 @@ mod tests_indexsorting {
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
// #[test]
// fn test_sort_index_fast_field() -> crate::Result<()> {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "my_number".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// get_text_options(),
// )?;
// assert_eq!(
// index.settings().sort_by_field.as_ref().unwrap().field,
// "my_number".to_string()
// );
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64("my_number").unwrap();
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
// let searcher = index.reader()?.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let my_number = index.schema().get_field("my_number").unwrap();
let multifield = fast_fields.u64s("multi_numbers").unwrap();
let mut vals = vec![];
multifield.get_vals(0u32, &mut vals);
assert_eq!(vals, &[] as &[u64]);
let mut vals = vec![];
multifield.get_vals(1u32, &mut vals);
assert_eq!(vals, &[5, 6]);
// let fast_field = fast_fields.u64(my_number).unwrap();
// assert_eq!(fast_field.get_val(0), 10u64);
// assert_eq!(fast_field.get_val(1), 20u64);
// assert_eq!(fast_field.get_val(2), 30u64);
let mut vals = vec![];
multifield.get_vals(2u32, &mut vals);
assert_eq!(vals, &[3]);
Ok(())
}
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let multifield = fast_fields.u64s(multi_numbers).unwrap();
// let mut vals = vec![];
// multifield.get_vals(0u32, &mut vals);
// assert_eq!(vals, &[] as &[u64]);
// let mut vals = vec![];
// multifield.get_vals(1u32, &mut vals);
// assert_eq!(vals, &[5, 6]);
// let mut vals = vec![];
// multifield.get_vals(2u32, &mut vals);
// assert_eq!(vals, &[3]);
// Ok(())
// }
#[test]
fn test_doc_mapping() {

File diff suppressed because it is too large Load Diff

View File

@@ -150,7 +150,6 @@ fn index_json_value(
json_term_writer.term_buffer,
ctx,
indexing_position,
None,
);
}
TextOrDateTime::DateTime(dt) => {

View File

@@ -7,21 +7,21 @@ use itertools::Itertools;
use measure_time::debug_time;
use super::flat_map_with_buffer::FlatMapWithBufferIter;
use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn;
// use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn;
use crate::core::{Segment, SegmentReader};
use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer,
MultiValueIndex, MultiValuedFastFieldReader,
AliveBitSet, Column, CompositeFastFieldSerializer,
};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
use crate::indexer::sorted_doc_id_column::RemappedDocIdColumn;
use crate::indexer::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueColumn;
// use crate::indexer::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueColumn;
use crate::indexer::SegmentSerializer;
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::schema::{Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{
@@ -249,11 +249,13 @@ impl IndexMerger {
fn write_fast_fields(
&self,
fast_field_serializer: &mut CompositeFastFieldSerializer,
fast_field_wrt: &mut WritePtr,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
debug_time!("write-fast-fields");
debug_time!("wrie-fast-fields");
todo!();
/*
for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type();
@@ -306,74 +308,7 @@ impl IndexMerger {
}
}
}
Ok(())
}
// Used to merge `u128` multi-valued fast fields.
fn write_u128_multi_fast_field(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
let segment_and_ff_readers: Vec<(&SegmentReader, MultiValuedFastFieldReader<u128>)> = self
.readers
.iter()
.map(|segment_reader| {
let ff_reader: MultiValuedFastFieldReader<u128> = segment_reader
.fast_fields()
.u128s(self.schema.get_field_name(field))
.expect(
"Failed to find index for multivalued field. This is a bug in tantivy, \
please report.",
);
(segment_reader, ff_reader)
})
.collect::<Vec<_>>();
Self::write_1_n_fast_field_idx_generic(
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)?;
let num_vals = segment_and_ff_readers
.iter()
.map(|(segment_reader, reader)| {
// TODO implement generic version, implement reverse scan, all - deletes
if let Some(alive_bitset) = segment_reader.alive_bitset() {
alive_bitset
.iter_alive()
.map(|doc| reader.num_vals(doc))
.sum()
} else {
reader.total_num_vals()
}
})
.sum();
let fast_field_readers = segment_and_ff_readers
.into_iter()
.map(|(_, ff_reader)| ff_reader)
.collect::<Vec<_>>();
let iter_gen = || {
doc_id_mapping
.iter_old_doc_addrs()
.flat_map_with_buffer(|doc_addr, buffer| {
let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize];
fast_field_reader.get_vals(doc_addr.doc_id, buffer);
})
};
fast_field_serializer.create_u128_fast_field_with_idx(field, iter_gen, num_vals, 1)?;
*/
Ok(())
}
@@ -535,111 +470,6 @@ impl IndexMerger {
Ok(SegmentDocIdMapping::new(sorted_doc_ids, false))
}
// Writes the index column that points into the data of a 1:n fast field. Generic over
// `BytesFastFieldReader` and `MultiValuedFastFieldReader`: callers pass each segment
// reader together with the `MultiValueIndex` of its fast field reader.
//
// The per-segment indexes are wrapped, together with `doc_id_mapping`, in a
// `RemappedDocIdMultiValueIndexColumn`, which is then serialized as an auto-detected
// `u64` fast field for `field`.
fn write_1_n_fast_field_idx_generic(
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
segment_and_ff_readers: &[(&SegmentReader, &MultiValueIndex)],
) -> crate::Result<()> {
let column =
RemappedDocIdMultiValueIndexColumn::new(segment_and_ff_readers, doc_id_mapping);
fast_field_serializer.create_auto_detect_u64_fast_field(field, column)?;
Ok(())
}
/// Writes the fast field index (the index into the data, not the data itself) of a
/// multi-valued field.
///
/// Opens a `MultiValuedFastFieldReader<u64>` for `field` on every segment reader, then
/// delegates to `write_1_n_fast_field_idx_generic` with each reader's index.
///
/// # Panics
///
/// Panics if the multi-valued reader cannot be opened for one of the segments.
fn write_multi_value_fast_field_idx(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
// One (segment reader, multi-valued reader) pair per segment, in `self.readers` order.
let segment_and_ff_readers = self
.readers
.iter()
.map(|reader| {
let u64s_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_multi_reader::<u64>(self.schema.get_field_name(field))
.expect(
"Failed to find index for multivalued field. This is a bug in tantivy, \
please report.",
);
(reader, u64s_reader)
})
.collect::<Vec<_>>();
// Only the index part of each reader is needed by the generic index writer.
Self::write_1_n_fast_field_idx_generic(
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers
.iter()
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)
}
/// Merges a multi-valued fast field whose values are term ordinals.
///
/// The index column is written first through `write_multi_value_fast_field_idx`. The
/// values are then collected document by document in `doc_id_mapping` order, each old
/// term ordinal being translated through the per-segment table from
/// `term_ordinal_mappings`, and serialized as the value column (index `1`).
///
/// # Panics
///
/// Panics if a segment has no multi-valued `u64` reader for `field`. Slice indexing
/// also panics if a stored ordinal falls outside its segment's mapping table.
fn write_term_id_fast_field(
&self,
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
debug_time!("write-term-id-fast-field");
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
// The second contains the actual values.
// First we merge the idx fast field.
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let fast_field_reader = self
.readers
.iter()
.map(|reader| {
let ff_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields()
.u64s(self.schema.get_field_name(field))
.expect("Could not find multivalued u64 fast value reader.");
ff_reader
})
.collect::<Vec<_>>();
// We can now write the actual fast field values.
// In the case of hierarchical facets, they are actually term ordinals.
{
// `vals` accumulates the remapped ordinals of all documents; `buffer` is passed to
// `get_vals` for each document.
let mut vals = Vec::new();
let mut buffer = Vec::new();
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(old_doc_addr.segment_ord as usize);
let ff_reader = &fast_field_reader[old_doc_addr.segment_ord as usize];
ff_reader.get_vals(old_doc_addr.doc_id, &mut buffer);
for &prev_term_ord in &buffer {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
vals.push(new_term_ord);
}
}
let col = VecColumn::from(&vals[..]);
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx_and_codecs(
field,
col,
1,
&get_fastfield_codecs_for_multivalue(),
)?;
}
Ok(())
}
/// Creates a mapping if the segments are stacked. This is helpful to share code between
/// index sorting and the other cases.
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
@@ -664,78 +494,6 @@ impl IndexMerger {
);
Ok(SegmentDocIdMapping::new(mapping, true))
}
/// Merges a multi-valued fast field.
///
/// The index column is written first through `write_multi_value_fast_field_idx`. The
/// values are then exposed through a `RemappedDocIdMultiValueColumn` built over all
/// segment readers and `doc_id_mapping`, and serialized as the value column (index `1`).
fn write_multi_fast_field(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
// The second contains the actual values.
// First we merge the idx fast field.
self.write_multi_value_fast_field_idx(field, fast_field_serializer, doc_id_mapping)?;
let fastfield_accessor = RemappedDocIdMultiValueColumn::new(
&self.readers,
doc_id_mapping,
self.schema.get_field_name(field),
);
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx_and_codecs(
field,
fastfield_accessor,
1,
&get_fastfield_codecs_for_multivalue(),
)?;
Ok(())
}
/// Merges a bytes fast field.
///
/// Opens the bytes reader of `field` on every segment, writes the index column through
/// `write_1_n_fast_field_idx_generic`, then writes each document's bytes to the
/// serializer's bytes writer, following `doc_id_mapping` order.
///
/// # Panics
///
/// Panics if the bytes reader cannot be opened for one of the segments.
fn write_bytes_fast_field(
&self,
field: Field,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
// One (segment reader, bytes reader) pair per segment, in `self.readers` order.
let segment_and_ff_readers = self
.readers
.iter()
.map(|reader| {
let bytes_reader = reader
.fast_fields()
.bytes(self.schema.get_field_name(field))
.expect(
"Failed to find index for bytes field. This is a bug in tantivy, please \
report.",
);
(reader, bytes_reader)
})
.collect::<Vec<_>>();
Self::write_1_n_fast_field_idx_generic(
field,
fast_field_serializer,
doc_id_mapping,
&segment_and_ff_readers
.iter()
// Despite its name, `u64s_reader` is bound to the bytes reader collected above.
.map(|(segment_reader, u64s_reader)| {
(*segment_reader, u64s_reader.get_index_reader())
})
.collect::<Vec<_>>(),
)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field(field);
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let bytes_reader = &segment_and_ff_readers[old_doc_addr.segment_ord as usize].1;
let val = bytes_reader.get_bytes(old_doc_addr.doc_id);
serialize_vals.write_all(val)?;
}
// Flush explicitly so that a write error is propagated to the caller.
serialize_vals.flush()?;
Ok(())
}
fn write_postings_for_field(
&self,
@@ -1042,7 +800,7 @@ impl IndexMerger {
)?;
debug!("write-fastfields");
self.write_fast_fields(
serializer.get_fast_field_serializer(),
serializer.get_fast_field_write(),
term_ord_mappings,
&doc_id_mapping,
)?;
@@ -1060,13 +818,13 @@ mod tests {
use schema::FAST;
use crate::collector::tests::{
BytesFastFieldTestCollector, FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
FastFieldTestCollector, TEST_COLLECTOR_WITH_SCORE,
};
use crate::collector::{Count, FacetCollector};
use crate::collector::Count;
use crate::core::Index;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::{
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,
Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,
TextFieldIndexing, INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
@@ -1203,30 +961,28 @@ mod tests {
Some("a b c g")
);
}
{
let get_fast_vals = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
searcher.search(
&query,
&FastFieldTestCollector::for_field("score".to_string()),
)
};
let get_fast_vals_bytes = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
searcher.search(
&query,
&BytesFastFieldTestCollector::for_field(bytes_score_field),
)
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")])?,
vec![5, 7, 13]
);
assert_eq!(
get_fast_vals_bytes(vec![Term::from_field_text(text_field, "a")])?,
vec![0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0, 13]
);
}
// {
// let get_fast_vals = |terms: Vec<Term>| {
// let query = BooleanQuery::new_multiterms_query(terms);
// searcher.search(&query, &FastFieldTestCollector::for_field(score_field))
// };
// let get_fast_vals_bytes = |terms: Vec<Term>| {
// let query = BooleanQuery::new_multiterms_query(terms);
// searcher.search(
// &query,
// &BytesFastFieldTestCollector::for_field(bytes_score_field),
// )
// };
// assert_eq!(
// get_fast_vals(vec![Term::from_field_text(text_field, "a")])?,
// vec![5, 7, 13]
// );
// assert_eq!(
// get_fast_vals_bytes(vec![Term::from_field_text(text_field, "a")])?,
// vec![0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0, 13]
// );
// }
}
Ok(())
}
@@ -1247,18 +1003,20 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field("score".to_string());
let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
let collector = FastFieldTestCollector::for_field("score");
// let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
// searcher
// .search(&term_query, &(collector, bytes_collector))
// .map(|(scores, bytes)| {
// let mut score_bytes = &bytes[..];
// for &score in &scores {
// assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
// }
// scores
// })
searcher
.search(&term_query, &(collector, bytes_collector))
.map(|(scores, bytes)| {
let mut score_bytes = &bytes[..];
for &score in &scores {
assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
}
scores
})
.search(&term_query, &collector)
};
let empty_vec = Vec::<u64>::new();
@@ -1537,207 +1295,211 @@ mod tests {
}
Ok(())
}
#[test]
fn test_merge_facets_sort_none() {
test_merge_facets(None, true)
}
#[test]
fn test_merge_facets_sort_asc() {
// In the merge case this will go through the doc_id mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
true,
);
// In the merge case this will not go through the doc_id mapping code, because the data is
// sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
#[test]
fn test_merge_facets_sort_desc() {
// In the merge case this will go through the doc_id mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
true,
);
// In the merge case this will not go through the doc_id mapping code, because the data is
// sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
// TODO re-enable
// #[test]
// fn test_merge_facets_sort_none() {
// test_merge_facets(None, true)
// }
// #[test]
// fn test_merge_facets_sort_asc() {
// // In the merge case this will go through the doc_id mapping code
// test_merge_facets(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Desc,
// }),
// ..Default::default()
// }),
// true,
// );
// // In the merge case this will not go through the doc_id mapping code, because the data is
// // sorted and disjunct
// test_merge_facets(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Desc,
// }),
// ..Default::default()
// }),
// false,
// );
// }
// #[test]
// fn test_merge_facets_sort_desc() {
// // In the merge case this will go through the doc_id mapping code
// test_merge_facets(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Desc,
// }),
// ..Default::default()
// }),
// true,
// );
// // In the merge case this will not go through the doc_id mapping code, because the data is
// // sorted and disjunct
// test_merge_facets(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Desc,
// }),
// ..Default::default()
// }),
// false,
// );
// }
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
// ranges between segments so that merge algorithm can't apply certain optimizations
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let int_options = NumericOptions::default()
.set_fast()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let mut index_builder = Index::builder().schema(schema_builder.build());
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram().unwrap();
// let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
let mut int_val = 0;
{
let mut index_writer = index.writer_for_tests().unwrap();
let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
doc.add_u64(int_field, *int_val);
*int_val += 1;
index_writer.add_document(doc).unwrap();
};
// fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
// let mut schema_builder = schema::Schema::builder();
// let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
// let int_options = NumericOptions::default()
// .set_fast()
// .set_indexed();
// let int_field = schema_builder.add_u64_field("intval", int_options);
// let mut index_builder = Index::builder().schema(schema_builder.build());
// if let Some(settings) = index_settings {
// index_builder = index_builder.settings(settings);
// }
// let index = index_builder.create_in_ram().unwrap();
// // let index = Index::create_in_ram(schema_builder.build());
// let reader = index.reader().unwrap();
// let mut int_val = 0;
// {
// let mut index_writer = index.writer_for_tests().unwrap();
// let index_doc =
// |index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
// let mut doc = Document::default();
// for facet in doc_facets {
// doc.add_facet(facet_field, Facet::from(facet));
// }
// doc.add_u64(int_field, *int_val);
// *int_val += 1;
// index_writer.add_document(doc).unwrap();
// };
index_doc(
&mut index_writer,
&["/top/a/firstdoc", "/top/b"],
&mut int_val,
);
index_doc(
&mut index_writer,
&["/top/a/firstdoc", "/top/b", "/top/c"],
&mut int_val,
);
index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
// index_doc(
// &mut index_writer,
// &["/top/a/firstdoc", "/top/b"],
// &mut int_val,
// );
// index_doc(
// &mut index_writer,
// &["/top/a/firstdoc", "/top/b", "/top/c"],
// &mut int_val,
// );
// index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
// index_doc(&mut index_writer, &["/top/a"], &mut int_val);
index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
if force_segment_value_overlap {
index_doc(&mut index_writer, &["/top/d"], &mut 0);
index_doc(&mut index_writer, &["/top/e"], &mut 10);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the
// segments don't have disjoint
// ranges
} else {
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
}
index_doc(&mut index_writer, &["/top/b"], &mut int_val);
index_doc(&mut index_writer, &["/top/c"], &mut int_val);
index_writer.commit().expect("committed");
// index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
// if force_segment_value_overlap {
// index_doc(&mut index_writer, &["/top/d"], &mut 0);
// index_doc(&mut index_writer, &["/top/e"], &mut 10);
// index_writer.commit().expect("committed");
// index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the
// // segments don't have disjoint
// // ranges
// } else {
// index_doc(&mut index_writer, &["/top/d"], &mut int_val);
// index_doc(&mut index_writer, &["/top/e"], &mut int_val);
// index_writer.commit().expect("committed");
// index_doc(&mut index_writer, &["/top/a"], &mut int_val);
// }
// index_doc(&mut index_writer, &["/top/b"], &mut int_val);
// index_doc(&mut index_writer, &["/top/c"], &mut int_val);
// index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
index_writer.commit().expect("committed");
}
// index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
// index_writer.commit().expect("committed");
// }
reader.reload().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
let (count, facet_counts) = searcher
.search(&AllQuery, &(Count, facet_collector))
.unwrap();
assert_eq!(count, expected_num_docs);
let facets: Vec<(String, u64)> = facet_counts
.get("/top")
.map(|(facet, count)| (facet.to_string(), count))
.collect();
assert_eq!(
facets,
expected
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
};
test_searcher(
11,
&[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
test_searcher(
11,
&[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
}
// reader.reload().unwrap();
// let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
// let searcher = reader.searcher();
// let mut facet_collector = FacetCollector::for_field(facet_field);
// facet_collector.add_facet(Facet::from("/top"));
// let (count, facet_counts) = searcher
// .search(&AllQuery, &(Count, facet_collector))
// .unwrap();
// assert_eq!(count, expected_num_docs);
// let facets: Vec<(String, u64)> = facet_counts
// .get("/top")
// .map(|(facet, count)| (facet.to_string(), count))
// .collect();
// assert_eq!(
// facets,
// expected
// .iter()
// .map(|&(facet_str, count)| (String::from(facet_str), count))
// .collect::<Vec<_>>()
// );
// };
// test_searcher(
// 11,
// &[
// ("/top/a", 5),
// ("/top/b", 5),
// ("/top/c", 2),
// ("/top/d", 2),
// ("/top/e", 2),
// ("/top/f", 1),
// ],
// );
// // Merging the segments
// {
// let segment_ids = index
// .searchable_segment_ids()
// .expect("Searchable segments failed.");
// let mut index_writer = index.writer_for_tests().unwrap();
// index_writer
// .merge(&segment_ids)
// .wait()
// .expect("Merging failed");
// index_writer.wait_merging_threads().unwrap();
// reader.reload().unwrap();
// test_searcher(
// 11,
// &[
// ("/top/a", 5),
// ("/top/b", 5),
// ("/top/c", 2),
// ("/top/d", 2),
// ("/top/e", 2),
// ("/top/f", 1),
// ],
// );
// }
// Deleting one term
{
let mut index_writer = index.writer_for_tests().unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
index_writer.commit().unwrap();
reader.reload().unwrap();
test_searcher(
9,
&[
("/top/a", 3),
("/top/b", 3),
("/top/c", 1),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1),
],
);
}
}
// // Deleting one term
// {
// let mut index_writer = index.writer_for_tests().unwrap();
// let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
// let facet_term = Term::from_facet(facet_field, &facet);
// index_writer.delete_term(facet_term);
// index_writer.commit().unwrap();
// reader.reload().unwrap();
// test_searcher(
// 9,
// &[
// ("/top/a", 3),
// ("/top/b", 3),
// ("/top/c", 1),
// ("/top/d", 2),
// ("/top/e", 2),
// ("/top/f", 1),
// ],
// );
// }
// }
#[test]
fn test_bug_merge() -> crate::Result<()> {
@@ -1839,45 +1601,45 @@ mod tests {
{
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.fast_fields().u64s("intvals").unwrap();
// let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]);
// ff_reader.get_vals(0, &mut vals);
// assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(1, &mut vals);
assert_eq!(&vals, &[1, 2, 3]);
// ff_reader.get_vals(1, &mut vals);
// assert_eq!(&vals, &[1, 2, 3]);
ff_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4, 5]);
// ff_reader.get_vals(2, &mut vals);
// assert_eq!(&vals, &[4, 5]);
ff_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[1, 2]);
// ff_reader.get_vals(3, &mut vals);
// assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(4, &mut vals);
assert_eq!(&vals, &[1, 5]);
// ff_reader.get_vals(4, &mut vals);
// assert_eq!(&vals, &[1, 5]);
ff_reader.get_vals(5, &mut vals);
assert_eq!(&vals, &[3]);
// ff_reader.get_vals(5, &mut vals);
// assert_eq!(&vals, &[3]);
ff_reader.get_vals(6, &mut vals);
assert_eq!(&vals, &[17]);
// ff_reader.get_vals(6, &mut vals);
// assert_eq!(&vals, &[17]);
}
{
let segment = searcher.segment_reader(1u32);
let ff_reader = segment.fast_fields().u64s("intvals").unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[28, 27]);
// let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
// ff_reader.get_vals(0, &mut vals);
// assert_eq!(&vals, &[28, 27]);
ff_reader.get_vals(1, &mut vals);
assert_eq!(&vals, &[1_000]);
// ff_reader.get_vals(1, &mut vals);
// assert_eq!(&vals, &[1_000]);
}
{
let segment = searcher.segment_reader(2u32);
let ff_reader = segment.fast_fields().u64s("intvals").unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[20]);
// let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
// ff_reader.get_vals(0, &mut vals);
// assert_eq!(&vals, &[20]);
}
// Merging the segments
@@ -1892,37 +1654,37 @@ mod tests {
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.fast_fields().u64s("intvals").unwrap();
// let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]);
// ff_reader.get_vals(0, &mut vals);
// assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(1, &mut vals);
assert_eq!(&vals, &[1, 2, 3]);
// ff_reader.get_vals(1, &mut vals);
// assert_eq!(&vals, &[1, 2, 3]);
ff_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4, 5]);
// ff_reader.get_vals(2, &mut vals);
// assert_eq!(&vals, &[4, 5]);
ff_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[1, 2]);
// ff_reader.get_vals(3, &mut vals);
// assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(4, &mut vals);
assert_eq!(&vals, &[1, 5]);
// ff_reader.get_vals(4, &mut vals);
// assert_eq!(&vals, &[1, 5]);
ff_reader.get_vals(5, &mut vals);
assert_eq!(&vals, &[3]);
// ff_reader.get_vals(5, &mut vals);
// assert_eq!(&vals, &[3]);
ff_reader.get_vals(6, &mut vals);
assert_eq!(&vals, &[17]);
// ff_reader.get_vals(6, &mut vals);
// assert_eq!(&vals, &[17]);
ff_reader.get_vals(7, &mut vals);
assert_eq!(&vals, &[28, 27]);
// ff_reader.get_vals(7, &mut vals);
// assert_eq!(&vals, &[28, 27]);
ff_reader.get_vals(8, &mut vals);
assert_eq!(&vals, &[1_000]);
// ff_reader.get_vals(8, &mut vals);
// assert_eq!(&vals, &[1_000]);
ff_reader.get_vals(9, &mut vals);
assert_eq!(&vals, &[20]);
// ff_reader.get_vals(9, &mut vals);
// assert_eq!(&vals, &[20]);
}
Ok(())
}

View File

@@ -2,10 +2,10 @@
mod tests {
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
@@ -349,128 +349,131 @@ mod tests {
}
}
// Builds an index through `create_test_index` (defined outside this view) with
// `sort_by_field` set to `intval` in ascending order, then checks that the single resulting
// segment exposes fast fields, fieldnorms, search hits, postings and stored documents at the
// doc ids expected for that ordering.
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
false,
)
.unwrap();
// #[test]
// fn test_merge_sorted_index_asc() {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// false,
// )
// .unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
// The rest of the test inspects one segment only, so exactly one must exist.
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let fast_fields = segment_reader.fast_fields();
// Single-valued `intval` fast field: doc ids 0..=5 must hold the values in ascending order.
let fast_field = fast_fields.u64("intval").unwrap();
assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get_val(5), 1_000u64);
// let int_field = index.schema().get_field("intval").unwrap();
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let bytes_field = index.schema().get_field("bytes").unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_readers().last().unwrap();
// Helper: collects every value stored for `doc_id` in a multi-valued fast field.
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![];
fast_field.get_vals(doc_id, &mut vals);
vals
};
let fast_fields = segment_reader.fast_fields();
// Multi-valued `multi_numbers` fast field, checked per doc id (doc 0 has no values).
let fast_field = fast_fields.u64s("multi_numbers").unwrap();
assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
assert_eq!(&get_vals(&fast_field, 4), &[20]);
assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64(int_field).unwrap();
// assert_eq!(fast_field.get_val(0), 1u64);
// assert_eq!(fast_field.get_val(1), 2u64);
// assert_eq!(fast_field.get_val(2), 3u64);
// assert_eq!(fast_field.get_val(3), 10u64);
// assert_eq!(fast_field.get_val(4), 20u64);
// assert_eq!(fast_field.get_val(5), 1_000u64);
// `bytes` fast field: only doc ids 0, 2 and 5 are spot-checked.
let fast_field = fast_fields.bytes("bytes").unwrap();
assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
// let mut vals = vec![];
// fast_field.get_vals(doc_id, &mut vals);
// vals
// };
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
}
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// A fresh searcher (shadowing the earlier one) is used for the query and doc store checks.
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
// // test new field norm mapping
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
// }
// Helper: parses `term` against `text_field` and returns the doc ids of the top 3 hits.
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
// let searcher = index.reader().unwrap().searcher();
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
// let do_search = |term: &str| {
// let query = QueryParser::for_index(&index, vec![my_text_field])
// .parse_query(term)
// .unwrap();
// let top_docs: Vec<(f32, DocAddress)> =
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
assert_eq!(do_search("some"), vec![2]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![5]);
}
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
// };
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
// assert_eq!(do_search("some"), vec![2]);
// assert_eq!(do_search("blubber"), vec![3]);
// assert_eq!(do_search("biggest"), vec![5]);
// }
assert_eq!(postings.doc_freq(), 2);
// The fallback bitset is only consulted when the segment reports no alive bitset.
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
// // postings file
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let term_a = Term::from_field_text(my_text_field, "text");
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
// let mut postings = inverted_index
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
// .unwrap()
// .unwrap();
// Positions of the term in the first matching doc, then in the next one after `advance`.
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
// assert_eq!(postings.doc_freq(), 2);
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
// assert_eq!(
// postings.doc_freq_given_deletes(
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
// ),
// 2
// );
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// let mut output = vec![];
// postings.positions(&mut output);
// assert_eq!(output, vec![1, 3]);
// postings.advance();
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
}
}
// postings.positions(&mut output);
// assert_eq!(output, vec![1]);
// }
// // access doc store
// {
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
// }
// }
}
#[cfg(all(test, feature = "unstable"))]

View File

@@ -20,7 +20,7 @@ pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
mod sorted_doc_id_column;
mod sorted_doc_id_multivalue_column;
// mod sorted_doc_id_multivalue_column;
mod stamper;
use crossbeam_channel as channel;
@@ -58,7 +58,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(test)]
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
// use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
@@ -79,45 +79,45 @@ mod tests_mmap {
Ok(())
}
// The JSON field is declared with plain `TEXT` options (no dot expansion requested), so a
// key that contains dots can only be addressed by escaping those dots in the query.
#[test]
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
    let mut builder = Schema::builder();
    let json_field = builder.add_json_field("json", TEXT);
    let schema = builder.build();
    let index = Index::create_in_ram(schema);

    // Index one document whose JSON key contains literal dots.
    let mut writer = index.writer_for_tests().unwrap();
    let payload = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
    writer.add_document(doc!(json_field=>payload)).unwrap();
    writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    assert_eq!(searcher.num_docs(), 1);

    // The dots inside the key are escaped; only the first dot separates field and path.
    let query_parser = QueryParser::for_index(&index, Vec::new());
    let escaped_query = query_parser
        .parse_query(r#"json.k8s\.container\.name:prometheus"#)
        .unwrap();
    let hit_count = searcher.search(&escaped_query, &Count).unwrap();
    assert_eq!(hit_count, 1);
}
// #[test]
// fn test_json_field_expand_dots_disabled_dot_escaped_required() {
// let mut schema_builder = Schema::builder();
// let json_field = schema_builder.add_json_field("json", TEXT);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s\.container\.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
// The JSON field is declared with `set_expand_dots_enabled()`, and the test expects the
// dotted key to be reachable with an unescaped dotted path in the query.
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
    let mut builder = Schema::builder();
    let expand_dots_options: JsonObjectOptions =
        JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
    let json_field = builder.add_json_field("json", expand_dots_options);
    let schema = builder.build();
    let index = Index::create_in_ram(schema);

    // Index one document whose JSON key contains literal dots.
    let mut writer = index.writer_for_tests().unwrap();
    let payload = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
    writer.add_document(doc!(json_field=>payload)).unwrap();
    writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    assert_eq!(searcher.num_docs(), 1);

    // No escaping here: the dotted key is written as an ordinary path.
    let query_parser = QueryParser::for_index(&index, Vec::new());
    let plain_query = query_parser
        .parse_query(r#"json.k8s.container.name:prometheus"#)
        .unwrap();
    let hit_count = searcher.search(&plain_query, &Count).unwrap();
    assert_eq!(hit_count, 1);
}
// #[test]
// fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
// let mut schema_builder = Schema::builder();
// let json_options: JsonObjectOptions =
// JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
// let json_field = schema_builder.add_json_field("json", json_options);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s.container.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
}

View File

@@ -1,4 +1,7 @@
use common::TerminatingWrite;
use crate::core::{Segment, SegmentComponent};
use crate::directory::WritePtr;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
@@ -9,7 +12,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: CompositeFastFieldSerializer,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -47,7 +50,6 @@ impl SegmentSerializer {
};
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
@@ -56,7 +58,7 @@ impl SegmentSerializer {
Ok(SegmentSerializer {
segment,
store_writer,
fast_field_serializer,
fast_field_write,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
@@ -81,8 +83,8 @@ impl SegmentSerializer {
}
/// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
pub fn get_fast_field_write(&mut self) -> &mut WritePtr {
&mut self.fast_field_write
}
/// Extract the field norm serializer.
@@ -102,7 +104,7 @@ impl SegmentSerializer {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.fast_field_write.terminate()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
Ok(())

View File

@@ -348,8 +348,7 @@ impl SegmentWriter {
pub fn add_document(&mut self, add_operation: AddOperation) -> crate::Result<()> {
let doc = add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
let doc_id = self.max_doc;
self.fast_field_writers.add_document(doc_id, &doc)?;
self.fast_field_writers.add_document(&doc)?;
self.index_document(&doc)?;
let doc_writer = self.segment_serializer.get_store_writer();
doc_writer.store(&doc, &self.schema)?;
@@ -410,8 +409,7 @@ fn remap_and_write(
)?;
debug!("fastfield-serialize");
fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
&term_ord_map,
serializer.get_fast_field_write(),
doc_id_map,
)?;

View File

@@ -263,7 +263,7 @@ mod indexer;
pub mod error;
pub mod tokenizer;
pub mod aggregation;
// pub mod aggregation;
pub mod collector;
pub mod directory;
pub mod fastfield;
@@ -1166,4 +1166,5 @@ pub mod tests {
);
assert_eq!(dt_from_ts_nanos.to_hms_micro(), offset_dt.to_hms_micro());
}
}

View File

@@ -2,7 +2,6 @@ use std::io;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
@@ -44,7 +43,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
) {
self.str_posting_writer.index_text(
doc_id,
@@ -52,7 +50,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer,
ctx,
indexing_position,
None,
);
}

View File

@@ -6,7 +6,6 @@ use std::ops::Range;
use rustc_hash::FxHashMap;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
@@ -135,7 +134,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
pos: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId;
) -> UnorderedTermId; // TODO remove UnorderedTermId
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
@@ -155,7 +154,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.len_bytes();
let mut num_tokens = 0;
@@ -175,11 +173,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32);
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);
}
self.subscribe(doc_id, start_position, term_buffer, ctx);
num_tokens += 1;
});

View File

@@ -15,7 +15,7 @@ mod more_like_this;
mod phrase_query;
mod query;
mod query_parser;
mod range_query;
// mod range_query;
mod regex_query;
mod reqopt_scorer;
mod scorer;
@@ -50,7 +50,7 @@ pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::RangeQuery;
// pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -13,10 +13,11 @@ use crate::core::Index;
use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
// use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query,
RangeQuery, TermQuery, TermSetQuery,
// RangeQuery,
TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -334,6 +335,8 @@ impl QueryParser {
json_path: &str,
phrase: &str,
) -> Result<Term, QueryParserError> {
todo!();
/*
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let field_supports_ff_range_queries = field_type.is_fast()
@@ -417,6 +420,7 @@ impl QueryParser {
Ok(Term::from_field_ip_addr(field, ip_v6))
}
}
*/
}
fn compute_logical_ast_for_leaf(
@@ -740,9 +744,11 @@ fn convert_literal_to_query(
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
} => { todo!();
// Box::new(RangeQuery::new_term_bounds(
// field, value_type, &lower, &upper,
// ))
} ,
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}

View File

@@ -4,7 +4,7 @@ use std::sync::Arc;
use fastfield_codecs::Column;
use crate::fastfield::{MakeZero, MultiValuedFastFieldReader};
use crate::fastfield::MakeZero;
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids

View File

@@ -8,10 +8,13 @@ use std::ops::{Bound, RangeInclusive};
use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
<<<<<<< HEAD
use crate::schema::Cardinality;
=======
use crate::schema::Field;
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
@@ -40,6 +43,7 @@ impl IPFastFieldRangeWeight {
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
<<<<<<< HEAD
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
@@ -74,6 +78,40 @@ impl Weight for IPFastFieldRangeWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
=======
todo!();
// let field_type = reader.schema().get_field_entry(self.field).field_type();
// match field_type.fastfield_cardinality().unwrap() {
// Cardinality::SingleValue => {
// let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// ip_addr_fast_field.min_value(),
// ip_addr_fast_field.max_value(),
// );
// let docset = RangeDocSet::new(
// value_range,
// FastFieldCardinality::SingleValue(ip_addr_fast_field),
// );
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// Cardinality::MultiValues => {
// let ip_addr_fast_field = reader.fast_fields().ip_addrs(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// ip_addr_fast_field.min_value(),
// ip_addr_fast_field.max_value(),
// );
// let docset = RangeDocSet::new(
// value_range,
// FastFieldCardinality::MultiValue(ip_addr_fast_field),
// );
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// }
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {

View File

@@ -6,10 +6,14 @@ use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::fast_field_range_query::RangeDocSet;
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
<<<<<<< HEAD
use crate::schema::Cardinality;
=======
use crate::schema::Field;
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
@@ -33,6 +37,7 @@ impl FastFieldRangeWeight {
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
<<<<<<< HEAD
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
@@ -63,6 +68,36 @@ impl Weight for FastFieldRangeWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
=======
todo!();
// let field_type = reader.schema().get_field_entry(self.field).field_type();
// match field_type.fastfield_cardinality().unwrap() {
// Cardinality::SingleValue => {
// let fast_field = reader.fast_fields().u64_lenient(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// fast_field.min_value(),
// fast_field.max_value(),
// );
// let docset =
// RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field));
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// Cardinality::MultiValues => {
// let fast_field = reader.fast_fields().u64s_lenient(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// fast_field.min_value(),
// fast_field.max_value(),
// );
// let docset =
// RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field));
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// }
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {

View File

@@ -8,7 +8,7 @@ use serde_json::Value as JsonValue;
use thiserror::Error;
use super::ip_options::IpAddrOptions;
use super::{Cardinality, IntoIpv6Addr};
use super:: IntoIpv6Addr;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{
@@ -241,27 +241,6 @@ impl FieldType {
}
}
/// Intended to return the fast field [`Cardinality`] of this field type, or
/// `None` when the field has no fast field cardinality.
///
/// # Panics
///
/// The body is currently a `todo!()` stub, so every call panics. The previous
/// per-variant implementation is kept below, commented out, for reference.
pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
todo!();
// Disabled implementation: bytes fields map to `SingleValue` and text fields
// to `MultiValues` (only when fast), numeric/date/ip fields defer to their
// options, facets are always `MultiValues`, and JSON objects yield `None`.
// match *self {
// FieldType::Bytes(ref bytes_options) => {
// bytes_options.is_fast().then_some(Cardinality::SingleValue)
// }
// FieldType::Str(ref text_options) => {
// text_options.is_fast().then_some(Cardinality::MultiValues)
// }
// FieldType::U64(ref int_options)
// | FieldType::I64(ref int_options)
// | FieldType::F64(ref int_options)
// | FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality(),
// FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
// FieldType::Facet(_) => Some(Cardinality::MultiValues),
// FieldType::JsonObject(_) => None,
// FieldType::IpAddr(ref ip_addr_options) =>
// ip_addr_options.get_fastfield_cardinality(), }
}
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {

View File

@@ -143,7 +143,7 @@ pub use self::json_object_options::JsonObjectOptions;
pub use self::named_field_document::NamedFieldDocument;
pub use self::numeric_options::NumericOptions;
#[allow(deprecated)]
pub use self::numeric_options::{Cardinality, IntOptions};
pub use self::numeric_options::IntOptions;
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
pub use self::term::Term;
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};

View File

@@ -4,18 +4,6 @@ use serde::{Deserialize, Serialize};
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// Expresses whether a field is single-valued or multi-valued.
///
/// With serde, the variants are (de)serialized as `"single"` and `"multi"`.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum Cardinality {
/// Each document must have exactly one value for the field.
#[serde(rename = "single")]
SingleValue,
/// Each document can have any number of values for the field.
///
/// This is more memory- and CPU-expensive than [`Cardinality::SingleValue`].
#[serde(rename = "multi")]
MultiValues,
}
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
/// Deprecated alias of [`NumericOptions`]; use [`NumericOptions`] instead.
pub type IntOptions = NumericOptions;