From 1ff762bd8f3a1b9cf9f736b56327a221317535f1 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 23 Jan 2023 18:34:05 +0900 Subject: [PATCH] Facets --- columnar/src/column/dictionary_encoded.rs | 4 + columnar/src/columnar/writer/mod.rs | 2 +- src/core/segment_reader.rs | 19 +++- src/fastfield/facet_reader.rs | 124 ++++++++++++---------- src/fastfield/mod.rs | 4 +- src/fastfield/readers.rs | 1 + src/fastfield/writer.rs | 19 ++-- src/postings/postings_writer.rs | 2 +- src/schema/facet.rs | 8 +- src/tokenizer/facet_tokenizer.rs | 4 +- 10 files changed, 103 insertions(+), 84 deletions(-) diff --git a/columnar/src/column/dictionary_encoded.rs b/columnar/src/column/dictionary_encoded.rs index b650ba698..e60202bb2 100644 --- a/columnar/src/column/dictionary_encoded.rs +++ b/columnar/src/column/dictionary_encoded.rs @@ -39,6 +39,10 @@ impl BytesColumn { pub fn ords(&self) -> &Column { &self.term_ord_column } + + pub fn num_terms(&self) -> usize { + self.dictionary.num_terms() + } } #[derive(Clone)] diff --git a/columnar/src/columnar/writer/mod.rs b/columnar/src/columnar/writer/mod.rs index 741042a7a..96a56c071 100644 --- a/columnar/src/columnar/writer/mod.rs +++ b/columnar/src/columnar/writer/mod.rs @@ -532,7 +532,7 @@ fn serialize_ip_addr_column( } fn send_to_serialize_column_mappable_to_u128< - T: Copy + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd, + T: Copy + Ord + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd, >( op_iterator: impl Iterator>, cardinality: Cardinality, diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 49234ba0b..581332171 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -7,9 +7,9 @@ use fail::fail_point; use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId}; use crate::directory::{CompositeFile, FileSlice}; use crate::error::DataCorruption; -use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders}; +use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; -use crate::schema::{Field, FieldType, IndexRecordOption, Schema}; +use crate::schema::{Field, FieldType, IndexRecordOption, Schema, Type}; use crate::space_usage::SegmentSpaceUsage; use crate::store::StoreReader; use crate::termdict::TermDictionary; @@ -90,8 +90,19 @@ impl SegmentReader { } /// Accessor to the `FacetReader` associated with a given `Field`. - pub fn facet_reader(&self, field: Field) -> crate::Result<()> { - todo!(); + pub fn facet_reader(&self, field_name: &str) -> crate::Result { + let schema = self.schema(); + let field = schema.get_field(field_name)?; + let field_entry = schema.get_field_entry(field); + if field_entry.field_type().value_type() != Type::Facet { + return Err(crate::TantivyError::SchemaError(format!( + "`{field_name}` is not a facet field.`" + ))); + } + let Some(facet_column) = self.fast_fields().str_column_opt(field_name)? else { + panic!("Facet Field `{field_name}` is missing. This should not happen"); + }; + Ok(FacetReader::new(facet_column)) } /// Accessor to the segment's `Field norms`'s reader. diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index 47eabbe08..9c7e2f032 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -1,9 +1,7 @@ -use std::str; +use columnar::StrColumn; -use super::MultiValuedFastFieldReader; -use crate::error::DataCorruption; use crate::schema::Facet; -use crate::termdict::{TermDictionary, TermOrdinal}; +use crate::termdict::TermOrdinal; use crate::DocId; /// The facet reader makes it possible to access the list of @@ -20,9 +18,7 @@ use crate::DocId; /// list of facets. This ordinal is segment local and /// only makes sense for a given segment. pub struct FacetReader { - term_ords: MultiValuedFastFieldReader, - term_dict: TermDictionary, - buffer: Vec, + facet_column: StrColumn, } impl FacetReader { @@ -33,15 +29,8 @@ impl FacetReader { /// access the list of facet ords for a given document. /// - a `TermDictionary` that helps associating a facet to /// an ordinal and vice versa. - pub fn new( - term_ords: MultiValuedFastFieldReader, - term_dict: TermDictionary, - ) -> FacetReader { - FacetReader { - term_ords, - term_dict, - buffer: vec![], - } + pub fn new(facet_column: StrColumn) -> FacetReader { + FacetReader { facet_column } } /// Returns the size of the sets of facets in the segment. @@ -50,31 +39,20 @@ impl FacetReader { /// /// `Facet` ordinals range from `0` to `num_facets() - 1`. pub fn num_facets(&self) -> usize { - self.term_dict.num_terms() - } - - /// Accessor for the facet term dictionary. - pub fn facet_dict(&self) -> &TermDictionary { - &self.term_dict + self.facet_column.num_terms() } /// Given a term ordinal returns the term associated with it. - pub fn facet_from_ord( - &mut self, - facet_ord: TermOrdinal, - output: &mut Facet, - ) -> crate::Result<()> { - let found_term = self.term_dict.ord_to_term(facet_ord, &mut self.buffer)?; - assert!(found_term, "Term ordinal {} no found.", facet_ord); - let facet_str = str::from_utf8(&self.buffer[..]) - .map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?; - output.set_facet_str(facet_str); + pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) -> crate::Result<()> { + let found_term = self.facet_column.ord_to_str(facet_ord, &mut output.0)?; + assert!(found_term, "Term ordinal {facet_ord} no found."); Ok(()) } /// Return the list of facet ordinals associated with a document. pub fn facet_ords(&self, doc: DocId, output: &mut Vec) { - self.term_ords.get_vals(doc, output); + output.clear(); + output.extend(self.facet_column.ords().values(doc)); } } @@ -84,26 +62,65 @@ mod tests { use crate::{DocAddress, Document, Index}; #[test] - fn test_facet_only_indexed() -> crate::Result<()> { + fn test_facet_only_indexed() { let mut schema_builder = SchemaBuilder::default(); let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let facet_reader = searcher - .segment_reader(0u32) - .facet_reader(facet_field) + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer + .add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap())) .unwrap(); + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); let mut facet_ords = Vec::new(); facet_reader.facet_ords(0u32, &mut facet_ords); - assert_eq!(&facet_ords, &[2u64]); - let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; + assert_eq!(&facet_ords, &[0u64]); + assert_eq!(facet_reader.num_facets(), 1); + let mut facet = Facet::default(); + facet_reader.facet_from_ord(0, &mut facet).unwrap(); + assert_eq!(facet.to_path_string(), "/a/b"); + let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap(); let value = doc.get_first(facet_field).and_then(Value::as_facet); assert_eq!(value, None); - Ok(()) + } + + #[test] + fn test_facet_several_facets() { + let mut schema_builder = SchemaBuilder::default(); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer + .add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap())) + .unwrap(); + index_writer + .add_document(doc!( + facet_field=>Facet::from_text("/parent/child2").unwrap(), + facet_field=>Facet::from_text("/parent/child1/blop").unwrap(), + )) + .unwrap(); + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); + let mut facet_ords = Vec::new(); + + facet_reader.facet_ords(0u32, &mut facet_ords); + assert_eq!(&facet_ords, &[0u64]); + + facet_reader.facet_ords(1u32, &mut facet_ords); + assert_eq!(&facet_ords, &[1u64, 2u64]); + + assert_eq!(facet_reader.num_facets(), 3); + let mut facet = Facet::default(); + facet_reader.facet_from_ord(0, &mut facet).unwrap(); + assert_eq!(facet.to_path_string(), "/parent/child1"); + facet_reader.facet_from_ord(1, &mut facet).unwrap(); + assert_eq!(facet.to_path_string(), "/parent/child1/blop"); + facet_reader.facet_from_ord(2, &mut facet).unwrap(); + assert_eq!(facet.to_path_string(), "/parent/child2"); } #[test] @@ -116,13 +133,10 @@ mod tests { index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; index_writer.commit()?; let searcher = index.reader()?.searcher(); - let facet_reader = searcher - .segment_reader(0u32) - .facet_reader(facet_field) - .unwrap(); + let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); let mut facet_ords = Vec::new(); facet_reader.facet_ords(0u32, &mut facet_ords); - assert_eq!(&facet_ords, &[2u64]); + assert_eq!(&facet_ords, &[0u64]); let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet); assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); @@ -140,13 +154,10 @@ mod tests { index_writer.add_document(Document::default())?; index_writer.commit()?; let searcher = index.reader()?.searcher(); - let facet_reader = searcher - .segment_reader(0u32) - .facet_reader(facet_field) - .unwrap(); + let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); let mut facet_ords = Vec::new(); facet_reader.facet_ords(0u32, &mut facet_ords); - assert_eq!(&facet_ords, &[2u64]); + assert_eq!(&facet_ords, &[0u64]); facet_reader.facet_ords(1u32, &mut facet_ords); assert!(facet_ords.is_empty()); Ok(()) @@ -163,10 +174,7 @@ mod tests { index_writer.add_document(Document::default())?; index_writer.commit()?; let searcher = index.reader()?.searcher(); - let facet_reader = searcher - .segment_reader(0u32) - .facet_reader(facet_field) - .unwrap(); + let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap(); let mut facet_ords = Vec::new(); facet_reader.facet_ords(0u32, &mut facet_ords); assert!(facet_ords.is_empty()); diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 4f863466c..0107497fe 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -27,7 +27,7 @@ pub use fastfield_codecs::Column; pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet}; // pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; -// pub use self::facet_reader::FacetReader; +pub use self::facet_reader::FacetReader; pub use self::readers::FastFieldReaders; pub use self::writer::FastFieldsWriter; use crate::schema::{Type, Value}; @@ -36,7 +36,7 @@ use crate::DateTime; mod alive_bitset; // mod bytes; mod error; -// mod facet_reader; +mod facet_reader; mod readers; mod writer; diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 653941faa..f32d09f16 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -9,6 +9,7 @@ use columnar::{ use fastfield_codecs::Column; use crate::directory::FileSlice; +use crate::fastfield::FacetReader; use crate::space_usage::PerFieldSpaceUsage; /// Provides access to all of the BitpackedFastFieldReader. diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 3a0679694..0f0e83849 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,16 +1,9 @@ -use std::collections::HashMap; use std::io; -use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue}; -use common; -use rustc_hash::FxHashMap; -use tantivy_bitpacker::BlockedBitpacker; +use columnar::{ColumnType, ColumnarWriter, NumericalValue}; -use super::FastFieldType; use crate::indexer::doc_id_mapping::DocIdMapping; -use crate::postings::UnorderedTermId; -use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value}; -use crate::termdict::TermOrdinal; +use crate::schema::{Document, FieldType, Schema, Type, Value}; use crate::{DatePrecision, DocId}; /// The `FastFieldsWriter` groups all of the fast field writers. @@ -120,7 +113,13 @@ impl FastFieldsWriter { truncated_datetime.into(), ); } - Value::Facet(_) => todo!(), + Value::Facet(facet) => { + self.columnar_writer.record_str( + doc_id, + field_name.as_str(), + facet.encoded_str(), + ); + } Value::JsonObject(_) => todo!(), Value::IpAddr(ip_addr) => { self.columnar_writer diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index bf6d5474e..baf3c6ac8 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -78,7 +78,7 @@ pub(crate) fn serialize_postings( Ok(()) } -#[derive(Default)] +#[derive(Default, Debug)] pub(crate) struct IndexingPosition { pub num_tokens: u32, pub end_position: u32, diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 8c5c99916..8bbd3eb9b 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -41,7 +41,7 @@ pub enum FacetParseError { /// its facet. In the example above, `/electronics/tv_and_video/` /// and `/electronics`. #[derive(Clone, Default, Eq, Hash, PartialEq, Ord, PartialOrd)] -pub struct Facet(String); +pub struct Facet(pub(crate) String); impl Facet { /// Returns a new instance of the "root facet" @@ -145,12 +145,6 @@ impl Facet { Facet(facet_string) } - /// Accessor for the inner buffer of the `Facet`. - pub(crate) fn set_facet_str(&mut self, facet_str: &str) { - self.0.clear(); - self.0.push_str(facet_str); - } - /// Returns `true` if other is a `strict` subfacet of `self`. /// /// Disclaimer: By strict we mean that the relation is not reflexive. diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 963e4e30c..8a7f28226 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -27,10 +27,12 @@ pub struct FacetTokenStream<'a> { impl Tokenizer for FacetTokenizer { fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + let mut token = Token::default(); + token.position = 0; FacetTokenStream { text, state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. - token: Token::default(), + token, } .into() }