This commit is contained in:
Paul Masurel
2023-01-23 18:34:05 +09:00
parent d29d63a829
commit 1ff762bd8f
10 changed files with 103 additions and 84 deletions

View File

@@ -39,6 +39,10 @@ impl BytesColumn {
/// Returns a reference to the underlying term-ordinal column.
pub fn ords(&self) -> &Column<u64> {
&self.term_ord_column
}
/// Returns the number of terms held by this column's dictionary.
pub fn num_terms(&self) -> usize {
self.dictionary.num_terms()
}
}
#[derive(Clone)]

View File

@@ -532,7 +532,7 @@ fn serialize_ip_addr_column(
}
fn send_to_serialize_column_mappable_to_u128<
T: Copy + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
T: Copy + Ord + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
>(
op_iterator: impl Iterator<Item = ColumnOperation<T>>,
cardinality: Cardinality,

View File

@@ -7,9 +7,9 @@ use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
use crate::schema::{Field, FieldType, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
@@ -90,8 +90,19 @@ impl SegmentReader {
}
/// Accessor to the `FacetReader` associated with a given facet field.
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
todo!();
pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
let schema = self.schema();
let field = schema.get_field(field_name)?;
let field_entry = schema.get_field_entry(field);
if field_entry.field_type().value_type() != Type::Facet {
return Err(crate::TantivyError::SchemaError(format!(
"`{field_name}` is not a facet field.`"
)));
}
let Some(facet_column) = self.fast_fields().str_column_opt(field_name)? else {
panic!("Facet Field `{field_name}` is missing. This should not happen");
};
Ok(FacetReader::new(facet_column))
}
/// Accessor to the segment's `Field norms`'s reader.

View File

@@ -1,9 +1,7 @@
use std::str;
use columnar::StrColumn;
use super::MultiValuedFastFieldReader;
use crate::error::DataCorruption;
use crate::schema::Facet;
use crate::termdict::{TermDictionary, TermOrdinal};
use crate::termdict::TermOrdinal;
use crate::DocId;
/// The facet reader makes it possible to access the list of
@@ -20,9 +18,7 @@ use crate::DocId;
/// list of facets. This ordinal is segment local and
/// only makes sense for a given segment.
pub struct FacetReader {
term_ords: MultiValuedFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
facet_column: StrColumn,
}
impl FacetReader {
@@ -33,15 +29,8 @@ impl FacetReader {
/// access the list of facet ords for a given document.
/// - a `TermDictionary` that helps associating a facet to
/// an ordinal and vice versa.
pub fn new(
term_ords: MultiValuedFastFieldReader<u64>,
term_dict: TermDictionary,
) -> FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
/// Creates a `FacetReader` backed by the given `StrColumn`.
pub fn new(facet_column: StrColumn) -> FacetReader {
FacetReader { facet_column }
}
/// Returns the size of the set of facets in the segment.
@@ -50,31 +39,20 @@ impl FacetReader {
///
/// `Facet` ordinals range from `0` to `num_facets() - 1`.
pub fn num_facets(&self) -> usize {
self.term_dict.num_terms()
}
/// Accessor for the facet term dictionary.
pub fn facet_dict(&self) -> &TermDictionary {
&self.term_dict
self.facet_column.num_terms()
}
/// Given a term ordinal returns the term associated with it.
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> crate::Result<()> {
let found_term = self.term_dict.ord_to_term(facet_ord, &mut self.buffer)?;
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])
.map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?;
output.set_facet_str(facet_str);
/// Writes the facet associated with the ordinal `facet_ord` into `output`.
///
/// The term is written directly into the `String` wrapped by `output`,
/// so no intermediate buffer is required.
///
/// # Errors
/// Propagates any error returned by the column's `ord_to_str` lookup.
///
/// # Panics
/// Panics if the lookup reports that no term exists for `facet_ord`.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) -> crate::Result<()> {
    let found_term = self.facet_column.ord_to_str(facet_ord, &mut output.0)?;
    assert!(found_term, "Term ordinal {facet_ord} not found.");
    Ok(())
}
/// Return the list of facet ordinals associated with a document.
pub fn facet_ords(&self, doc: DocId, output: &mut Vec<u64>) {
self.term_ords.get_vals(doc, output);
output.clear();
output.extend(self.facet_column.ords().values(doc));
}
}
@@ -84,26 +62,65 @@ mod tests {
use crate::{DocAddress, Document, Index};
#[test]
fn test_facet_only_indexed() -> crate::Result<()> {
fn test_facet_only_indexed() {
let mut schema_builder = SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
assert_eq!(&facet_ords, &[0u64]);
assert_eq!(facet_reader.num_facets(), 1);
let mut facet = Facet::default();
facet_reader.facet_from_ord(0, &mut facet).unwrap();
assert_eq!(facet.to_path_string(), "/a/b");
let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
let value = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, None);
Ok(())
}
#[test]
fn test_facet_several_facets() {
    // Single facet field named "facet", default options.
    let mut schema_builder = SchemaBuilder::default();
    let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_for_tests().unwrap();

    // Doc 0 carries one facet; doc 1 carries two.
    let single_facet_doc = doc!(facet_field=>Facet::from_text("/parent/child1").unwrap());
    index_writer.add_document(single_facet_doc).unwrap();
    let two_facets_doc = doc!(
        facet_field=>Facet::from_text("/parent/child2").unwrap(),
        facet_field=>Facet::from_text("/parent/child1/blop").unwrap(),
    );
    index_writer.add_document(two_facets_doc).unwrap();
    index_writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

    // Expected ordinals per document.
    let mut ords = Vec::new();
    for (doc_id, expected_ords) in [(0u32, &[0u64][..]), (1u32, &[1u64, 2u64][..])] {
        facet_reader.facet_ords(doc_id, &mut ords);
        assert_eq!(&ords[..], expected_ords);
    }

    assert_eq!(facet_reader.num_facets(), 3);

    // Expected facet path for each ordinal.
    let mut facet = Facet::default();
    for (ord, expected_path) in [
        (0, "/parent/child1"),
        (1, "/parent/child1/blop"),
        (2, "/parent/child2"),
    ] {
        facet_reader.facet_from_ord(ord, &mut facet).unwrap();
        assert_eq!(facet.to_path_string(), expected_path);
    }
}
#[test]
@@ -116,13 +133,10 @@ mod tests {
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]);
assert_eq!(&facet_ords, &[0u64]);
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
@@ -140,13 +154,10 @@ mod tests {
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert_eq!(&facet_ords, &[2u64]);
assert_eq!(&facet_ords, &[0u64]);
facet_reader.facet_ords(1u32, &mut facet_ords);
assert!(facet_ords.is_empty());
Ok(())
@@ -163,10 +174,7 @@ mod tests {
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
.segment_reader(0u32)
.facet_reader(facet_field)
.unwrap();
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
let mut facet_ords = Vec::new();
facet_reader.facet_ords(0u32, &mut facet_ords);
assert!(facet_ords.is_empty());

View File

@@ -27,7 +27,7 @@ pub use fastfield_codecs::Column;
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
// pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
// pub use self::facet_reader::FacetReader;
pub use self::facet_reader::FacetReader;
pub use self::readers::FastFieldReaders;
pub use self::writer::FastFieldsWriter;
use crate::schema::{Type, Value};
@@ -36,7 +36,7 @@ use crate::DateTime;
mod alive_bitset;
// mod bytes;
mod error;
// mod facet_reader;
mod facet_reader;
mod readers;
mod writer;

View File

@@ -9,6 +9,7 @@ use columnar::{
use fastfield_codecs::Column;
use crate::directory::FileSlice;
use crate::fastfield::FacetReader;
use crate::space_usage::PerFieldSpaceUsage;
/// Provides access to all of the BitpackedFastFieldReader.

View File

@@ -1,16 +1,9 @@
use std::collections::HashMap;
use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
use common;
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use columnar::{ColumnType, ColumnarWriter, NumericalValue};
use super::FastFieldType;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
use crate::termdict::TermOrdinal;
use crate::schema::{Document, FieldType, Schema, Type, Value};
use crate::{DatePrecision, DocId};
/// The `FastFieldsWriter` groups all of the fast field writers.
@@ -120,7 +113,13 @@ impl FastFieldsWriter {
truncated_datetime.into(),
);
}
Value::Facet(_) => todo!(),
Value::Facet(facet) => {
self.columnar_writer.record_str(
doc_id,
field_name.as_str(),
facet.encoded_str(),
);
}
Value::JsonObject(_) => todo!(),
Value::IpAddr(ip_addr) => {
self.columnar_writer

View File

@@ -78,7 +78,7 @@ pub(crate) fn serialize_postings(
Ok(())
}
#[derive(Default)]
#[derive(Default, Debug)]
pub(crate) struct IndexingPosition {
pub num_tokens: u32,
pub end_position: u32,

View File

@@ -41,7 +41,7 @@ pub enum FacetParseError {
/// its facet. In the example above, `/electronics/tv_and_video/`
/// and `/electronics`.
#[derive(Clone, Default, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(String);
pub struct Facet(pub(crate) String);
impl Facet {
/// Returns a new instance of the "root facet"
@@ -145,12 +145,6 @@ impl Facet {
Facet(facet_string)
}
/// Accessor for the inner buffer of the `Facet`.
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
self.0.clear();
self.0.push_str(facet_str);
}
/// Returns `true` if other is a `strict` subfacet of `self`.
///
/// Disclaimer: By strict we mean that the relation is not reflexive.

View File

@@ -27,10 +27,12 @@ pub struct FacetTokenStream<'a> {
impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let mut token = Token::default();
token.position = 0;
FacetTokenStream {
text,
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
token: Token::default(),
token,
}
.into()
}