mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 02:22:54 +00:00
Facets
This commit is contained in:
@@ -39,6 +39,10 @@ impl BytesColumn {
|
||||
pub fn ords(&self) -> &Column<u64> {
    // Hands out a shared borrow of the term-ordinal column; nothing is copied.
    &self.term_ord_column
}
|
||||
|
||||
pub fn num_terms(&self) -> usize {
|
||||
self.dictionary.num_terms()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -532,7 +532,7 @@ fn serialize_ip_addr_column(
|
||||
}
|
||||
|
||||
fn send_to_serialize_column_mappable_to_u128<
|
||||
T: Copy + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
|
||||
T: Copy + Ord + std::fmt::Debug + Send + Sync + MonotonicallyMappableToU128 + PartialOrd,
|
||||
>(
|
||||
op_iterator: impl Iterator<Item = ColumnOperation<T>>,
|
||||
cardinality: Cardinality,
|
||||
|
||||
@@ -7,9 +7,9 @@ use fail::fail_point;
|
||||
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
|
||||
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Schema, Type};
|
||||
use crate::space_usage::SegmentSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::termdict::TermDictionary;
|
||||
@@ -90,8 +90,19 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated with a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
|
||||
todo!();
|
||||
pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
|
||||
let schema = self.schema();
|
||||
let field = schema.get_field(field_name)?;
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
if field_entry.field_type().value_type() != Type::Facet {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"`{field_name}` is not a facet field.`"
|
||||
)));
|
||||
}
|
||||
let Some(facet_column) = self.fast_fields().str_column_opt(field_name)? else {
|
||||
panic!("Facet Field `{field_name}` is missing. This should not happen");
|
||||
};
|
||||
Ok(FacetReader::new(facet_column))
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::str;
|
||||
use columnar::StrColumn;
|
||||
|
||||
use super::MultiValuedFastFieldReader;
|
||||
use crate::error::DataCorruption;
|
||||
use crate::schema::Facet;
|
||||
use crate::termdict::{TermDictionary, TermOrdinal};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
@@ -20,9 +18,7 @@ use crate::DocId;
|
||||
/// list of facets. This ordinal is segment local and
|
||||
/// only makes sense for a given segment.
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValuedFastFieldReader<u64>,
|
||||
term_dict: TermDictionary,
|
||||
buffer: Vec<u8>,
|
||||
facet_column: StrColumn,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
@@ -33,15 +29,8 @@ impl FacetReader {
|
||||
/// access the list of facet ords for a given document.
|
||||
/// - a `TermDictionary` that helps associating a facet to
|
||||
/// an ordinal and vice versa.
|
||||
pub fn new(
|
||||
term_ords: MultiValuedFastFieldReader<u64>,
|
||||
term_dict: TermDictionary,
|
||||
) -> FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
term_dict,
|
||||
buffer: vec![],
|
||||
}
|
||||
pub fn new(facet_column: StrColumn) -> FacetReader {
|
||||
FacetReader { facet_column }
|
||||
}
|
||||
|
||||
/// Returns the size of the sets of facets in the segment.
|
||||
@@ -50,31 +39,20 @@ impl FacetReader {
|
||||
///
|
||||
/// `Facet` ordinals range from `0` to `num_facets() - 1`.
|
||||
pub fn num_facets(&self) -> usize {
|
||||
self.term_dict.num_terms()
|
||||
}
|
||||
|
||||
/// Accessor for the facet term dictionary.
|
||||
pub fn facet_dict(&self) -> &TermDictionary {
|
||||
&self.term_dict
|
||||
self.facet_column.num_terms()
|
||||
}
|
||||
|
||||
/// Given a term ordinal returns the term associated with it.
|
||||
pub fn facet_from_ord(
|
||||
&mut self,
|
||||
facet_ord: TermOrdinal,
|
||||
output: &mut Facet,
|
||||
) -> crate::Result<()> {
|
||||
let found_term = self.term_dict.ord_to_term(facet_ord, &mut self.buffer)?;
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
let facet_str = str::from_utf8(&self.buffer[..])
|
||||
.map_err(|utf8_err| DataCorruption::comment_only(utf8_err.to_string()))?;
|
||||
output.set_facet_str(facet_str);
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) -> crate::Result<()> {
|
||||
let found_term = self.facet_column.ord_to_str(facet_ord, &mut output.0)?;
|
||||
assert!(found_term, "Term ordinal {facet_ord} no found.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return the list of facet ordinals associated with a document.
|
||||
pub fn facet_ords(&self, doc: DocId, output: &mut Vec<u64>) {
|
||||
self.term_ords.get_vals(doc, output);
|
||||
output.clear();
|
||||
output.extend(self.facet_column.ords().values(doc));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,26 +62,65 @@ mod tests {
|
||||
use crate::{DocAddress, Document, Index};
|
||||
|
||||
#[test]
fn test_facet_only_indexed() {
    let mut schema_builder = SchemaBuilder::default();
    let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests().unwrap();
    index_writer
        .add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))
        .unwrap();
    index_writer.commit().unwrap();
    let searcher = index.reader().unwrap().searcher();
    let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

    // The single indexed facet is expected at ordinal 0.
    let mut facet_ords = Vec::new();
    facet_reader.facet_ords(0u32, &mut facet_ords);
    assert_eq!(&facet_ords, &[0u64]);
    assert_eq!(facet_reader.num_facets(), 1);

    // The ordinal resolves back to the original facet path.
    let mut facet = Facet::default();
    facet_reader.facet_from_ord(0, &mut facet).unwrap();
    assert_eq!(facet.to_path_string(), "/a/b");

    // The facet is expected to be absent from the retrieved document.
    let doc = searcher.doc(DocAddress::new(0u32, 0u32)).unwrap();
    let value = doc.get_first(facet_field).and_then(Value::as_facet);
    assert_eq!(value, None);
}
|
||||
|
||||
#[test]
fn test_facet_several_facets() {
    let mut builder = SchemaBuilder::default();
    let facet_field = builder.add_facet_field("facet", FacetOptions::default());
    let index = Index::create_in_ram(builder.build());

    // Doc 0 carries a single facet, doc 1 carries two.
    let mut writer = index.writer_for_tests().unwrap();
    writer
        .add_document(doc!(facet_field=>Facet::from_text("/parent/child1").unwrap()))
        .unwrap();
    writer
        .add_document(doc!(
            facet_field=>Facet::from_text("/parent/child2").unwrap(),
            facet_field=>Facet::from_text("/parent/child1/blop").unwrap(),
        ))
        .unwrap();
    writer.commit().unwrap();

    let searcher = index.reader().unwrap().searcher();
    let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();

    // Per-document ordinals; the same buffer is reused across calls.
    let mut ords = Vec::new();
    facet_reader.facet_ords(0u32, &mut ords);
    assert_eq!(&ords, &[0u64]);
    facet_reader.facet_ords(1u32, &mut ords);
    assert_eq!(&ords, &[1u64, 2u64]);

    assert_eq!(facet_reader.num_facets(), 3);

    // Each ordinal must resolve to the expected facet path, even though doc 1
    // listed `/parent/child2` before `/parent/child1/blop`.
    let mut facet = Facet::default();
    for (ord, expected_path) in [
        (0, "/parent/child1"),
        (1, "/parent/child1/blop"),
        (2, "/parent/child2"),
    ] {
        facet_reader.facet_from_ord(ord, &mut facet).unwrap();
        assert_eq!(facet.to_path_string(), expected_path);
    }
}
|
||||
|
||||
#[test]
|
||||
@@ -116,13 +133,10 @@ mod tests {
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
assert_eq!(&facet_ords, &[0u64]);
|
||||
let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
|
||||
let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::as_facet);
|
||||
assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
|
||||
@@ -140,13 +154,10 @@ mod tests {
|
||||
index_writer.add_document(Document::default())?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert_eq!(&facet_ords, &[2u64]);
|
||||
assert_eq!(&facet_ords, &[0u64]);
|
||||
facet_reader.facet_ords(1u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
Ok(())
|
||||
@@ -163,10 +174,7 @@ mod tests {
|
||||
index_writer.add_document(Document::default())?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
.segment_reader(0u32)
|
||||
.facet_reader(facet_field)
|
||||
.unwrap();
|
||||
let facet_reader = searcher.segment_reader(0u32).facet_reader("facet").unwrap();
|
||||
let mut facet_ords = Vec::new();
|
||||
facet_reader.facet_ords(0u32, &mut facet_ords);
|
||||
assert!(facet_ords.is_empty());
|
||||
|
||||
@@ -27,7 +27,7 @@ pub use fastfield_codecs::Column;
|
||||
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
|
||||
// pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
// pub use self::facet_reader::FacetReader;
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::writer::FastFieldsWriter;
|
||||
use crate::schema::{Type, Value};
|
||||
@@ -36,7 +36,7 @@ use crate::DateTime;
|
||||
mod alive_bitset;
|
||||
// mod bytes;
|
||||
mod error;
|
||||
// mod facet_reader;
|
||||
mod facet_reader;
|
||||
mod readers;
|
||||
mod writer;
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ use columnar::{
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
|
||||
/// Provides access to all of the BitpackedFastFieldReader.
|
||||
|
||||
@@ -1,16 +1,9 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
|
||||
use common;
|
||||
use rustc_hash::FxHashMap;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
use columnar::{ColumnType, ColumnarWriter, NumericalValue};
|
||||
|
||||
use super::FastFieldType;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::schema::{Document, FieldType, Schema, Type, Value};
|
||||
use crate::{DatePrecision, DocId};
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
@@ -120,7 +113,13 @@ impl FastFieldsWriter {
|
||||
truncated_datetime.into(),
|
||||
);
|
||||
}
|
||||
Value::Facet(_) => todo!(),
|
||||
Value::Facet(facet) => {
|
||||
self.columnar_writer.record_str(
|
||||
doc_id,
|
||||
field_name.as_str(),
|
||||
facet.encoded_str(),
|
||||
);
|
||||
}
|
||||
Value::JsonObject(_) => todo!(),
|
||||
Value::IpAddr(ip_addr) => {
|
||||
self.columnar_writer
|
||||
|
||||
@@ -78,7 +78,7 @@ pub(crate) fn serialize_postings(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Debug)]
|
||||
pub(crate) struct IndexingPosition {
|
||||
pub num_tokens: u32,
|
||||
pub end_position: u32,
|
||||
|
||||
@@ -41,7 +41,7 @@ pub enum FacetParseError {
|
||||
/// its facet. In the example above, `/electronics/tv_and_video/`
|
||||
/// and `/electronics`.
|
||||
// The inner string is crate-visible so that crate-internal readers can decode
// a facet directly into it.
#[derive(Clone, Default, Eq, Hash, PartialEq, Ord, PartialOrd)]
pub struct Facet(pub(crate) String);
|
||||
|
||||
impl Facet {
|
||||
/// Returns a new instance of the "root facet"
|
||||
@@ -145,12 +145,6 @@ impl Facet {
|
||||
Facet(facet_string)
|
||||
}
|
||||
|
||||
/// Accessor for the inner buffer of the `Facet`.
|
||||
pub(crate) fn set_facet_str(&mut self, facet_str: &str) {
|
||||
self.0.clear();
|
||||
self.0.push_str(facet_str);
|
||||
}
|
||||
|
||||
/// Returns `true` if other is a `strict` subfacet of `self`.
|
||||
///
|
||||
/// Disclaimer: By strict we mean that the relation is not reflexive.
|
||||
|
||||
@@ -27,10 +27,12 @@ pub struct FacetTokenStream<'a> {
|
||||
|
||||
impl Tokenizer for FacetTokenizer {
|
||||
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
let mut token = Token::default();
|
||||
token.position = 0;
|
||||
FacetTokenStream {
|
||||
text,
|
||||
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
|
||||
token: Token::default(),
|
||||
token,
|
||||
}
|
||||
.into()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user