diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index c2dc28880..b8e828c9a 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -100,7 +100,7 @@ impl SegmentReader { let field_entry = self.schema.get_field_entry(field); match field_entry.field_type() { - FieldType::HierarchicalFacet(_) => { + FieldType::Facet(_) => { let term_ords_reader = self.fast_fields().u64s(field)?; let termdict = self .termdict_composite diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 46b9099c1..c5345cf97 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -110,7 +110,7 @@ impl FastValue for u64 { fn fast_field_cardinality(field_type: &FieldType) -> Option { match *field_type { FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(), - FieldType::HierarchicalFacet(_) => Some(Cardinality::MultiValues), + FieldType::Facet(_) => Some(Cardinality::MultiValues), _ => None, } } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index b85754641..b20793f2b 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -40,7 +40,7 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality FieldType::Date(options) => options .get_fastfield_cardinality() .map(|cardinality| (FastType::Date, cardinality)), - FieldType::HierarchicalFacet(_) => Some((FastType::U64, Cardinality::MultiValues)), + FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)), _ => None, } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index f7d4110ff..388ee19a9 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -54,7 +54,7 @@ impl FastFieldsWriter { None => {} } } - FieldType::HierarchicalFacet(_) => { + FieldType::Facet(_) => { let fast_field_writer = MultiValuedFastFieldWriter::new(field, true); multi_values_writers.push(fast_field_writer); } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index afb2bec03..d16b946ba 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -294,10 +294,10 @@ impl IndexMerger { for (field, field_entry) in self.schema.fields() { let field_type = field_entry.field_type(); match field_type { - FieldType::HierarchicalFacet(_) => { + FieldType::Facet(_) => { let term_ordinal_mapping = term_ord_mappings .remove(&field) - .expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\ + .expect("Logic Error in Tantivy (Please report). 
Facet field should have required a\ `term_ordinal_mapping`."); self.write_hierarchical_facet_field( field, @@ -844,7 +844,7 @@ impl IndexMerger { } let mut term_ord_mapping_opt = match field_type { - FieldType::HierarchicalFacet(_) => Some(TermOrdinalMapping::new(max_term_ords)), + FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)), _ => None, }; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 34228069d..eb40c86ef 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -2,7 +2,6 @@ use super::{ doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}, operation::AddOperation, }; -use crate::fastfield::FastFieldsWriter; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; use crate::indexer::segment_serializer::SegmentSerializer; use crate::postings::compute_table_size; @@ -18,6 +17,7 @@ use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::Opstamp; use crate::{core::Segment, store::StoreWriter}; +use crate::{fastfield::FastFieldsWriter, schema::Type}; use crate::{DocId, SegmentComponent}; /// Computes the initial size of the hash table. @@ -173,8 +173,8 @@ impl SegmentWriter { let (term_buffer, multifield_postings) = (&mut self.term_buffer, &mut self.multifield_postings); match *field_entry.field_type() { - FieldType::HierarchicalFacet(_) => { - term_buffer.set_field(field); + FieldType::Facet(_) => { + term_buffer.set_field(Type::Facet, field); for field_value in field_values { let facet = field_value.value().facet().ok_or_else(make_schema_error)?; let facet_str = facet.encoded_str(); @@ -238,7 +238,7 @@ impl SegmentWriter { } FieldType::U64(_) => { for field_value in field_values { - term_buffer.set_field(field_value.field()); + term_buffer.set_field(Type::U64, field_value.field()); let u64_val = field_value .value() .u64_value() @@ -249,7 +249,7 @@ impl SegmentWriter { } FieldType::Date(_) => { for field_value in field_values { - term_buffer.set_field(field_value.field()); + term_buffer.set_field(Type::Date, field_value.field()); let date_val = field_value .value() .date_value() @@ -260,7 +260,7 @@ impl SegmentWriter { } FieldType::I64(_) => { for field_value in field_values { - term_buffer.set_field(field_value.field()); + term_buffer.set_field(Type::I64, field_value.field()); let i64_val = field_value .value() .i64_value() @@ -271,7 +271,7 @@ impl SegmentWriter { } FieldType::F64(_) => { for field_value in field_values { - term_buffer.set_field(field_value.field()); + term_buffer.set_field(Type::F64, field_value.field()); let f64_val = field_value .value() .f64_value() @@ -282,7 +282,7 @@ impl SegmentWriter { } FieldType::Bytes(_) => { for field_value in field_values { - term_buffer.set_field(field_value.field()); + term_buffer.set_field(Type::Bytes, field_value.field()); let bytes = field_value .value() .bytes_value() diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index f1d90afae..1a7d45587 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -5,8 +5,8 @@ use crate::postings::recorder::{ }; use crate::postings::UnorderedTermId; use crate::postings::{FieldSerializer, InvertedIndexSerializer}; -use crate::schema::IndexRecordOption; use crate::schema::{Field, FieldEntry, FieldType, Schema, Term}; +use crate::schema::{IndexRecordOption, Type}; use crate::termdict::TermOrdinal; use crate::tokenizer::TokenStream; use crate::tokenizer::{Token, MAX_TOKEN_LEN}; @@ -39,9 +39,7 @@ 
fn posting_from_field_entry(field_entry: &FieldEntry) -> Box | FieldType::F64(_) | FieldType::Date(_) | FieldType::Bytes(_) - | FieldType::HierarchicalFacet(_) => { - SpecializedPostingsWriter::::new_boxed() - } + | FieldType::Facet(_) => SpecializedPostingsWriter::::new_boxed(), } } @@ -53,11 +51,11 @@ pub struct MultiFieldPostingsWriter { } fn make_field_partition( - term_offsets: &[(&[u8], Addr, UnorderedTermId)], + term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)], ) -> Vec<(Field, Range)> { let term_offsets_it = term_offsets .iter() - .map(|(key, _, _)| Term::wrap(key).field()) + .map(|(term, _, _)| term.field()) .enumerate(); let mut prev_field_opt = None; let mut fields = vec![]; @@ -132,10 +130,10 @@ impl MultiFieldPostingsWriter { fieldnorm_readers: FieldNormReaders, doc_id_map: Option<&DocIdMapping>, ) -> crate::Result>> { - let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = + let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> = Vec::with_capacity(self.term_index.len()); term_offsets.extend(self.term_index.iter()); - term_offsets.sort_unstable_by_key(|&(k, _, _)| k); + term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone()); let mut unordered_term_mappings: HashMap> = HashMap::new(); @@ -146,7 +144,7 @@ impl MultiFieldPostingsWriter { let field_entry = self.schema.get_field_entry(field); match *field_entry.field_type() { - FieldType::Str(_) | FieldType::HierarchicalFacet(_) => { + FieldType::Str(_) | FieldType::Facet(_) => { // populating the (unordered term ord) -> (ordered term ord) mapping // for the field. let unordered_term_ids = term_offsets[byte_offsets.clone()] @@ -210,7 +208,7 @@ pub trait PostingsWriter { /// The actual serialization format is handled by the `PostingsSerializer`. fn serialize( &self, - term_addrs: &[(&[u8], Addr, UnorderedTermId)], + term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], serializer: &mut FieldSerializer<'_>, term_heap: &MemoryArena, heap: &MemoryArena, @@ -227,7 +225,7 @@ pub trait PostingsWriter { heap: &mut MemoryArena, term_buffer: &mut Term, ) -> u32 { - term_buffer.set_field(field); + term_buffer.set_field(Type::Str, field); let mut sink = |token: &Token| { // We skip all tokens with a len greater than u16. 
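                // (A term key in the hashmap is length-prefixed by a u16, and it now
                // carries a 4-byte field id plus a 1-byte type code ahead of the token
                // text, hence MAX_TOKEN_LEN = u16::MAX - 5; see tokenizer/mod.rs below.)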
if token.text.len() <= MAX_TOKEN_LEN { @@ -281,7 +279,7 @@ impl PostingsWriter for SpecializedPostingsWriter ) -> UnorderedTermId { debug_assert!(term.as_slice().len() >= 4); self.total_num_tokens += 1; - term_index.mutate_or_create(term, |opt_recorder: Option| { + term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option| { if let Some(mut recorder) = opt_recorder { let current_doc = recorder.current_doc(); if current_doc != doc { @@ -301,17 +299,17 @@ impl PostingsWriter for SpecializedPostingsWriter fn serialize( &self, - term_addrs: &[(&[u8], Addr, UnorderedTermId)], + term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], serializer: &mut FieldSerializer<'_>, termdict_heap: &MemoryArena, heap: &MemoryArena, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { let mut buffer_lender = BufferLender::default(); - for &(term_bytes, addr, _) in term_addrs { - let recorder: Rec = termdict_heap.read(addr); + for (term, addr, _) in term_addrs { + let recorder: Rec = termdict_heap.read(*addr); let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); - serializer.new_term(&term_bytes[4..], term_doc_freq)?; + serializer.new_term(term.value_bytes(), term_doc_freq)?; recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map); serializer.close_term()?; } diff --git a/src/postings/stacker/term_hashmap.rs b/src/postings/stacker/term_hashmap.rs index 5676df00c..e19b0d121 100644 --- a/src/postings/stacker/term_hashmap.rs +++ b/src/postings/stacker/term_hashmap.rs @@ -3,6 +3,7 @@ use murmurhash32::murmurhash2; use super::{Addr, MemoryArena}; use crate::postings::stacker::memory_arena::store; use crate::postings::UnorderedTermId; +use crate::Term; use byteorder::{ByteOrder, NativeEndian}; use std::iter; use std::mem; @@ -81,13 +82,13 @@ pub struct Iter<'a> { } impl<'a> Iterator for Iter<'a> { - type Item = (&'a [u8], Addr, UnorderedTermId); + type Item = (Term<&'a [u8]>, Addr, UnorderedTermId); fn next(&mut self) -> Option { self.inner.next().cloned().map(move |bucket: usize| { let kv = self.hashmap.table[bucket]; let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr); - (key, offset, kv.unordered_term_id) + (Term::wrap(key), offset, kv.unordered_term_id) }) } } @@ -189,21 +190,19 @@ impl TermHashMap { /// will be in charge of returning a default value. /// If the key already as an associated value, then it will be passed /// `Some(previous_value)`. - pub fn mutate_or_create( + pub fn mutate_or_create( &mut self, - key: S, + key: &[u8], mut updater: TMutator, ) -> UnorderedTermId where - S: AsRef<[u8]>, V: Copy + 'static, TMutator: FnMut(Option) -> V, { if self.is_saturated() { self.resize(); } - let key_bytes: &[u8] = key.as_ref(); - let hash = murmurhash2(key.as_ref()); + let hash = murmurhash2(key); let mut probe = self.probe(hash); loop { let bucket = probe.next_probe(); @@ -211,21 +210,18 @@ impl TermHashMap { if kv.is_empty() { // The key does not exists yet. 
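                // A fresh entry is laid out in the arena as
                // [key len: u16][key bytes][value: V].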
let val = updater(None); - let num_bytes = - std::mem::size_of::() + key_bytes.len() + std::mem::size_of::(); + let num_bytes = std::mem::size_of::() + key.len() + std::mem::size_of::(); let key_addr = self.heap.allocate_space(num_bytes); { let data = self.heap.slice_mut(key_addr, num_bytes); - NativeEndian::write_u16(data, key_bytes.len() as u16); - let stop = 2 + key_bytes.len(); - data[2..stop].copy_from_slice(key_bytes); + NativeEndian::write_u16(data, key.len() as u16); + let stop = 2 + key.len(); + data[2..stop].copy_from_slice(key); store(&mut data[stop..], val); } return self.set_bucket(hash, key_addr, bucket); } else if kv.hash == hash { - if let Some(val_addr) = - self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr) - { + if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) { let v = self.heap.read(val_addr); let new_v = updater(Some(v)); self.heap.write_at(val_addr, new_v); @@ -245,25 +241,18 @@ mod tests { #[test] fn test_hash_map() { let mut hash_map: TermHashMap = TermHashMap::new(18); - { - hash_map.mutate_or_create("abc", |opt_val: Option| { - assert_eq!(opt_val, None); - 3u32 - }); - } - { - hash_map.mutate_or_create("abcd", |opt_val: Option| { - assert_eq!(opt_val, None); - 4u32 - }); - } - { - hash_map.mutate_or_create("abc", |opt_val: Option| { - assert_eq!(opt_val, Some(3u32)); - 5u32 - }); - } - + hash_map.mutate_or_create(b"abc", |opt_val: Option| { + assert_eq!(opt_val, None); + 3u32 + }); + hash_map.mutate_or_create(b"abcd", |opt_val: Option| { + assert_eq!(opt_val, None); + 4u32 + }); + hash_map.mutate_or_create(b"abc", |opt_val: Option| { + assert_eq!(opt_val, Some(3u32)); + 5u32 + }); let mut vanilla_hash_map = HashMap::new(); let iter_values = hash_map.iter(); for (key, addr, _) in iter_values { diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 784ee6a40..f3f9e3c5f 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -132,10 +132,15 @@ impl FuzzyTermQuery { match LEV_BUILDER.get(&(self.distance, self.transposition_cost_one)) { // Unwrap the option and build the Ok(AutomatonWeight) Some(automaton_builder) => { + let term_text = self.term.as_str().ok_or_else(|| { + crate::TantivyError::InvalidArgument( + "The fuzzy term query requires a string term.".to_string(), + ) + })?; let automaton = if self.prefix { - automaton_builder.build_prefix_dfa(self.term.text()) + automaton_builder.build_prefix_dfa(term_text) } else { - automaton_builder.build_dfa(self.term.text()) + automaton_builder.build_dfa(term_text) }; Ok(AutomatonWeight::new( self.term.field(), diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 155eeda4b..02d0de819 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -180,7 +180,7 @@ impl MoreLikeThis { // extract the raw value, possibly tokenizing & filtering to update the term frequency map match field_entry.field_type() { - FieldType::HierarchicalFacet(_) => { + FieldType::Facet(_) => { let facets: Vec<&str> = field_values .iter() .map(|field_value| match *field_value.value() { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 086c52c0b..96839b99c 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -367,8 +367,8 @@ impl QueryParser { )) } } - FieldType::HierarchicalFacet(_) => match Facet::from_text(phrase) { - Ok(facet) => Ok(vec![(0, Term::from_field_text(field, 
facet.encoded_str()))]), + FieldType::Facet(_) => match Facet::from_text(phrase) { + Ok(facet) => Ok(vec![(0, Term::from_facet(field, &facet))]), Err(e) => Err(QueryParserError::from(e)), }, FieldType::Bytes(_) => { @@ -665,7 +665,7 @@ mod test { let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap(); assert_eq!( format!("{:?}", query), - "TermQuery(Term(field=11,bytes=[114, 111, 111, 116, 0, 98, 114, 97, 110, 99, 104, 0, 108, 101, 97, 102]))" + r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"# ); } @@ -678,7 +678,7 @@ mod test { let query = query_parser.parse_query("text:hello").unwrap(); assert_eq!( format!("{:?}", query), - "Boost(query=TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111])), boost=2)" + r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"# ); } @@ -704,7 +704,7 @@ mod test { let query = query_parser.parse_query("text:hello^2").unwrap(); assert_eq!( format!("{:?}", query), - "Boost(query=Boost(query=TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111])), boost=2), boost=2)" + r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"# ); } @@ -739,8 +739,7 @@ mod test { pub fn test_parse_query_untokenized() { test_parse_query_to_logical_ast_helper( "nottokenized:\"wordone wordtwo\"", - "Term(field=7,bytes=[119, 111, 114, 100, 111, 110, \ - 101, 32, 119, 111, 114, 100, 116, 119, 111])", + r#"Term(type=Str, field=7, val="wordone wordtwo")"#, false, ); } @@ -783,7 +782,7 @@ mod test { .is_ok()); test_parse_query_to_logical_ast_helper( "unsigned:2324", - "Term(field=3,bytes=[0, 0, 0, 0, 0, 0, 9, 20])", + "Term(type=U64, field=3, val=2324)", false, ); @@ -810,7 +809,7 @@ mod test { fn test_parse_bytes() { test_parse_query_to_logical_ast_helper( "bytes:YnVidQ==", - "Term(field=12,bytes=[98, 117, 98, 117])", + "Term(type=Bytes, field=12, val=[98, 117, 98, 117])", false, ); } @@ -825,7 +824,7 @@ mod test { fn test_parse_bytes_phrase() { test_parse_query_to_logical_ast_helper( "bytes:\"YnVidQ==\"", - "Term(field=12,bytes=[98, 117, 98, 117])", + "Term(type=Bytes, field=12, val=[98, 117, 98, 117])", false, ); } @@ -841,12 +840,12 @@ mod test { fn test_parse_query_to_ast_ab_c() { test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - "((+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98])) Term(field=0,bytes=[99]))", + r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#, false, ); test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - "(+(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98])) +Term(field=0,bytes=[99]))", + r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#, true, ); } @@ -855,19 +854,17 @@ mod test { pub fn test_parse_query_to_ast_single_term() { test_parse_query_to_logical_ast_helper( "title:toto", - "Term(field=0,bytes=[116, 111, 116, 111])", + r#"Term(type=Str, field=0, val="toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto", - "Term(field=0,bytes=[116, 111, 116, 111])", + r#"Term(type=Str, field=0, val="toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - "(+Term(field=0,bytes=[116, 111, 116, 111]) \ - -(Term(field=0,bytes=[116, 105, 116, 105]) \ - Term(field=1,bytes=[116, 105, 116, 105])))", + r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, false, ); } @@ -884,13 +881,12 @@ mod test { pub fn 
test_parse_query_to_ast_two_terms() { test_parse_query_to_logical_ast_helper( "title:a b", - "(Term(field=0,bytes=[97]) (Term(field=0,bytes=[98]) Term(field=1,bytes=[98])))", + r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, false, ); test_parse_query_to_logical_ast_helper( - "title:\"a b\"", - "\"[(0, Term(field=0,bytes=[97])), \ - (1, Term(field=0,bytes=[98]))]\"", + r#"title:"a b""#, + r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, false, ); } @@ -899,46 +895,39 @@ mod test { pub fn test_parse_query_to_ast_ranges() { test_parse_query_to_logical_ast_helper( "title:[a TO b]", - "(Included(Term(field=0,bytes=[97])) TO Included(Term(field=0,bytes=[98])))", + r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#, false, ); test_parse_query_to_logical_ast_helper( "[a TO b]", - "((Included(Term(field=0,bytes=[97])) TO \ - Included(Term(field=0,bytes=[98]))) \ - (Included(Term(field=1,bytes=[97])) TO \ - Included(Term(field=1,bytes=[98]))))", + r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO toto}", - "(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO \ - Excluded(Term(field=0,bytes=[116, 111, 116, 111])))", + r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{* TO toto}", - "(Unbounded TO Excluded(Term(field=0,bytes=[116, 111, 116, 111])))", + r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO *}", - "(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO Unbounded)", + r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#, false, ); test_parse_query_to_logical_ast_helper( "signed:{-5 TO 3}", - "(Excluded(Term(field=2,bytes=[127, 255, 255, 255, 255, 255, 255, 251])) TO \ - Excluded(Term(field=2,bytes=[128, 0, 0, 0, 0, 0, 0, 3])))", + r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#, false, ); test_parse_query_to_logical_ast_helper( "float:{-1.5 TO 1.5}", - "(Excluded(Term(field=10,bytes=[64, 7, 255, 255, 255, 255, 255, 255])) TO \ - Excluded(Term(field=10,bytes=[191, 248, 0, 0, 0, 0, 0, 0])))", + r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#, false, ); - test_parse_query_to_logical_ast_helper("*", "*", false); } @@ -1065,32 +1054,27 @@ mod test { pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( "title:toto", - "Term(field=0,bytes=[116, 111, 116, 111])", + r#"Term(type=Str, field=0, val="toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto", - "Term(field=0,bytes=[116, 111, 116, 111])", + r#"Term(type=Str, field=0, val="toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - "(+Term(field=0,bytes=[116, 111, 116, 111]) \ - -(Term(field=0,bytes=[116, 105, 116, 105]) \ - Term(field=1,bytes=[116, 105, 116, 105])))", + r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:a b", - "(+Term(field=0,bytes=[97]) \ - +(Term(field=0,bytes=[98]) \ - 
Term(field=1,bytes=[98])))", + r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:\"a b\"", - "\"[(0, Term(field=0,bytes=[97])), \ - (1, Term(field=0,bytes=[98]))]\"", + r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, true, ); } @@ -1099,8 +1083,8 @@ mod test { pub fn test_query_parser_hyphen() { test_parse_query_to_logical_ast_helper( "title:www-form-encoded", - "\"[(0, Term(field=0,bytes=[119, 119, 119])), (1, Term(field=0,bytes=[102, 111, 114, 109])), (2, Term(field=0,bytes=[101, 110, 99, 111, 100, 101, 100]))]\"", - false + r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#, + false, ); } @@ -1109,7 +1093,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a AND title:b", - "(+Term(field=0,bytes=[97]) +Term(field=0,bytes=[98]))", + r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#, default_conjunction, ); } @@ -1120,7 +1104,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a OR title:b", - "(Term(field=0,bytes=[97]) Term(field=0,bytes=[98]))", + r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#, default_conjunction, ); } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 3a7dfaa76..81cc0f0aa 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -175,7 +175,7 @@ mod tests { ); assert_eq!( format!("{:?}", term_query), - "TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111]))" + r#"TermQuery(Term(type=Str, field=1, val="hello"))"# ); } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index a0df46fb5..46a0dc392 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -31,11 +31,7 @@ impl Weight for TermWeight { return Err(does_not_match(doc)); } let mut explanation = scorer.explain(); - explanation.add_context(format!( - "Term ={:?}:{:?}", - self.term.field(), - self.term.value_bytes() - )); + explanation.add_context(format!("Term={:?}", self.term,)); Ok(explanation) } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index be1851ce0..c6271944a 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -85,7 +85,7 @@ impl FieldEntry { assert!(is_valid_field_name(&field_name)); FieldEntry { name: field_name, - field_type: FieldType::HierarchicalFacet(field_type), + field_type: FieldType::Facet(field_type), } } @@ -138,7 +138,7 @@ impl FieldEntry { | FieldType::F64(ref options) | FieldType::Date(ref options) => options.is_stored(), FieldType::Str(ref options) => options.is_stored(), - FieldType::HierarchicalFacet(ref options) => options.is_stored(), + FieldType::Facet(ref options) => options.is_stored(), FieldType::Bytes(ref options) => options.is_stored(), } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 4cd86aefa..3b1a87fe5 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -30,21 +30,60 @@ pub enum ValueParsingError { /// Contrary to FieldType, this does /// not include the way the field must be indexed. 
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[repr(u8)]
 pub enum Type {
     /// `&str`
-    Str,
+    Str = b's',
     /// `u64`
-    U64,
+    U64 = b'u',
     /// `i64`
-    I64,
+    I64 = b'i',
     /// `f64`
-    F64,
+    F64 = b'f',
     /// `date(i64) timestamp`
-    Date,
+    Date = b'd',
     /// `tantivy::schema::Facet`. Passed as a string in JSON.
-    HierarchicalFacet,
+    Facet = b'h',
     /// `Vec<u8>`
-    Bytes,
+    Bytes = b'b',
+}
+
+const ALL_TYPES: [Type; 7] = [
+    Type::Str,
+    Type::U64,
+    Type::I64,
+    Type::F64,
+    Type::Date,
+    Type::Facet,
+    Type::Bytes,
+];
+
+impl Type {
+    /// Returns an iterator over the different values
+    /// the Type enum can take.
+    pub fn iter_values() -> impl Iterator<Item = Type> {
+        ALL_TYPES.iter().cloned()
+    }
+
+    /// Returns a 1-byte code used to identify the type.
+    pub fn to_code(&self) -> u8 {
+        *self as u8
+    }
+
+    /// Interprets a 1-byte code as a type.
+    /// Returns None if the code is invalid.
+    pub fn from_code(code: u8) -> Option<Type> {
+        match code {
+            b's' => Some(Type::Str),
+            b'u' => Some(Type::U64),
+            b'i' => Some(Type::I64),
+            b'f' => Some(Type::F64),
+            b'd' => Some(Type::Date),
+            b'h' => Some(Type::Facet),
+            b'b' => Some(Type::Bytes),
+            _ => None,
+        }
+    }
 }

 /// A `FieldType` describes the type (text, u64) of a field as well as
@@ -65,7 +104,7 @@ pub enum FieldType {
     /// Signed 64-bits Date 64 field type configuration,
     Date(IntOptions),
     /// Hierarchical Facet
-    HierarchicalFacet(FacetOptions),
+    Facet(FacetOptions),
     /// Bytes (one per document)
     Bytes(BytesOptions),
 }
@@ -79,7 +118,7 @@ impl FieldType {
             FieldType::I64(_) => Type::I64,
             FieldType::F64(_) => Type::F64,
             FieldType::Date(_) => Type::Date,
-            FieldType::HierarchicalFacet(_) => Type::HierarchicalFacet,
+            FieldType::Facet(_) => Type::Facet,
             FieldType::Bytes(_) => Type::Bytes,
         }
     }
@@ -92,7 +131,7 @@ impl FieldType {
             | FieldType::I64(ref int_options)
             | FieldType::F64(ref int_options) => int_options.is_indexed(),
             FieldType::Date(ref date_options) => date_options.is_indexed(),
-            FieldType::HierarchicalFacet(ref _facet_options) => true,
+            FieldType::Facet(ref _facet_options) => true,
             FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
         }
     }
@@ -108,7 +147,7 @@ impl FieldType {
             | FieldType::I64(ref int_options)
             | FieldType::F64(ref int_options)
             | FieldType::Date(ref int_options) => int_options.fieldnorms(),
-            FieldType::HierarchicalFacet(_) => false,
+            FieldType::Facet(_) => false,
             FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
         }
     }
@@ -132,7 +171,7 @@ impl FieldType {
                     None
                 }
             }
-            FieldType::HierarchicalFacet(ref _facet_options) => Some(IndexRecordOption::Basic),
+            FieldType::Facet(ref _facet_options) => Some(IndexRecordOption::Basic),
             FieldType::Bytes(ref bytes_options) => {
                 if bytes_options.is_indexed() {
                     Some(IndexRecordOption::Basic)
@@ -165,7 +204,7 @@ impl FieldType {
             FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(
                 ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
             ),
-            FieldType::HierarchicalFacet(_) => Ok(Value::Facet(Facet::from(field_text))),
+            FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
             FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
                 ValueParsingError::InvalidBase64(format!(
                     "Expected base64 string, got {:?}",
@@ -198,7 +237,7 @@ impl FieldType {
                     Err(ValueParsingError::OverflowError(msg))
                 }
             }
-            FieldType::Str(_) | FieldType::HierarchicalFacet(_) | FieldType::Bytes(_) => {
+            FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
                 let msg = format!("Expected a string, got {:?}", json);
                Err(ValueParsingError::TypeError(msg))
            }
@@ -241,6 +280,7 @@ mod tests {

     use super::FieldType;
     use crate::schema::field_type::ValueParsingError;
     use crate::schema::TextOptions;
+    use crate::schema::Type;
     use crate::schema::Value;
     use crate::schema::{Schema, INDEXED};
     use crate::tokenizer::{PreTokenizedString, Token};
@@ -358,4 +398,13 @@ mod tests {
         assert_eq!(serialized_value_json, pre_tokenized_string_json);
     }
+
+    #[test]
+    fn test_type_codes() {
+        for type_val in Type::iter_values() {
+            let code = type_val.to_code();
+            assert_eq!(Type::from_code(code), Some(type_val));
+        }
+        assert_eq!(Type::from_code(b'z'), None);
+    }
 }
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 149eab9ee..9a05ad4c4 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -1,18 +1,20 @@
 use std::fmt;
+use std::hash::{Hash, Hasher};

 use super::Field;
-use crate::schema::Facet;
+use crate::fastfield::FastValue;
+use crate::schema::{Facet, Type};
 use crate::DateTime;
-use common;
 use std::str;

-/// Size (in bytes) of the buffer of a int field.
-const INT_TERM_LEN: usize = 4 + 8;
+/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
+/// <field> + <type byte> + <value len>
+const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;

 /// Term represents the value that the token can take.
 ///
 /// It actually wraps a `Vec<u8>`.
-#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
+#[derive(Clone)]
 pub struct Term<B = Vec<u8>>(B)
 where
     B: AsRef<[u8]>;
@@ -22,98 +24,61 @@ impl Term {
         Term(Vec::with_capacity(100))
     }

-    /// Builds a term given a field, and a i64-value
-    ///
-    /// Assuming the term has a field id of 1, and a i64 value of 3234,
-    /// the Term will have 12 bytes.
-    ///
-    /// The first four byte are dedicated to storing the field id as a u64.
-    /// The 8 following bytes are encoding the u64 value.
-    pub fn from_field_i64(field: Field, val: i64) -> Term {
-        let val_u64: u64 = common::i64_to_u64(val);
-        Term::from_field_u64(field, val_u64)
-    }
-
-    /// Builds a term given a field, and a f64-value
-    ///
-    /// Assuming the term has a field id of 1, and a f64 value of 1.5,
-    /// the Term will have 12 bytes.
-    ///
-    /// The first four byte are dedicated to storing the field id as a u64.
-    /// The 8 following bytes are encoding the f64 as a u64 value.
-    pub fn from_field_f64(field: Field, val: f64) -> Term {
-        let val_u64: u64 = common::f64_to_u64(val);
-        Term::from_field_u64(field, val_u64)
-    }
-
-    /// Builds a term given a field, and a DateTime value
-    ///
-    /// Assuming the term has a field id of 1, and a timestamp i64 value of 3234,
-    /// the Term will have 12 bytes.
-    ///
-    /// The first four byte are dedicated to storing the field id as a u64.
-    /// The 8 following bytes are encoding the DateTime as i64 timestamp value.
-    pub fn from_field_date(field: Field, val: &DateTime) -> Term {
-        let val_timestamp = val.timestamp();
-        Term::from_field_i64(field, val_timestamp)
-    }
-
-    /// Creates a `Term` given a facet.
-    pub fn from_facet(field: Field, facet: &Facet) -> Term {
-        let bytes = facet.encoded_str().as_bytes();
-        let buffer = Vec::with_capacity(4 + bytes.len());
-        let mut term = Term(buffer);
-        term.set_field(field);
-        term.set_bytes(bytes);
-        term
-    }
-
-    /// Builds a term given a field, and a string value
-    ///
-    /// Assuming the term has a field id of 2, and a text value of "abc",
-    /// the Term will have 4 bytes.
-    /// The first byte is 2, and the three following bytes are the utf-8
-    /// representation of "abc".
-    pub fn from_field_text(field: Field, text: &str) -> Term {
-        let buffer = Vec::with_capacity(4 + text.len());
-        let mut term = Term(buffer);
-        term.set_field(field);
-        term.set_text(text);
+    fn from_fast_value<T: FastValue>(field: Field, val: &T) -> Term {
+        let mut term = Term(vec![0u8; FAST_VALUE_TERM_LEN]);
+        term.set_field(T::to_type(), field);
+        term.set_u64(val.to_u64());
         term
     }

     /// Builds a term given a field, and a u64-value
-    ///
-    /// Assuming the term has a field id of 1, and a u64 value of 3234,
-    /// the Term will have 12 bytes.
-    ///
-    /// The first four byte are dedicated to storing the field id as a u64.
-    /// The 8 following bytes are encoding the u64 value.
     pub fn from_field_u64(field: Field, val: u64) -> Term {
-        let mut term = Term(vec![0u8; INT_TERM_LEN]);
-        term.set_field(field);
-        term.set_u64(val);
+        Term::from_fast_value(field, &val)
+    }
+
+    /// Builds a term given a field, and a i64-value
+    pub fn from_field_i64(field: Field, val: i64) -> Term {
+        Term::from_fast_value(field, &val)
+    }
+
+    /// Builds a term given a field, and a f64-value
+    pub fn from_field_f64(field: Field, val: f64) -> Term {
+        Term::from_fast_value(field, &val)
+    }
+
+    /// Builds a term given a field, and a DateTime value
+    pub fn from_field_date(field: Field, val: &DateTime) -> Term {
+        Term::from_fast_value(field, val)
+    }
+
+    /// Creates a `Term` given a facet.
+    pub fn from_facet(field: Field, facet: &Facet) -> Term {
+        let facet_encoded_str = facet.encoded_str();
+        Term::create_bytes_term(Type::Facet, field, facet_encoded_str.as_bytes())
+    }
+
+    /// Builds a term given a field, and a string value
+    pub fn from_field_text(field: Field, text: &str) -> Term {
+        Term::create_bytes_term(Type::Str, field, text.as_bytes())
+    }
+
+    fn create_bytes_term(typ: Type, field: Field, bytes: &[u8]) -> Term {
+        let mut term = Term(vec![0u8; 5 + bytes.len()]);
+        term.set_field(typ, field);
+        term.0.extend_from_slice(bytes);
         term
     }

     /// Builds a term bytes.
     pub fn from_field_bytes(field: Field, bytes: &[u8]) -> Term {
-        let mut term = Term::for_field(field);
-        term.set_bytes(bytes);
-        term
+        Term::create_bytes_term(Type::Bytes, field, bytes)
     }

-    /// Creates a new Term for a given field.
-    pub(crate) fn for_field(field: Field) -> Term {
-        let mut term = Term(Vec::with_capacity(100));
-        term.set_field(field);
-        term
-    }
-
-    pub(crate) fn set_field(&mut self, field: Field) {
+    pub(crate) fn set_field(&mut self, typ: Type, field: Field) {
         self.0.clear();
         self.0
             .extend_from_slice(field.field_id().to_be_bytes().as_ref());
+        self.0.push(typ.to_code());
     }

     /// Sets a u64 value in the term.
@@ -123,23 +88,33 @@ impl Term {
     /// The use of BigEndian has the benefit of preserving
     /// the natural order of the values.
     pub fn set_u64(&mut self, val: u64) {
-        self.0.resize(INT_TERM_LEN, 0u8);
+        self.set_fast_value(val);
         self.set_bytes(val.to_be_bytes().as_ref());
     }

+    fn set_fast_value<T: FastValue>(&mut self, val: T) {
+        self.0.resize(FAST_VALUE_TERM_LEN, 0u8);
+        self.set_bytes(val.to_u64().to_be_bytes().as_ref());
+    }
+
     /// Sets a `i64` value in the term.
     pub fn set_i64(&mut self, val: i64) {
-        self.set_u64(common::i64_to_u64(val));
+        self.set_fast_value(val);
+    }
+
+    /// Sets a `Date` value in the term.
+    pub fn set_date(&mut self, date: crate::DateTime) {
+        self.set_fast_value(date);
     }

     /// Sets a `f64` value in the term.
     pub fn set_f64(&mut self, val: f64) {
-        self.set_u64(common::f64_to_u64(val));
+        self.set_fast_value(val);
     }

     /// Sets the value of a `Bytes` field.
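+    /// (The buffer is truncated back to the 5-byte field-id + type-code header
+    /// before the new value bytes are appended.)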
     pub fn set_bytes(&mut self, bytes: &[u8]) {
-        self.0.resize(4, 0u8);
+        self.0.resize(5, 0u8);
         self.0.extend(bytes);
     }
@@ -149,6 +124,44 @@ impl Term {
     }
 }

+impl<B> Ord for Term<B>
+where
+    B: AsRef<[u8]>,
+{
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.as_slice().cmp(other.as_slice())
+    }
+}
+
+impl<B> PartialOrd for Term<B>
+where
+    B: AsRef<[u8]>,
+{
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl<B> PartialEq for Term<B>
+where
+    B: AsRef<[u8]>,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.as_slice() == other.as_slice()
+    }
+}
+
+impl<B> Eq for Term<B> where B: AsRef<[u8]> {}
+
+impl<B> Hash for Term<B>
+where
+    B: AsRef<[u8]>,
+{
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.0.as_ref().hash(state)
+    }
+}
+
 impl<B> Term<B>
 where
     B: AsRef<[u8]>,
 {
@@ -158,6 +171,15 @@ where
         Term(data)
     }

+    /// Return the type of the term.
+    pub fn typ(&self) -> Type {
+        assert!(
+            self.as_slice().len() >= 5,
+            "the term byte representation is too short to contain a type code"
+        );
+        Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
+    }
+
     /// Returns the field.
     pub fn field(&self) -> Field {
         let mut field_id_bytes = [0u8; 4];
@@ -167,41 +189,86 @@ where

     /// Returns the `u64` value stored in a term.
     ///
-    /// # Panics
-    /// ... or returns an invalid value
-    /// if the term is not a `u64` field.
-    pub fn get_u64(&self) -> u64 {
-        let mut field_id_bytes = [0u8; 8];
-        field_id_bytes.copy_from_slice(self.value_bytes());
-        u64::from_be_bytes(field_id_bytes)
+    /// Returns None if the term is not of the u64 type, or if the term byte representation
+    /// is invalid.
+    pub fn as_u64(&self) -> Option<u64> {
+        self.get_fast_type::<u64>()
+    }
+
+    fn get_fast_type<T: FastValue>(&self) -> Option<T> {
+        if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
+            return None;
+        }
+        let mut value_bytes = [0u8; 8];
+        value_bytes.copy_from_slice(self.value_bytes());
+        let value_u64 = u64::from_be_bytes(value_bytes);
+        Some(FastValue::from_u64(value_u64))
     }

     /// Returns the `i64` value stored in a term.
     ///
-    /// # Panics
-    /// ... or returns an invalid value
-    /// if the term is not a `i64` field.
-    pub fn get_i64(&self) -> i64 {
-        common::u64_to_i64(self.get_u64())
+    /// Returns None if the term is not of the i64 type, or if the term byte representation
+    /// is invalid.
+    pub fn as_i64(&self) -> Option<i64> {
+        self.get_fast_type::<i64>()
     }

     /// Returns the `f64` value stored in a term.
     ///
-    /// # Panics
-    /// ... or returns an invalid value
-    /// if the term is not a `f64` field.
-    pub fn get_f64(&self) -> f64 {
-        common::u64_to_f64(self.get_u64())
+    /// Returns None if the term is not of the f64 type, or if the term byte representation
+    /// is invalid.
+    pub fn as_f64(&self) -> Option<f64> {
+        self.get_fast_type::<f64>()
+    }
+
+    /// Returns the `Date` value stored in a term.
+    ///
+    /// Returns None if the term is not of the Date type, or if the term byte representation
+    /// is invalid.
+    pub fn as_date(&self) -> Option<crate::DateTime> {
+        self.get_fast_type::<crate::DateTime>()
     }

     /// Returns the text associated with the term.
     ///
-    /// # Panics
-    /// If the value is not valid utf-8. This may happen
-    /// if the index is corrupted or if you try to
-    /// call this method on a non-string type.
-    pub fn text(&self) -> &str {
-        str::from_utf8(self.value_bytes()).expect("Term does not contain valid utf-8.")
+    /// Returns None if the field is not of string type
+    /// or if the bytes are not valid utf-8.
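+    /// (The value is read starting at byte 5, past the field-id/type-code header.)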
+    pub fn as_str(&self) -> Option<&str> {
+        if self.as_slice().len() < 5 {
+            return None;
+        }
+        if self.typ() != Type::Str {
+            return None;
+        }
+        str::from_utf8(self.value_bytes()).ok()
+    }
+
+    /// Returns the facet associated with the term.
+    ///
+    /// Returns None if the field is not of facet type
+    /// or if the bytes are not valid utf-8.
+    pub fn as_facet(&self) -> Option<Facet> {
+        if self.as_slice().len() < 5 {
+            return None;
+        }
+        if self.typ() != Type::Facet {
+            return None;
+        }
+        let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
+        Some(Facet::from_encoded_string(facet_encode_str.to_string()))
+    }
+
+    /// Returns the bytes associated with the term.
+    ///
+    /// Returns None if the field is not of bytes type.
+    pub fn as_bytes(&self) -> Option<&[u8]> {
+        if self.as_slice().len() < 5 {
+            return None;
+        }
+        if self.typ() != Type::Bytes {
+            return None;
+        }
+        Some(self.value_bytes())
     }

     /// Returns the serialized value of the term.
@@ -211,32 +278,61 @@ where
     /// If the term is a u64, its value is encoded according
     /// to `byteorder::LittleEndian`.
     pub fn value_bytes(&self) -> &[u8] {
-        &self.0.as_ref()[4..]
+        &self.0.as_ref()[5..]
     }

-    /// Returns the underlying `&[u8]`
-    pub fn as_slice(&self) -> &[u8] {
+    /// Returns the underlying `&[u8]`.
+    ///
+    /// Do NOT rely on this byte representation in the index.
+    /// This value is likely to change in the future.
+    pub(crate) fn as_slice(&self) -> &[u8] {
         self.0.as_ref()
     }
 }

-impl<B> AsRef<[u8]> for Term<B>
-where
-    B: AsRef<[u8]>,
-{
-    fn as_ref(&self) -> &[u8] {
-        self.0.as_ref()
+fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) -> fmt::Result {
+    if let Some(val) = val_opt {
+        write!(f, "{:?}", val)?;
     }
+    Ok(())
 }

 impl fmt::Debug for Term {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "Term(field={},bytes={:?})",
-            self.field().field_id(),
-            self.value_bytes()
-        )
+        let field_id = self.field().field_id();
+        let typ = self.typ();
+        write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
+        match typ {
+            Type::Str => {
+                let s = str::from_utf8(self.value_bytes()).ok();
+                write_opt(f, s)?;
+            }
+            Type::U64 => {
+                write_opt(f, self.as_u64())?;
+            }
+            Type::I64 => {
+                let val_i64 = self.as_i64();
+                write_opt(f, val_i64)?;
+            }
+            Type::F64 => {
+                let val_f64 = self.as_f64();
+                write_opt(f, val_f64)?;
+            }
+            // TODO pretty print these types too.
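+            // (For now, Date falls back to the DateTime Debug output, Facet
+            // prints its path string, and Bytes prints the raw byte slice.)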
+ Type::Date => { + let val_date = self.as_date(); + write_opt(f, val_date)?; + } + Type::Facet => { + let facet = self.as_facet().map(|facet| facet.to_path_string()); + write_opt(f, facet)?; + } + Type::Bytes => { + write_opt(f, self.as_bytes())?; + } + } + write!(f, ")",)?; + Ok(()) } } @@ -246,30 +342,24 @@ mod tests { use crate::schema::*; #[test] - pub fn test_term() { + pub fn test_term_str() { let mut schema_builder = Schema::builder(); schema_builder.add_text_field("text", STRING); let title_field = schema_builder.add_text_field("title", STRING); - let count_field = schema_builder.add_text_field("count", STRING); - { - let term = Term::from_field_text(title_field, "test"); - assert_eq!(term.field(), title_field); - assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 1u8]); - assert_eq!(&term.as_slice()[4..], "test".as_bytes()); - } - { - let term = Term::from_field_u64(count_field, 983u64); - assert_eq!(term.field(), count_field); - assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 2u8]); - assert_eq!(term.as_slice().len(), 4 + 8); - assert_eq!(term.as_slice()[4], 0u8); - assert_eq!(term.as_slice()[5], 0u8); - assert_eq!(term.as_slice()[6], 0u8); - assert_eq!(term.as_slice()[7], 0u8); - assert_eq!(term.as_slice()[8], 0u8); - assert_eq!(term.as_slice()[9], 0u8); - assert_eq!(term.as_slice()[10], (933u64 / 256u64) as u8); - assert_eq!(term.as_slice()[11], (983u64 % 256u64) as u8); - } + let term = Term::from_field_text(title_field, "test"); + assert_eq!(term.field(), title_field); + assert_eq!(term.typ(), Type::Str); + assert_eq!(term.as_str(), Some("test")) + } + + #[test] + pub fn test_term_u64() { + let mut schema_builder = Schema::builder(); + let count_field = schema_builder.add_u64_field("count", INDEXED); + let term = Term::from_field_u64(count_field, 983u64); + assert_eq!(term.field(), count_field); + assert_eq!(term.typ(), Type::U64); + assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN); + assert_eq!(term.as_u64(), Some(983u64)) } } diff --git a/src/schema/value.rs b/src/schema/value.rs index add1995d4..426643d79 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -21,7 +21,7 @@ pub enum Value { F64(f64), /// Signed 64-bits Date time stamp `date` Date(DateTime), - /// Hierarchical Facet + /// Facet Facet(Facet), /// Arbitrarily sized byte array Bytes(Vec), diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 1f494daf2..2dc98ab39 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -243,10 +243,15 @@ impl SnippetGenerator { if term.field() != field { continue; } + let term_str = if let Some(term_str) = term.as_str() { + term_str + } else { + continue; + }; let doc_freq = searcher.doc_freq(&term)?; if doc_freq > 0 { let score = 1.0 / (1.0 + doc_freq as Score); - terms_text.insert(term.text().to_string(), score); + terms_text.insert(term_str.to_string(), score); } } let tokenizer = searcher.index().tokenizer_for_field(field)?; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index aa385cbc1..95178acd1 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -157,8 +157,8 @@ pub use self::tokenizer_manager::TokenizerManager; /// /// Tokenizer are in charge of not emitting tokens larger than this value. /// Currently, if a faulty tokenizer implementation emits tokens with a length larger than -/// `2^16 - 1 - 4`, the token will simply be ignored downstream. -pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4; +/// `2^16 - 1 - 5`, the token will simply be ignored downstream. 
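+/// (The 5 reserved bytes are the 4-byte field id plus the 1-byte type code that
+/// now prefix every serialized term.)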
+pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 5; #[cfg(test)] pub mod tests {
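As a quick illustration of the typed `Term` API introduced by this patch — a minimal sketch, not part of the diff itself, assuming only the accessors added above (`typ`, `as_str`, `as_u64`) and the usual schema-builder API; the field ids simply reflect insertion order in the schema:

```rust
use tantivy::schema::{Schema, Type, INDEXED, STRING};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", STRING); // field 0
    let count = schema_builder.add_u64_field("count", INDEXED); // field 1

    // A term is now [field id: 4 bytes][type code: 1 byte][value bytes].
    let title_term = Term::from_field_text(title, "hello");
    assert_eq!(title_term.typ(), Type::Str);
    assert_eq!(title_term.as_str(), Some("hello"));
    // Typed accessors are checked: asking for the wrong type yields None
    // instead of reinterpreting raw bytes.
    assert_eq!(title_term.as_u64(), None);

    let count_term = Term::from_field_u64(count, 983u64);
    assert_eq!(count_term.typ(), Type::U64);
    assert_eq!(count_term.as_u64(), Some(983u64));

    // The Debug output is now human-readable; this is exactly the format the
    // updated query-parser tests above assert against.
    assert_eq!(
        format!("{:?}", title_term),
        r#"Term(type=Str, field=0, val="hello")"#
    );
}
```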