mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Minor mini fixes
This commit is contained in:
@@ -133,7 +133,7 @@ pub(super) trait SymbolValue: Clone + Copy {
|
||||
|
||||
impl SymbolValue for bool {
|
||||
fn serialize(self, buffer: &mut [u8]) -> u8 {
|
||||
buffer[0] = if self { 1u8 } else { 0u8 };
|
||||
buffer[0] = u8::from(self);
|
||||
1u8
|
||||
}
|
||||
|
||||
|
||||
@@ -9,18 +9,18 @@ use crate::{Cardinality, DocId, NumericalType, NumericalValue};
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
#[repr(u8)]
|
||||
enum DocumentStep {
|
||||
SameDoc = 0,
|
||||
NextDoc = 1,
|
||||
SkippedDoc = 2,
|
||||
Same = 0,
|
||||
Next = 1,
|
||||
Skipped = 2,
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn delta_with_last_doc(last_doc_opt: Option<u32>, doc: u32) -> DocumentStep {
|
||||
let expected_next_doc = last_doc_opt.map(|last_doc| last_doc + 1).unwrap_or(0u32);
|
||||
match doc.cmp(&expected_next_doc) {
|
||||
Ordering::Less => DocumentStep::SameDoc,
|
||||
Ordering::Equal => DocumentStep::NextDoc,
|
||||
Ordering::Greater => DocumentStep::SkippedDoc,
|
||||
Ordering::Less => DocumentStep::Same,
|
||||
Ordering::Equal => DocumentStep::Next,
|
||||
Ordering::Greater => DocumentStep::Skipped,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,15 +56,15 @@ impl ColumnWriter {
|
||||
pub(super) fn record<S: SymbolValue>(&mut self, doc: DocId, value: S, arena: &mut MemoryArena) {
|
||||
// Difference between `doc` and the last doc.
|
||||
match delta_with_last_doc(self.last_doc_opt, doc) {
|
||||
DocumentStep::SameDoc => {
|
||||
DocumentStep::Same => {
|
||||
// This is the last encountered document.
|
||||
self.cardinality = Cardinality::Multivalued;
|
||||
}
|
||||
DocumentStep::NextDoc => {
|
||||
DocumentStep::Next => {
|
||||
self.last_doc_opt = Some(doc);
|
||||
self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena);
|
||||
}
|
||||
DocumentStep::SkippedDoc => {
|
||||
DocumentStep::Skipped => {
|
||||
self.cardinality = self.cardinality.max(Cardinality::Optional);
|
||||
self.last_doc_opt = Some(doc);
|
||||
self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena);
|
||||
@@ -79,8 +79,8 @@ impl ColumnWriter {
|
||||
// at the end of the column.
|
||||
pub(crate) fn get_cardinality(&self, num_docs: DocId) -> Cardinality {
|
||||
match delta_with_last_doc(self.last_doc_opt, num_docs) {
|
||||
DocumentStep::SameDoc | DocumentStep::NextDoc => self.cardinality,
|
||||
DocumentStep::SkippedDoc => self.cardinality.max(Cardinality::Optional),
|
||||
DocumentStep::Same | DocumentStep::Next => self.cardinality,
|
||||
DocumentStep::Skipped => self.cardinality.max(Cardinality::Optional),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -215,20 +215,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_delta_with_last_doc() {
|
||||
assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::NextDoc);
|
||||
assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::SkippedDoc);
|
||||
assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::SkippedDoc);
|
||||
assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::SameDoc);
|
||||
assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::SameDoc);
|
||||
assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::NextDoc);
|
||||
assert_eq!(
|
||||
delta_with_last_doc(Some(1u32), 3u32),
|
||||
DocumentStep::SkippedDoc
|
||||
);
|
||||
assert_eq!(
|
||||
delta_with_last_doc(Some(1u32), 4u32),
|
||||
DocumentStep::SkippedDoc
|
||||
);
|
||||
assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::Next);
|
||||
assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::Skipped);
|
||||
assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::Skipped);
|
||||
assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::Same);
|
||||
assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::Same);
|
||||
assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::Next);
|
||||
assert_eq!(delta_with_last_doc(Some(1u32), 3u32), DocumentStep::Skipped);
|
||||
assert_eq!(delta_with_last_doc(Some(1u32), 4u32), DocumentStep::Skipped);
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
|
||||
@@ -34,15 +34,14 @@ struct SpareBuffers {
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy_columnar::ColumnarWriter;
|
||||
/// fn main() {
|
||||
/// let mut columnar_writer = ColumnarWriter::default();
|
||||
/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64);
|
||||
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
||||
/// let mut wrt: Vec<u8> = Vec::new();
|
||||
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
|
||||
/// }
|
||||
///
|
||||
/// let mut columnar_writer = ColumnarWriter::default();
|
||||
/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64);
|
||||
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
||||
/// let mut wrt: Vec<u8> = Vec::new();
|
||||
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
|
||||
/// ```
|
||||
pub struct ColumnarWriter {
|
||||
numerical_field_hash_map: ArenaHashMap,
|
||||
|
||||
@@ -15,10 +15,10 @@ pub struct ColumnarSerializer<W: io::Write> {
|
||||
|
||||
/// Returns a key consisting of the concatenation of the key and the column_type_and_cardinality
|
||||
/// code.
|
||||
fn prepare_key<'a>(
|
||||
fn prepare_key(
|
||||
key: &[u8],
|
||||
column_type_cardinality: ColumnTypeAndCardinality,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(key);
|
||||
|
||||
@@ -3,6 +3,8 @@ use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::{FromStr, ParseBoolError};
|
||||
|
||||
use base64::engine::general_purpose::STANDARD as BASE64;
|
||||
use base64::Engine;
|
||||
use rustc_hash::FxHashMap;
|
||||
use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
|
||||
@@ -403,7 +405,9 @@ impl QueryParser {
|
||||
Err(e) => Err(QueryParserError::from(e)),
|
||||
},
|
||||
FieldType::Bytes(_) => {
|
||||
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
|
||||
let bytes = BASE64
|
||||
.decode(phrase)
|
||||
.map_err(QueryParserError::ExpectedBase64)?;
|
||||
Ok(Term::from_field_bytes(field, &bytes))
|
||||
}
|
||||
FieldType::IpAddr(_) => {
|
||||
@@ -498,7 +502,9 @@ impl QueryParser {
|
||||
Err(e) => Err(QueryParserError::from(e)),
|
||||
},
|
||||
FieldType::Bytes(_) => {
|
||||
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
|
||||
let bytes = BASE64
|
||||
.decode(phrase)
|
||||
.map_err(QueryParserError::ExpectedBase64)?;
|
||||
let bytes_term = Term::from_field_bytes(field, &bytes);
|
||||
Ok(vec![LogicalLiteral::Term(bytes_term)])
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::net::IpAddr;
|
||||
use std::str::FromStr;
|
||||
|
||||
use base64::engine::general_purpose::STANDARD as BASE64;
|
||||
use base64::Engine;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
use thiserror::Error;
|
||||
@@ -358,7 +360,8 @@ impl FieldType {
|
||||
json: JsonValue::String(field_text),
|
||||
}),
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(&field_text)
|
||||
FieldType::Bytes(_) => BASE64
|
||||
.decode(&field_text)
|
||||
.map(Value::Bytes)
|
||||
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::fmt;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use base64::engine::general_purpose::STANDARD as BASE64;
|
||||
use base64::Engine;
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_json::Map;
|
||||
@@ -51,7 +53,7 @@ impl Serialize for Value {
|
||||
Value::Bool(b) => serializer.serialize_bool(b),
|
||||
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
|
||||
Value::Facet(ref facet) => facet.serialize(serializer),
|
||||
Value::Bytes(ref bytes) => serializer.serialize_str(&base64::encode(bytes)),
|
||||
Value::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)),
|
||||
Value::JsonObject(ref obj) => obj.serialize(serializer),
|
||||
Value::IpAddr(ref obj) => {
|
||||
// Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback.
|
||||
|
||||
@@ -126,6 +126,7 @@ mod facet_tokenizer;
|
||||
mod lower_caser;
|
||||
mod ngram_tokenizer;
|
||||
mod raw_tokenizer;
|
||||
mod regex_tokenizer;
|
||||
mod remove_long;
|
||||
mod simple_tokenizer;
|
||||
mod split_compound_words;
|
||||
@@ -135,7 +136,6 @@ mod tokenized_string;
|
||||
mod tokenizer;
|
||||
mod tokenizer_manager;
|
||||
mod whitespace_tokenizer;
|
||||
mod regex_tokenizer;
|
||||
|
||||
pub use tokenizer_api::{
|
||||
BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
|
||||
@@ -147,6 +147,7 @@ pub use self::facet_tokenizer::FacetTokenizer;
|
||||
pub use self::lower_caser::LowerCaser;
|
||||
pub use self::ngram_tokenizer::NgramTokenizer;
|
||||
pub use self::raw_tokenizer::RawTokenizer;
|
||||
pub use self::regex_tokenizer::RegexTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::split_compound_words::SplitCompoundWords;
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
use regex::Regex;
|
||||
use crate::TantivyError;
|
||||
|
||||
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Tokenize the text by using a regex pattern to split.
|
||||
/// Each match of the regex emits a distinct token, empty tokens will not be emitted. Anchors such
|
||||
/// as `\A` will match the text from the part where the last token was emitted or the beginning of
|
||||
/// the complete text if no token was emitted yet.
|
||||
///
|
||||
/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as followed:
|
||||
/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as
|
||||
/// follows:
|
||||
///
|
||||
/// | Term | aaa | ccc | ddd |
|
||||
/// |----------|------|--------|-------|
|
||||
@@ -21,7 +22,7 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'");
|
||||
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
|
||||
/// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'");
|
||||
/// {
|
||||
/// let token = stream.next().unwrap();
|
||||
@@ -46,10 +47,11 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RegexTokenizer {
|
||||
regex: Regex
|
||||
regex: Regex,
|
||||
}
|
||||
|
||||
impl RegexTokenizer {
|
||||
/// Creates a new RegexTokenizer.
|
||||
pub fn new(regex_pattern: &str) -> crate::Result<RegexTokenizer> {
|
||||
Regex::new(regex_pattern)
|
||||
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned()))
|
||||
@@ -63,6 +65,7 @@ impl Tokenizer for RegexTokenizer {
|
||||
regex: self.regex.clone(),
|
||||
text,
|
||||
token: Token::default(),
|
||||
cursor: 0,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -71,25 +74,28 @@ pub struct RegexTokenStream<'a> {
|
||||
regex: Regex,
|
||||
text: &'a str,
|
||||
token: Token,
|
||||
cursor: usize,
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for RegexTokenStream<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if let Some(m) = self.regex.find(self.text) {
|
||||
if !m.as_str().is_empty() {
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(&self.text[m.start()..m.end()]);
|
||||
|
||||
self.token.offset_from = self.token.offset_to + m.start();
|
||||
self.token.offset_to = self.token.offset_to + m.end();
|
||||
|
||||
self.token.position = self.token.position.wrapping_add(1);
|
||||
|
||||
self.text = &self.text[m.end()..];
|
||||
return true
|
||||
}
|
||||
let Some(regex_match) = self.regex.find(self.text) else {
|
||||
return false;
|
||||
};
|
||||
if regex_match.as_str().is_empty() {
|
||||
return false;
|
||||
}
|
||||
false
|
||||
self.token.text.clear();
|
||||
self.token.text.push_str(regex_match.as_str());
|
||||
|
||||
self.token.offset_from = self.cursor + regex_match.start();
|
||||
self.cursor += regex_match.end();
|
||||
self.token.offset_to = self.cursor;
|
||||
|
||||
self.token.position = self.token.position.wrapping_add(1);
|
||||
|
||||
self.text = &self.text[regex_match.end()..];
|
||||
true
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
@@ -103,10 +109,10 @@ impl<'a> TokenStream for RegexTokenStream<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tokenizer::regex_tokenizer::RegexTokenizer;
|
||||
use crate::tokenizer::tests::assert_token;
|
||||
use crate::tokenizer::{TextAnalyzer, Token};
|
||||
use crate::tokenizer::regex_tokenizer::RegexTokenizer;
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_regex_tokenizer() {
|
||||
let tokens = token_stream_helper("'aaa' bbb 'ccc' 'ddd'", r"'(?:\w*)'");
|
||||
@@ -132,7 +138,10 @@ mod tests {
|
||||
fn test_regexp_tokenizer_error_on_invalid_regex() {
|
||||
let tokenizer = RegexTokenizer::new(r"\@");
|
||||
assert_eq!(tokenizer.is_err(), true);
|
||||
assert_eq!(tokenizer.err().unwrap().to_string(), "An invalid argument was passed: '\\@'");
|
||||
assert_eq!(
|
||||
tokenizer.err().unwrap().to_string(),
|
||||
"An invalid argument was passed: '\\@'"
|
||||
);
|
||||
}
|
||||
|
||||
fn token_stream_helper(text: &str, pattern: &str) -> Vec<Token> {
|
||||
@@ -146,4 +155,4 @@ mod tests {
|
||||
token_stream.process(&mut add_token);
|
||||
tokens
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user