add ip field

This commit is contained in:
Pascal Seitz
2022-08-04 08:48:21 +02:00
parent f4d7621370
commit 4a1b251a08
13 changed files with 223 additions and 14 deletions

View File

@@ -97,6 +97,7 @@ impl FastFieldsWriter {
bytes_value_writers.push(fast_field_writer);
}
}
FieldType::Ip(opt) => if opt.is_fast() {},
FieldType::Str(_) | FieldType::JsonObject(_) => {}
}
}

View File

@@ -321,6 +321,12 @@ impl IndexMerger {
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}
}
FieldType::Ip(options) => {
if options.is_fast() {
// TODO create fast field for merge
}
}
FieldType::JsonObject(_) | FieldType::Facet(_) | FieldType::Str(_) => {
// We don't handle json fast field for the moment
// They can be implemented using what is done

View File

@@ -294,6 +294,13 @@ impl SegmentWriter {
ctx,
)?;
}
FieldType::Ip(_) => {
for value in values {
let ip_val = value.as_ip().ok_or_else(make_schema_error)?;
term_buffer.set_text(&ip_val.to_string());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
}
}
}
Ok(())

View File

@@ -50,6 +50,7 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
| FieldType::Bool(_)
| FieldType::Date(_)
| FieldType::Bytes(_)
| FieldType::Ip(_)
| FieldType::Facet(_) => Box::new(SpecializedPostingsWriter::<NothingRecorder>::default()),
FieldType::JsonObject(ref json_object_options) => {
if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() {

View File

@@ -89,6 +89,7 @@ pub(crate) fn serialize_postings(
| FieldType::Bool(_) => {}
FieldType::Bytes(_) => {}
FieldType::JsonObject(_) => {}
FieldType::Ip(_) => {} // TODO check
}
let postings_writer = per_field_postings_writers.get_for_field(field);

View File

@@ -400,6 +400,7 @@ impl QueryParser {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
Ok(Term::from_field_bytes(field, &bytes))
}
FieldType::Ip(_) => Ok(Term::from_field_text(field, phrase)),
}
}
@@ -506,6 +507,13 @@ impl QueryParser {
let bytes_term = Term::from_field_bytes(field, &bytes);
Ok(vec![LogicalLiteral::Term(bytes_term)])
}
FieldType::Ip(ref option) => {
if !option.is_indexed() {
return Err(QueryParserError::FieldNotIndexed(field_name.to_string()));
}
let text_term = Term::from_field_text(field, phrase);
Ok(vec![LogicalLiteral::Term(text_term)])
}
}
}

View File

@@ -6,6 +6,8 @@ use crate::schema::{
TextOptions,
};
use super::ip_options::IpOptions;
/// A `FieldEntry` represents a field and its configuration.
/// `Schema` are a collection of `FieldEntry`
///
@@ -60,6 +62,11 @@ impl FieldEntry {
Self::new(field_name, FieldType::Date(date_options))
}
/// Creates a new ip field entry.
///
/// The entry is named `field_name` and carries `ip_options` wrapped in
/// `FieldType::Ip`.
pub fn new_ip(field_name: String, ip_options: IpOptions) -> FieldEntry {
    let field_type = FieldType::Ip(ip_options);
    Self::new(field_name, field_type)
}
/// Creates a field entry for a facet.
pub fn new_facet(field_name: String, facet_options: FacetOptions) -> FieldEntry {
Self::new(field_name, FieldType::Facet(facet_options))
@@ -114,6 +121,7 @@ impl FieldEntry {
FieldType::Facet(ref options) => options.is_stored(),
FieldType::Bytes(ref options) => options.is_stored(),
FieldType::JsonObject(ref options) => options.is_stored(),
FieldType::Ip(ref options) => options.is_stored(),
}
}
}

View File

@@ -1,3 +1,6 @@
use std::net::IpAddr;
use std::str::FromStr;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use thiserror::Error;
@@ -13,6 +16,8 @@ use crate::time::OffsetDateTime;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
use super::ip_options::IpOptions;
/// Possible error that may occur while parsing a field value
/// At this point the JSON is known to be valid.
#[derive(Debug, PartialEq, Error)]
@@ -61,9 +66,11 @@ pub enum Type {
Bytes = b'b',
/// Leaf in a Json object.
Json = b'j',
/// IpAddr
Ip = b'p',
}
const ALL_TYPES: [Type; 9] = [
const ALL_TYPES: [Type; 10] = [
Type::Str,
Type::U64,
Type::I64,
@@ -73,6 +80,7 @@ const ALL_TYPES: [Type; 9] = [
Type::Facet,
Type::Bytes,
Type::Json,
Type::Ip,
];
impl Type {
@@ -99,6 +107,7 @@ impl Type {
Type::Facet => "Facet",
Type::Bytes => "Bytes",
Type::Json => "Json",
Type::Ip => "Ip",
}
}
@@ -115,6 +124,7 @@ impl Type {
b'h' => Some(Type::Facet),
b'b' => Some(Type::Bytes),
b'j' => Some(Type::Json),
b'p' => Some(Type::Ip),
_ => None,
}
}
@@ -145,6 +155,8 @@ pub enum FieldType {
Bytes(BytesOptions),
/// Json object
JsonObject(JsonObjectOptions),
/// IpAddr field
Ip(IpOptions),
}
impl FieldType {
@@ -160,6 +172,7 @@ impl FieldType {
FieldType::Facet(_) => Type::Facet,
FieldType::Bytes(_) => Type::Bytes,
FieldType::JsonObject(_) => Type::Json,
FieldType::Ip(_) => Type::Ip,
}
}
@@ -175,6 +188,7 @@ impl FieldType {
FieldType::Facet(ref _facet_options) => true,
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
FieldType::Ip(ref ip_options) => ip_options.is_indexed(),
}
}
@@ -209,6 +223,7 @@ impl FieldType {
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => int_options.is_fast(),
FieldType::Date(ref date_options) => date_options.is_fast(),
FieldType::Ip(ref options) => options.is_fast(),
FieldType::Facet(_) => true,
FieldType::JsonObject(_) => false,
}
@@ -229,6 +244,7 @@ impl FieldType {
FieldType::Facet(_) => false,
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
FieldType::JsonObject(ref _json_object_options) => false,
FieldType::Ip(_) => false,
}
}
@@ -273,6 +289,13 @@ impl FieldType {
FieldType::JsonObject(ref json_obj_options) => json_obj_options
.get_text_indexing_options()
.map(TextFieldIndexing::index_option),
FieldType::Ip(ref ip_options) => {
if ip_options.is_indexed() {
Some(IndexRecordOption::Basic)
} else {
None
}
}
}
}
@@ -312,6 +335,14 @@ impl FieldType {
expected: "a json object",
json: JsonValue::String(field_text),
}),
FieldType::Ip(_) => {
Ok(Value::Ip(IpAddr::from_str(&field_text).map_err(|err| {
ValueParsingError::ParseError {
error: err.to_string(),
json: JsonValue::String(field_text),
}
})?))
}
}
}
JsonValue::Number(field_val_num) => match self {
@@ -359,6 +390,10 @@ impl FieldType {
expected: "a json object",
json: JsonValue::Number(field_val_num),
}),
FieldType::Ip(_) => Err(ValueParsingError::TypeError {
expected: "a string with an ip addr",
json: JsonValue::Number(field_val_num),
}),
},
JsonValue::Object(json_map) => match self {
FieldType::Str(_) => {

69
src/schema/ip_options.rs Normal file
View File

@@ -0,0 +1,69 @@
use serde::{Deserialize, Serialize};
/// Expresses whether a field is single-valued or multi-valued.
///
/// `IpOptions` holds an `Option<Cardinality>` to describe its fast field setting.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum Cardinality {
/// The document must have exactly one value associated to the document.
/// Serialized by serde under the name `single`.
#[serde(rename = "single")]
SingleValue,
/// The document can have any number of values associated to the document.
/// This is more memory and CPU expensive than the `SingleValue` solution.
/// Serialized by serde under the name `multi`.
#[serde(rename = "multi")]
MultiValues,
}
/// Define how an ip field should be handled by tantivy.
///
/// Every flag is off in the derived `Default` value (`indexed` and `stored`
/// are `false`, `fast` is `None`); the `set_*` builder methods switch them on.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct IpOptions {
// Reported by `is_indexed`; switched on by `set_indexed`.
indexed: bool,
// `Some(cardinality)` when the field is a fast field, `None` otherwise (see `is_fast`).
fast: Option<Cardinality>,
// Reported by `is_stored`; switched on by `set_stored`.
stored: bool,
}
impl IpOptions {
    /// Returns `true` iff the ip field is a fast field.
    pub fn is_fast(&self) -> bool {
        self.fast.is_some()
    }

    /// Marks the ip field as a fast field with the given cardinality.
    ///
    /// Fast fields are designed for random access.
    /// Access times are similar to a random lookup in an array.
    ///
    /// NOTE(review): the previous comment stated that, for a single-valued
    /// fast field, only the last of several values is kept. That behaviour
    /// is implemented outside of this type; confirm before relying on it.
    #[must_use]
    pub fn set_fast(self, cardinality: Cardinality) -> Self {
        Self {
            fast: Some(cardinality),
            ..self
        }
    }

    /// Returns `true` if the ip field should be stored.
    pub fn is_stored(&self) -> bool {
        self.stored
    }

    /// Returns `true` iff the ip field should be indexed.
    pub fn is_indexed(&self) -> bool {
        self.indexed
    }

    /// Marks the ip field as indexed.
    ///
    /// This is required for the field to be searchable.
    #[must_use]
    pub fn set_indexed(self) -> Self {
        Self {
            indexed: true,
            ..self
        }
    }

    /// Marks the ip field as stored.
    #[must_use]
    pub fn set_stored(self) -> Self {
        Self {
            stored: true,
            ..self
        }
    }
}

View File

@@ -121,6 +121,7 @@ mod date_time_options;
mod field;
mod flags;
mod index_record_option;
mod ip_options;
mod json_object_options;
mod named_field_document;
mod numeric_options;

View File

@@ -7,6 +7,7 @@ use serde::ser::SerializeSeq;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::{self, Value as JsonValue};
use super::ip_options::IpOptions;
use super::*;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::field_type::ValueParsingError;
@@ -144,6 +145,28 @@ impl SchemaBuilder {
self.add_field(field_entry)
}
/// Adds an ip field.
/// Returns the associated field handle.
///
/// `field_options` is converted into [`IpOptions`] and registered under
/// `field_name_str` through `add_field`.
///
/// # Caution
///
/// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_ip_field<T: Into<IpOptions>>(
    &mut self,
    field_name_str: &str,
    field_options: T,
) -> Field {
    let entry = FieldEntry::new_ip(field_name_str.to_owned(), field_options.into());
    self.add_field(entry)
}
/// Adds a new text field.
/// Returns the associated field handle
///
@@ -367,7 +390,9 @@ impl Schema {
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
where
S: Serializer,
{
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
for e in &self.0.fields {
seq.serialize_element(e)?;
@@ -378,7 +403,9 @@ impl Serialize for Schema {
impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
where
D: Deserializer<'de>,
{
struct SchemaVisitor;
impl<'de> Visitor<'de> for SchemaVisitor {
@@ -389,7 +416,9 @@ impl<'de> Deserialize<'de> for Schema {
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de> {
where
A: SeqAccess<'de>,
{
let mut schema = SchemaBuilder {
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
fields_map: HashMap::with_capacity(seq.size_hint().unwrap_or(0)),

View File

@@ -34,7 +34,8 @@ pub const JSON_END_OF_PATH: u8 = 0u8;
/// It actually wraps a `Vec<u8>`.
#[derive(Clone)]
pub struct Term<B = Vec<u8>>(B)
where B: AsRef<[u8]>;
where
B: AsRef<[u8]>;
impl AsMut<Vec<u8>> for Term {
fn as_mut(&mut self) -> &mut Vec<u8> {
@@ -174,7 +175,8 @@ impl Term {
}
impl<B> Ord for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.as_slice().cmp(other.as_slice())
@@ -182,7 +184,8 @@ where B: AsRef<[u8]>
}
impl<B> PartialOrd for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
@@ -190,7 +193,8 @@ where B: AsRef<[u8]>
}
impl<B> PartialEq for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn eq(&self, other: &Self) -> bool {
self.as_slice() == other.as_slice()
@@ -200,7 +204,8 @@ where B: AsRef<[u8]>
impl<B> Eq for Term<B> where B: AsRef<[u8]> {}
impl<B> Hash for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.0.as_ref().hash(state)
@@ -208,7 +213,8 @@ where B: AsRef<[u8]>
}
impl<B> Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
/// Wraps a object holding bytes
pub fn wrap(data: B) -> Term<B> {
@@ -415,12 +421,17 @@ fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Re
debug_value_bytes(typ, bytes, f)?;
}
}
Type::Ip => {
let s = as_str(bytes); // TODO: change when serialization changes
write_opt(f, s)?;
}
}
Ok(())
}
impl<B> fmt::Debug for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let field_id = self.field().field_id();

View File

@@ -1,4 +1,5 @@
use std::fmt;
use std::net::IpAddr;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -32,13 +33,17 @@ pub enum Value {
Bytes(Vec<u8>),
/// Json object value.
JsonObject(serde_json::Map<String, serde_json::Value>),
/// Ip
Ip(IpAddr),
}
impl Eq for Value {}
impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
where
S: Serializer,
{
match *self {
Value::Str(ref v) => serializer.serialize_str(v),
Value::PreTokStr(ref v) => v.serialize(serializer),
@@ -50,13 +55,16 @@ impl Serialize for Value {
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
Value::JsonObject(ref obj) => obj.serialize(serializer),
Value::Ip(ref obj) => obj.serialize(serializer), // TODO check serialization
}
}
}
impl<'de> Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de> {
where
D: Deserializer<'de>,
{
struct ValueVisitor;
impl<'de> Visitor<'de> for ValueVisitor {
@@ -201,6 +209,16 @@ impl Value {
None
}
}
/// Returns the wrapped ip address when the value is of the `Ip` type.
/// (Returns `None` for every other variant.)
pub fn as_ip(&self) -> Option<IpAddr> {
    match self {
        Value::Ip(ip_addr) => Some(*ip_addr),
        _ => None,
    }
}
}
impl From<String> for Value {
@@ -287,7 +305,9 @@ impl From<serde_json::Value> for Value {
}
mod binary_serialize {
use std::io::{self, Read, Write};
use std::io::{self, ErrorKind, Read, Write};
use std::net::IpAddr;
use std::str::FromStr;
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
@@ -306,6 +326,7 @@ mod binary_serialize {
const EXT_CODE: u8 = 7;
const JSON_OBJ_CODE: u8 = 8;
const BOOL_CODE: u8 = 9;
const IP_CODE: u8 = 10;
// extended types
@@ -366,6 +387,10 @@ mod binary_serialize {
serde_json::to_writer(writer, &map)?;
Ok(())
}
Value::Ip(ref ip) => {
IP_CODE.serialize(writer)?;
ip.to_string().serialize(writer) // TODO Check best format
}
}
}
@@ -436,6 +461,13 @@ mod binary_serialize {
let json_map = <serde_json::Map::<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
Ok(Value::JsonObject(json_map))
}
IP_CODE => {
let text = String::deserialize(reader)?;
Ok(Value::Ip(IpAddr::from_str(&text).map_err(|err| {
io::Error::new(ErrorKind::Other, err.to_string())
})?))
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}", type_code),