diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 7a0d9cb98..2ff099d2b 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1656,13 +1656,15 @@ mod tests { let old_reader = index.reader()?; + let ip_exists = |id| id % 3 != 0; // 0 does not exist + for &op in ops { match op { IndexingOp::AddDoc { id } => { let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); let ip_from_id = Ipv6Addr::from_u128(id as u128); - if id % 3 == 0 { + if !ip_exists(id) { // every 3rd doc has no ip field index_writer.add_document(doc!(id_field=>id, bytes_field => id.to_le_bytes().as_slice(), @@ -1803,7 +1805,7 @@ mod tests { let expected_ips = expected_ids_and_num_occurrences .keys() .flat_map(|id| { - if id % 3 == 0 { + if !ip_exists(*id) { None } else { Some(Ipv6Addr::from_u128(*id as u128)) @@ -1815,7 +1817,7 @@ mod tests { let expected_ips = expected_ids_and_num_occurrences .keys() .filter_map(|id| { - if id % 3 == 0 { + if !ip_exists(*id) { None } else { Some(Ipv6Addr::from_u128(*id as u128)) @@ -1918,7 +1920,8 @@ mod tests { top_docs.iter().map(|el| el.1).collect::>() }; - for (existing_id, count) in expected_ids_and_num_occurrences { + for (existing_id, count) in &expected_ids_and_num_occurrences { + let (existing_id, count) = (*existing_id, *count); let assert_field = |field| do_search(&existing_id.to_string(), field).len() as u64; assert_eq!(assert_field(text_field), count); assert_eq!(assert_field(i64_field), count); @@ -1954,6 +1957,26 @@ mod tests { Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64)); assert_eq!(do_search2(term).len() as u64, 0); } + // search ip address + // + for (existing_id, count) in &expected_ids_and_num_occurrences { + let (existing_id, count) = (*existing_id, *count); + if !ip_exists(existing_id) { + continue; + } + let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; + let ip_addr = Ipv6Addr::from_u128(existing_id as u128); + // 
Test incoming ip as ipv6 + assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count); + + let term = Term::from_field_ip_addr(ip_field, ip_addr); + assert_eq!(do_search2(term).len() as u64, count); + + // Test incoming ip as ipv4 + if let Some(ip_addr) = ip_addr.to_ipv4_mapped() { + assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count); + } + } // test facets for segment_reader in searcher.segment_readers().iter() { let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 29e9071b7..724c70e23 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -320,7 +320,18 @@ impl SegmentWriter { ctx, )?; } - FieldType::IpAddr(_) => {} + FieldType::IpAddr(_) => { + let mut num_vals = 0; + for value in values { + num_vals += 1; + let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?; + term_buffer.set_ip_addr(ip_addr); + postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx); + } + if field_entry.has_fieldnorms() { + self.fieldnorms_writer.record(doc_id, field, num_vals); + } + } } } Ok(()) diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 2dd2606e0..ad9a8b2ba 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -212,12 +212,12 @@ pub fn block_wand( } /// Specialized version of [`block_wand`] for a single scorer. -/// In this case, the algorithm is simple and readable and faster (~ x3) +/// In this case, the algorithm is simple, readable and faster (~ x3) /// than the generic algorithm. /// The algorithm behaves as follows: /// - While we don't hit the end of the docset: /// - While the block max score is under the `threshold`, go to the next block. 
-/// - On a block, advance until the end and execute `callback`` when the doc score is greater or +/// - On a block, advance until the end and execute `callback` when the doc score is greater or /// equal to the `threshold`. pub fn block_wand_single_scorer( mut scorer: TermScorer, diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 729ac10ce..6fcd9df68 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::net::{AddrParseError, IpAddr}; use std::num::{ParseFloatError, ParseIntError}; use std::ops::Bound; use std::str::{FromStr, ParseBoolError}; @@ -15,7 +16,7 @@ use crate::query::{ TermQuery, }; use crate::schema::{ - Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type, + Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type, }; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; @@ -84,6 +85,9 @@ pub enum QueryParserError { /// The format for the facet field is invalid. #[error("The facet field is malformed: {0}")] FacetFormatError(#[from] FacetParseError), + /// The format for the ip field is invalid. 
+ #[error("The ip field is malformed: {0}")] + IpFormatError(#[from] AddrParseError), } /// Recursively remove empty clause from the AST @@ -401,7 +405,7 @@ impl QueryParser { Ok(Term::from_field_bytes(field, &bytes)) } FieldType::IpAddr(_) => Err(QueryParserError::UnsupportedQuery( - "Range query are not supported on IpAddr field.".to_string(), + "Range query are not supported on ip field.".to_string(), )), } } @@ -509,7 +513,11 @@ impl QueryParser { let bytes_term = Term::from_field_bytes(field, &bytes); Ok(vec![LogicalLiteral::Term(bytes_term)]) } - FieldType::IpAddr(_) => Err(QueryParserError::FieldNotIndexed(field_name.to_string())), + FieldType::IpAddr(_) => { + let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr(); + let term = Term::from_field_ip_addr(field, ip_v6); + Ok(vec![LogicalLiteral::Term(term)]) + } } } diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index ee789ac5b..5d848d6c3 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -124,3 +124,70 @@ impl Query for TermQuery { visitor(&self.term, false); } } + +#[cfg(test)] +mod tests { + use std::net::{IpAddr, Ipv6Addr}; + use std::str::FromStr; + + use fastfield_codecs::MonotonicallyMappableToU128; + + use crate::collector::{Count, TopDocs}; + use crate::query::{Query, QueryParser, TermQuery}; + use crate::schema::{IndexRecordOption, IntoIpv6Addr, Schema, INDEXED, STORED}; + use crate::{doc, Index, Term}; + + #[test] + fn search_ip_test() { + let mut schema_builder = Schema::builder(); + let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let ip_addr_1 = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr(); + let ip_addr_2 = Ipv6Addr::from_u128(10); + + { + let mut index_writer = index.writer(3_000_000).unwrap(); + index_writer + .add_document(doc!( + ip_field => ip_addr_1 + )) + .unwrap(); + index_writer + 
.add_document(doc!( + ip_field => ip_addr_2 + )) + .unwrap(); + + index_writer.commit().unwrap(); + } + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let assert_single_hit = |query| { + let (_top_docs, count) = searcher + .search(&query, &(TopDocs::with_limit(2), Count)) + .unwrap(); + assert_eq!(count, 1); + }; + let query_from_text = |text: String| { + QueryParser::for_index(&index, vec![ip_field]) + .parse_query(&text) + .unwrap() + }; + + let query_from_ip = |ip_addr| -> Box { + Box::new(TermQuery::new( + Term::from_field_ip_addr(ip_field, ip_addr), + IndexRecordOption::Basic, + )) + }; + + assert_single_hit(query_from_ip(ip_addr_1)); + assert_single_hit(query_from_ip(ip_addr_2)); + assert_single_hit(query_from_text("127.0.0.1".to_string())); + assert_single_hit(query_from_text("\"127.0.0.1\"".to_string())); + assert_single_hit(query_from_text(format!("\"{}\"", ip_addr_1))); + assert_single_hit(query_from_text(format!("\"{}\"", ip_addr_2))); + } +} diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 557d2ec4d..aa7bd2c6a 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -1,4 +1,4 @@ -use std::net::{IpAddr, Ipv6Addr}; +use std::net::IpAddr; use std::str::FromStr; use serde::{Deserialize, Serialize}; @@ -6,7 +6,7 @@ use serde_json::Value as JsonValue; use thiserror::Error; use super::ip_options::IpAddrOptions; -use super::Cardinality; +use super::{Cardinality, IntoIpv6Addr}; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; use crate::schema::{ @@ -188,7 +188,7 @@ impl FieldType { FieldType::Facet(ref _facet_options) => true, FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), - FieldType::IpAddr(_) => false, + FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_indexed(), } } @@ -264,7 +264,7 @@ impl FieldType { FieldType::Facet(_) => 
false, FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), FieldType::JsonObject(ref _json_object_options) => false, - FieldType::IpAddr(_) => false, + FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.fieldnorms(), } } @@ -309,7 +309,13 @@ impl FieldType { FieldType::JsonObject(ref json_obj_options) => json_obj_options .get_text_indexing_options() .map(TextFieldIndexing::index_option), - FieldType::IpAddr(_) => None, + FieldType::IpAddr(ref ip_addr_options) => { + if ip_addr_options.is_indexed() { + Some(IndexRecordOption::Basic) + } else { + None + } + } } } @@ -356,11 +362,8 @@ impl FieldType { json: JsonValue::String(field_text), } })?; - let ip_addr_v6: Ipv6Addr = match ip_addr { - IpAddr::V4(v4) => v4.to_ipv6_mapped(), - IpAddr::V6(v6) => v6, - }; - Ok(Value::IpAddr(ip_addr_v6)) + + Ok(Value::IpAddr(ip_addr.into_ipv6_addr())) } } } diff --git a/src/schema/ip_options.rs b/src/schema/ip_options.rs index ce998f43f..8738f75f3 100644 --- a/src/schema/ip_options.rs +++ b/src/schema/ip_options.rs @@ -1,3 +1,4 @@ +use std::net::{IpAddr, Ipv6Addr}; use std::ops::BitOr; use serde::{Deserialize, Serialize}; @@ -5,12 +6,29 @@ use serde::{Deserialize, Serialize}; use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; use super::Cardinality; +/// Trait to convert into an Ipv6Addr. +pub trait IntoIpv6Addr { + /// Consumes the object and returns an Ipv6Addr. + fn into_ipv6_addr(self) -> Ipv6Addr; +} + +impl IntoIpv6Addr for IpAddr { + fn into_ipv6_addr(self) -> Ipv6Addr { + match self { + IpAddr::V4(addr) => addr.to_ipv6_mapped(), + IpAddr::V6(addr) => addr, + } + } +} + /// Define how an ip field should be handled by tantivy. 
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct IpAddrOptions { #[serde(skip_serializing_if = "Option::is_none")] fast: Option, stored: bool, + indexed: bool, + fieldnorms: bool, } impl IpAddrOptions { @@ -19,11 +37,21 @@ impl IpAddrOptions { self.fast.is_some() } - /// Returns `true` if the json object should be stored. + /// Returns `true` if the ip address should be stored in the doc store. pub fn is_stored(&self) -> bool { self.stored } + /// Returns true iff the value is indexed and therefore searchable. + pub fn is_indexed(&self) -> bool { + self.indexed + } + + /// Returns true if and only if the value is normed. + pub fn fieldnorms(&self) -> bool { + self.fieldnorms + } + /// Returns the cardinality of the fastfield. /// /// If the field has not been declared as a fastfield, then @@ -32,6 +60,16 @@ impl IpAddrOptions { self.fast } + /// Set the field as normed. + /// + /// Setting an ip address as normed will generate + /// the fieldnorm data for it. + #[must_use] + pub fn set_fieldnorms(mut self) -> Self { + self.fieldnorms = true; + self + } + /// Sets the field as stored #[must_use] pub fn set_stored(mut self) -> Self { @@ -39,6 +77,19 @@ impl IpAddrOptions { self } + /// Set the field as indexed. + /// + /// Setting an ip address as indexed will generate + /// a posting list for each value taken by the ip address. + /// Ips are normalized to IpV6. + /// + /// This is required for the field to be searchable. + #[must_use] + pub fn set_indexed(mut self) -> Self { + self.indexed = true; + self + } + /// Set the field as a fast field. /// /// Fast fields are designed for random access. 
@@ -61,6 +112,8 @@ impl From<()> for IpAddrOptions { impl From for IpAddrOptions { fn from(_: FastFlag) -> Self { IpAddrOptions { + fieldnorms: false, + indexed: false, stored: false, fast: Some(Cardinality::SingleValue), } @@ -70,6 +123,8 @@ impl From for IpAddrOptions { impl From for IpAddrOptions { fn from(_: StoredFlag) -> Self { IpAddrOptions { + fieldnorms: false, + indexed: false, stored: true, fast: None, } @@ -79,6 +134,8 @@ impl From for IpAddrOptions { impl From for IpAddrOptions { fn from(_: IndexedFlag) -> Self { IpAddrOptions { + fieldnorms: true, + indexed: true, stored: false, fast: None, } @@ -91,6 +148,8 @@ impl> BitOr for IpAddrOptions { fn bitor(self, other: T) -> IpAddrOptions { let other = other.into(); IpAddrOptions { + fieldnorms: self.fieldnorms | other.fieldnorms, + indexed: self.indexed | other.indexed, stored: self.stored | other.stored, fast: self.fast.or(other.fast), } diff --git a/src/schema/mod.rs b/src/schema/mod.rs index c64eef788..2d2612500 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -138,7 +138,7 @@ pub use self::field_type::{FieldType, Type}; pub use self::field_value::FieldValue; pub use self::flags::{FAST, INDEXED, STORED}; pub use self::index_record_option::IndexRecordOption; -pub use self::ip_options::IpAddrOptions; +pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions}; pub use self::json_object_options::JsonObjectOptions; pub use self::named_field_document::NamedFieldDocument; pub use self::numeric_options::NumericOptions; diff --git a/src/schema/numeric_options.rs b/src/schema/numeric_options.rs index 3f82fe5db..ff03ad340 100644 --- a/src/schema/numeric_options.rs +++ b/src/schema/numeric_options.rs @@ -59,7 +59,7 @@ impl From for NumericOptions { } impl NumericOptions { - /// Returns true iff the value is stored. + /// Returns true iff the value is stored in the doc store. 
pub fn is_stored(&self) -> bool { self.stored } diff --git a/src/schema/term.rs b/src/schema/term.rs index 8a398818a..ba9a1ba82 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,7 +1,10 @@ use std::convert::TryInto; use std::hash::{Hash, Hasher}; +use std::net::Ipv6Addr; use std::{fmt, str}; +use fastfield_codecs::MonotonicallyMappableToU128; + use super::Field; use crate::fastfield::FastValue; use crate::schema::{Facet, Type}; @@ -68,6 +71,13 @@ impl Term { self.0.len() == TERM_METADATA_LENGTH } + /// Builds a term given a field, and an `Ipv6Addr`-value + pub fn from_field_ip_addr(field: Field, ip_addr: Ipv6Addr) -> Term { + let mut term = Self::with_type_and_field(Type::IpAddr, field); + term.set_ip_addr(ip_addr); + term + } + /// Builds a term given a field, and a `u64`-value pub fn from_field_u64(field: Field, val: u64) -> Term { Term::from_fast_value(field, &val) @@ -155,6 +165,11 @@ impl Term { self.set_bytes(val.to_u64().to_be_bytes().as_ref()); } + /// Sets an `Ipv6Addr` value in the term. + pub fn set_ip_addr(&mut self, val: Ipv6Addr) { + self.set_bytes(val.to_u128().to_be_bytes().as_ref()); + } + /// Sets the value of a `Bytes` field. pub fn set_bytes(&mut self, bytes: &[u8]) { self.truncate_value_bytes(0);