add indexing for ip field

Closes #1595
This commit is contained in:
Pascal Seitz
2022-10-14 13:54:33 +08:00
parent c9cf9c952a
commit 6800fdec9d
10 changed files with 209 additions and 23 deletions

View File

@@ -1656,13 +1656,15 @@ mod tests {
let old_reader = index.reader()?;
let ip_exists = |id| id % 3 != 0; // 0 does not exist
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
let ip_from_id = Ipv6Addr::from_u128(id as u128);
if id % 3 == 0 {
if !ip_exists(id) {
// every 3rd doc has no ip field
index_writer.add_document(doc!(id_field=>id,
bytes_field => id.to_le_bytes().as_slice(),
@@ -1803,7 +1805,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.flat_map(|id| {
if id % 3 == 0 {
if !ip_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1815,7 +1817,7 @@ mod tests {
let expected_ips = expected_ids_and_num_occurrences
.keys()
.filter_map(|id| {
if id % 3 == 0 {
if !ip_exists(*id) {
None
} else {
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1918,7 +1920,8 @@ mod tests {
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
};
for (existing_id, count) in expected_ids_and_num_occurrences {
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
let assert_field = |field| do_search(&existing_id.to_string(), field).len() as u64;
assert_eq!(assert_field(text_field), count);
assert_eq!(assert_field(i64_field), count);
@@ -1954,6 +1957,26 @@ mod tests {
Term::from_field_date(date_field, DateTime::from_timestamp_secs(deleted_id as i64));
assert_eq!(do_search2(term).len() as u64, 0);
}
// search ip address
//
for (existing_id, count) in &expected_ids_and_num_occurrences {
let (existing_id, count) = (*existing_id, *count);
if !ip_exists(existing_id) {
continue;
}
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
let ip_addr = Ipv6Addr::from_u128(existing_id as u128);
// Test incoming ip as ipv6
assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count);
let term = Term::from_field_ip_addr(ip_field, ip_addr);
assert_eq!(do_search2(term).len() as u64, count);
// Test incoming ip as ipv4
if let Some(ip_addr) = ip_addr.to_ipv4_mapped() {
assert_eq!(do_search_ip_field(&format!("\"{}\"", ip_addr)), count);
}
}
// test facets
for segment_reader in searcher.segment_readers().iter() {
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();

View File

@@ -320,7 +320,18 @@ impl SegmentWriter {
ctx,
)?;
}
FieldType::IpAddr(_) => {}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value in values {
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
term_buffer.set_ip_addr(ip_addr);
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
if field_entry.has_fieldnorms() {
self.fieldnorms_writer.record(doc_id, field, num_vals);
}
}
}
}
Ok(())

View File

@@ -212,12 +212,12 @@ pub fn block_wand(
}
/// Specialized version of [`block_wand`] for a single scorer.
/// In this case, the algorithm is simple and readable and faster (~ x3)
/// In this case, the algorithm is simple, readable and faster (~ x3)
/// than the generic algorithm.
/// The algorithm behaves as follows:
/// - While we don't hit the end of the docset:
/// - While the block max score is under the `threshold`, go to the next block.
/// - On a block, advance until the end and execute `callback`` when the doc score is greater or
/// - On a block, advance until the end and execute `callback` when the doc score is greater or
/// equal to the `threshold`.
pub fn block_wand_single_scorer(
mut scorer: TermScorer,

View File

@@ -1,4 +1,5 @@
use std::collections::HashMap;
use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};
@@ -15,7 +16,7 @@ use crate::query::{
TermQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -84,6 +85,9 @@ pub enum QueryParserError {
/// The format for the facet field is invalid.
#[error("The facet field is malformed: {0}")]
FacetFormatError(#[from] FacetParseError),
/// The format for the ip field is invalid.
#[error("The ip field is malformed: {0}")]
IpFormatError(#[from] AddrParseError),
}
/// Recursively remove empty clause from the AST
@@ -401,7 +405,7 @@ impl QueryParser {
Ok(Term::from_field_bytes(field, &bytes))
}
FieldType::IpAddr(_) => Err(QueryParserError::UnsupportedQuery(
"Range query are not supported on IpAddr field.".to_string(),
"Range query are not supported on ip field.".to_string(),
)),
}
}
@@ -509,7 +513,11 @@ impl QueryParser {
let bytes_term = Term::from_field_bytes(field, &bytes);
Ok(vec![LogicalLiteral::Term(bytes_term)])
}
FieldType::IpAddr(_) => Err(QueryParserError::FieldNotIndexed(field_name.to_string())),
FieldType::IpAddr(_) => {
let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
let term = Term::from_field_ip_addr(field, ip_v6);
Ok(vec![LogicalLiteral::Term(term)])
}
}
}

View File

@@ -124,3 +124,70 @@ impl Query for TermQuery {
visitor(&self.term, false);
}
}
#[cfg(test)]
mod tests {
    use std::net::{IpAddr, Ipv6Addr};
    use std::str::FromStr;

    use fastfield_codecs::MonotonicallyMappableToU128;

    use crate::collector::{Count, TopDocs};
    use crate::query::{Query, QueryParser, TermQuery};
    use crate::schema::{IndexRecordOption, IntoIpv6Addr, Schema, INDEXED, STORED};
    use crate::{doc, Index, Term};

    /// Indexes two documents holding one ip address each (an IPv4 address
    /// converted to IPv6 and an IPv6 address built from a `u128`) and checks
    /// that every way of querying either address returns exactly one hit.
    #[test]
    fn search_ip_test() {
        let mut schema_builder = Schema::builder();
        let ip_field = schema_builder.add_ip_addr_field("ip", INDEXED | STORED);
        let index = Index::create_in_ram(schema_builder.build());

        let mapped_v4_addr = IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr();
        let v6_addr = Ipv6Addr::from_u128(10);

        // The writer lives in its own scope so it is dropped before the reader is created.
        {
            let mut index_writer = index.writer(3_000_000).unwrap();
            index_writer
                .add_document(doc!(ip_field => mapped_v4_addr))
                .unwrap();
            index_writer.add_document(doc!(ip_field => v6_addr)).unwrap();
            index_writer.commit().unwrap();
        }

        let reader = index.reader().unwrap();
        let searcher = reader.searcher();

        // Runs the query and checks that exactly one document matches.
        let assert_single_hit = |query: Box<dyn Query>| {
            let hit_count = searcher
                .search(&query, &(TopDocs::with_limit(2), Count))
                .unwrap()
                .1;
            assert_eq!(hit_count, 1);
        };

        // Builds a query by running the query parser on `text`, with the ip field as default.
        let query_parser = QueryParser::for_index(&index, vec![ip_field]);
        let query_from_text = |text: String| query_parser.parse_query(&text).unwrap();

        // Builds a term query directly from an ip address, bypassing the query parser.
        let query_from_ip = |ip_addr: Ipv6Addr| -> Box<dyn Query> {
            let ip_term = Term::from_field_ip_addr(ip_field, ip_addr);
            Box::new(TermQuery::new(ip_term, IndexRecordOption::Basic))
        };

        assert_single_hit(query_from_ip(mapped_v4_addr));
        assert_single_hit(query_from_ip(v6_addr));

        // Unquoted IPv4, quoted IPv4, then both addresses in their quoted IPv6 display form.
        let text_queries = vec![
            "127.0.0.1".to_string(),
            "\"127.0.0.1\"".to_string(),
            format!("\"{}\"", mapped_v4_addr),
            format!("\"{}\"", v6_addr),
        ];
        for text in text_queries {
            assert_single_hit(query_from_text(text));
        }
    }
}

View File

@@ -1,4 +1,4 @@
use std::net::{IpAddr, Ipv6Addr};
use std::net::IpAddr;
use std::str::FromStr;
use serde::{Deserialize, Serialize};
@@ -6,7 +6,7 @@ use serde_json::Value as JsonValue;
use thiserror::Error;
use super::ip_options::IpAddrOptions;
use super::Cardinality;
use super::{Cardinality, IntoIpv6Addr};
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{
@@ -188,7 +188,7 @@ impl FieldType {
FieldType::Facet(ref _facet_options) => true,
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
FieldType::IpAddr(_) => false,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_indexed(),
}
}
@@ -264,7 +264,7 @@ impl FieldType {
FieldType::Facet(_) => false,
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
FieldType::JsonObject(ref _json_object_options) => false,
FieldType::IpAddr(_) => false,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.fieldnorms(),
}
}
@@ -309,7 +309,13 @@ impl FieldType {
FieldType::JsonObject(ref json_obj_options) => json_obj_options
.get_text_indexing_options()
.map(TextFieldIndexing::index_option),
FieldType::IpAddr(_) => None,
FieldType::IpAddr(ref ip_addr_options) => {
if ip_addr_options.is_indexed() {
Some(IndexRecordOption::Basic)
} else {
None
}
}
}
}
@@ -356,11 +362,8 @@ impl FieldType {
json: JsonValue::String(field_text),
}
})?;
let ip_addr_v6: Ipv6Addr = match ip_addr {
IpAddr::V4(v4) => v4.to_ipv6_mapped(),
IpAddr::V6(v6) => v6,
};
Ok(Value::IpAddr(ip_addr_v6))
Ok(Value::IpAddr(ip_addr.into_ipv6_addr()))
}
}
}

View File

@@ -1,3 +1,4 @@
use std::net::{IpAddr, Ipv6Addr};
use std::ops::BitOr;
use serde::{Deserialize, Serialize};
@@ -5,12 +6,29 @@ use serde::{Deserialize, Serialize};
use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
use super::Cardinality;
/// Conversion of an ip address into its [`Ipv6Addr`] representation.
pub trait IntoIpv6Addr {
    /// Consumes `self` and returns the corresponding [`Ipv6Addr`].
    fn into_ipv6_addr(self) -> Ipv6Addr;
}

impl IntoIpv6Addr for IpAddr {
    /// An IPv6 address is returned unchanged, an IPv4 address is
    /// converted to its IPv4-mapped IPv6 address.
    fn into_ipv6_addr(self) -> Ipv6Addr {
        match self {
            IpAddr::V6(v6_addr) => v6_addr,
            IpAddr::V4(v4_addr) => v4_addr.to_ipv6_mapped(),
        }
    }
}
/// Define how an ip field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct IpAddrOptions {
    /// Cardinality of the fast field, `None` if the field is not a fast field.
    #[serde(skip_serializing_if = "Option::is_none")]
    fast: Option<Cardinality>,
    /// Whether the ip address is stored in the doc store.
    stored: bool,
    /// Whether the ip address is indexed and therefore searchable.
    ///
    /// Defaults to `false` when the key is missing, so that options serialized
    /// before this flag existed can still be deserialized.
    #[serde(default)]
    indexed: bool,
    /// Whether fieldnorms are recorded for the field.
    ///
    /// Defaults to `false` when the key is missing, so that options serialized
    /// before this flag existed can still be deserialized.
    #[serde(default)]
    fieldnorms: bool,
}
impl IpAddrOptions {
@@ -19,11 +37,21 @@ impl IpAddrOptions {
self.fast.is_some()
}
/// Returns `true` if the json object should be stored.
/// Returns `true` if the ip address should be stored in the doc store.
pub fn is_stored(&self) -> bool {
self.stored
}
/// Returns `true` if and only if the ip address is indexed,
/// which is required for the field to be searchable.
pub fn is_indexed(&self) -> bool {
self.indexed
}
/// Returns `true` if and only if the field is normed,
/// i.e. fieldnorm data is generated for it.
pub fn fieldnorms(&self) -> bool {
self.fieldnorms
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
@@ -32,6 +60,16 @@ impl IpAddrOptions {
self.fast
}
/// Set the field as normed.
///
/// Setting an integer as normed will generate
/// the fieldnorm data for it.
#[must_use]
pub fn set_fieldnorms(mut self) -> Self {
self.fieldnorms = true;
self
}
/// Sets the field as stored
#[must_use]
pub fn set_stored(mut self) -> Self {
@@ -39,6 +77,19 @@ impl IpAddrOptions {
self
}
/// Set the field as indexed.
///
/// Setting an ip address as indexed will generate
/// a posting list for each value taken by the ip address.
/// Ips are normalized to IpV6.
///
/// This is required for the field to be searchable.
#[must_use]
pub fn set_indexed(mut self) -> Self {
self.indexed = true;
self
}
/// Set the field as a fast field.
///
/// Fast fields are designed for random access.
@@ -61,6 +112,8 @@ impl From<()> for IpAddrOptions {
impl From<FastFlag> for IpAddrOptions {
fn from(_: FastFlag) -> Self {
IpAddrOptions {
fieldnorms: false,
indexed: false,
stored: false,
fast: Some(Cardinality::SingleValue),
}
@@ -70,6 +123,8 @@ impl From<FastFlag> for IpAddrOptions {
impl From<StoredFlag> for IpAddrOptions {
fn from(_: StoredFlag) -> Self {
IpAddrOptions {
fieldnorms: false,
indexed: false,
stored: true,
fast: None,
}
@@ -79,6 +134,8 @@ impl From<StoredFlag> for IpAddrOptions {
impl From<IndexedFlag> for IpAddrOptions {
fn from(_: IndexedFlag) -> Self {
IpAddrOptions {
fieldnorms: true,
indexed: true,
stored: false,
fast: None,
}
@@ -91,6 +148,8 @@ impl<T: Into<IpAddrOptions>> BitOr<T> for IpAddrOptions {
fn bitor(self, other: T) -> IpAddrOptions {
let other = other.into();
IpAddrOptions {
fieldnorms: self.fieldnorms | other.fieldnorms,
indexed: self.indexed | other.indexed,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
}

View File

@@ -138,7 +138,7 @@ pub use self::field_type::{FieldType, Type};
pub use self::field_value::FieldValue;
pub use self::flags::{FAST, INDEXED, STORED};
pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::IpAddrOptions;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};
pub use self::json_object_options::JsonObjectOptions;
pub use self::named_field_document::NamedFieldDocument;
pub use self::numeric_options::NumericOptions;

View File

@@ -59,7 +59,7 @@ impl From<NumericOptionsDeser> for NumericOptions {
}
impl NumericOptions {
/// Returns true iff the value is stored.
/// Returns true iff the value is stored in the doc store.
pub fn is_stored(&self) -> bool {
self.stored
}

View File

@@ -1,7 +1,10 @@
use std::convert::TryInto;
use std::hash::{Hash, Hasher};
use std::net::Ipv6Addr;
use std::{fmt, str};
use fastfield_codecs::MonotonicallyMappableToU128;
use super::Field;
use crate::fastfield::FastValue;
use crate::schema::{Facet, Type};
@@ -68,6 +71,13 @@ impl Term {
self.0.len() == TERM_METADATA_LENGTH
}
/// Builds a term given a field, and an `Ipv6Addr`-value
///
/// The term is created with the `Type::IpAddr` type and the address
/// is written into it through [`Term::set_ip_addr`].
pub fn from_field_ip_addr(field: Field, ip_addr: Ipv6Addr) -> Term {
    let mut ip_term = Self::with_type_and_field(Type::IpAddr, field);
    ip_term.set_ip_addr(ip_addr);
    ip_term
}
/// Builds a term given a field, and a `u64`-value
pub fn from_field_u64(field: Field, val: u64) -> Term {
Term::from_fast_value(field, &val)
@@ -155,6 +165,11 @@ impl Term {
self.set_bytes(val.to_u64().to_be_bytes().as_ref());
}
/// Sets an `Ipv6Addr` value in the term.
///
/// The address is converted to its `u128` representation, and the
/// big-endian bytes of that integer become the value bytes of the term.
pub fn set_ip_addr(&mut self, val: Ipv6Addr) {
    let big_endian_bytes = val.to_u128().to_be_bytes();
    self.set_bytes(&big_endian_bytes);
}
/// Sets the value of a `Bytes` field.
pub fn set_bytes(&mut self, bytes: &[u8]) {
self.truncate_value_bytes(0);