Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot]
809b9ba100 Update nom requirement from 7 to 8
Updates the requirements on [nom](https://github.com/rust-bakery/nom) to permit the latest version.
- [Changelog](https://github.com/rust-bakery/nom/blob/main/CHANGELOG.md)
- [Commits](https://github.com/rust-bakery/nom/compare/7.0.0...8.0.0)

---
updated-dependencies:
- dependency-name: nom
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-01-27 20:21:30 +00:00
12 changed files with 94 additions and 151 deletions

View File

@@ -94,14 +94,14 @@ impl BitUnpacker {
#[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
let addr_in_bits = idx as usize * self.num_bits as usize;
let addr = addr_in_bits >> 3;
let addr_in_bits = idx * self.num_bits;
let addr = (addr_in_bits >> 3) as usize;
if addr + 8 > data.len() {
if self.num_bits == 0 {
return 0;
}
let bit_shift = addr_in_bits & 7;
return self.get_slow_path(addr, bit_shift as u32, data);
return self.get_slow_path(addr, bit_shift, data);
}
let bit_shift = addr_in_bits & 7;
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();

View File

@@ -1,18 +0,0 @@
[package]
name = "tantivy-columnar-inspect"
version = "0.1.0"
edition = "2021"
license = "MIT"
[dependencies]
tantivy = {path="../..", package="tantivy"}
columnar = {path="../", package="tantivy-columnar"}
common = {path="../../common", package="tantivy-common"}
[workspace]
members = []
[profile.release]
debug = true
#debug-assertions = true
#overflow-checks = true

View File

@@ -1,54 +0,0 @@
use columnar::ColumnarReader;
use common::file_slice::{FileSlice, WrapFile};
use std::io;
use std::path::Path;
use tantivy::directory::footer::Footer;
fn main() -> io::Result<()> {
println!("Opens a columnar file written by tantivy and validates it.");
let path = std::env::args().nth(1).unwrap();
let path = Path::new(&path);
println!("Reading {:?}", path);
let _reader = open_and_validate_columnar(path.to_str().unwrap())?;
Ok(())
}
pub fn validate_columnar_reader(reader: &ColumnarReader) {
let num_rows = reader.num_rows();
println!("num_rows: {}", num_rows);
let columns = reader.list_columns().unwrap();
println!("num columns: {:?}", columns.len());
for (col_name, dynamic_column_handle) in columns {
let col = dynamic_column_handle.open().unwrap();
match col {
columnar::DynamicColumn::Bool(_)
| columnar::DynamicColumn::I64(_)
| columnar::DynamicColumn::U64(_)
| columnar::DynamicColumn::F64(_)
| columnar::DynamicColumn::IpAddr(_)
| columnar::DynamicColumn::DateTime(_)
| columnar::DynamicColumn::Bytes(_) => {}
columnar::DynamicColumn::Str(str_column) => {
let num_vals = str_column.ords().values.num_vals();
let num_terms_dict = str_column.num_terms() as u64;
let max_ord = str_column.ords().values.iter().max().unwrap_or_default();
println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
for ord in str_column.ords().values.iter() {
assert!(ord < num_terms_dict);
}
}
}
}
}
/// Opens a columnar file that was written by tantivy and validates it.
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap();
let reader = ColumnarReader::open(slice).unwrap();
validate_columnar_reader(&reader);
Ok(reader)
}

View File

@@ -1,6 +1,5 @@
use std::fs::File;
use std::ops::{Deref, Range, RangeBounds};
use std::path::Path;
use std::sync::Arc;
use std::{fmt, io};
@@ -178,12 +177,6 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
}
impl FileSlice {
/// Creates a FileSlice from a path.
pub fn open(path: &Path) -> io::Result<FileSlice> {
let wrap_file = WrapFile::new(File::open(path)?)?;
Ok(FileSlice::new(Arc::new(wrap_file)))
}
/// Wraps a FileHandle.
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
let num_bytes = file_handle.len();

View File

@@ -12,4 +12,4 @@ keywords = ["search", "information", "retrieval"]
edition = "2021"
[dependencies]
nom = "7"
nom = "8"

View File

@@ -321,17 +321,7 @@ fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
UserInputLeaf::Exists {
field: String::new(),
},
tuple((
multispace0,
char('*'),
peek(alt((
value(
"",
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
),
eof,
))),
)),
tuple((multispace0, char('*'))),
)(inp)
}
@@ -341,14 +331,7 @@ fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
peek(tuple((
field_name,
multispace0,
char('*'),
peek(alt((
value(
"",
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
),
eof,
))), // we need to check this isn't a wildcard query
char('*'), // when we are here, we know it can't be anything but a exists
))),
)(inp)
.map_err(|e| e.map(|_| ()))
@@ -1641,19 +1624,13 @@ mod test {
#[test]
fn test_exist_query() {
test_parse_query_to_ast_helper("a:*", "$exists(\"a\")");
test_parse_query_to_ast_helper("a: *", "$exists(\"a\")");
test_parse_query_to_ast_helper("a:*", "\"a\":*");
test_parse_query_to_ast_helper("a: *", "\"a\":*");
// an exist followed by default term being b
test_is_parse_err("a:*b", "(*\"a\":* *b)");
test_parse_query_to_ast_helper(
"(hello AND toto:*) OR happy",
"(?(+hello +$exists(\"toto\")) ?happy)",
);
test_parse_query_to_ast_helper("(a:*)", "$exists(\"a\")");
// these are term/wildcard query (not a phrase prefix)
// this is a term query (not a phrase prefix)
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
test_parse_query_to_ast_helper("a:*b", "\"a\":*b");
test_parse_query_to_ast_helper(r#"a:*def*"#, "\"a\":*def*");
}
#[test]

View File

@@ -101,7 +101,7 @@ impl Debug for UserInputLeaf {
}
UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "$exists(\"{field}\")")
write!(formatter, "\"{field}\":*")
}
}
}

View File

@@ -125,26 +125,9 @@ impl MetricResult {
}
/// BucketEntry holds bucket aggregation result types.
// the order of fields is important to deserialize properly
// Terms must be first because all Terms are valid Range (we ignore unknown fields)
// Range and Histogram are always ambiguous, they contain the same 3 required fields, and all else
// is optional Having Range is usually more useful (contains more fields, missing field from
// Histogram can be obtained by key.to_string())
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum BucketResult {
/// This is the term result
Terms {
/// The buckets.
///
/// See [`TermsAggregation`](super::bucket::TermsAggregation)
buckets: Vec<BucketEntry>,
/// The number of documents that didnt make it into to TOP N due to shard_size or size
sum_other_doc_count: u64,
#[serde(skip_serializing_if = "Option::is_none")]
/// The upper bound error for the doc count of each term.
doc_count_error_upper_bound: Option<u64>,
},
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
/// sub-aggregations.
Range {
@@ -161,6 +144,18 @@ pub enum BucketResult {
/// See [`HistogramAggregation`](super::bucket::HistogramAggregation)
buckets: BucketEntries<BucketEntry>,
},
/// This is the term result
Terms {
/// The buckets.
///
/// See [`TermsAggregation`](super::bucket::TermsAggregation)
buckets: Vec<BucketEntry>,
/// The number of documents that didnt make it into to TOP N due to shard_size or size
sum_other_doc_count: u64,
#[serde(skip_serializing_if = "Option::is_none")]
/// The upper bound error for the doc count of each term.
doc_count_error_upper_bound: Option<u64>,
},
}
impl BucketResult {

View File

@@ -34,10 +34,10 @@ use crate::aggregation::*;
pub struct DateHistogramAggregationReq {
#[doc(hidden)]
/// Only for validation
pub interval: Option<String>,
interval: Option<String>,
#[doc(hidden)]
/// Only for validation
pub calendar_interval: Option<String>,
calendar_interval: Option<String>,
/// The field to aggregate on.
pub field: String,
/// The format to format dates. Unsupported currently.

View File

@@ -1,9 +1,3 @@
//! The footer is a small metadata structure that is appended at the end of every file.
//!
//! The footer is used to store a checksum of the file content.
//! The footer also stores the version of the index format.
//! This version is used to detect incompatibility between the index and the library version.
use std::io;
use std::io::Write;
@@ -26,22 +20,20 @@ type CrcHashU32 = u32;
/// A Footer is appended to every file
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Footer {
/// The version of the index format
pub version: Version,
/// The crc32 hash of the body
pub crc: CrcHashU32,
}
impl Footer {
pub(crate) fn new(crc: CrcHashU32) -> Self {
pub fn new(crc: CrcHashU32) -> Self {
let version = crate::VERSION.clone();
Footer { version, crc }
}
pub(crate) fn crc(&self) -> CrcHashU32 {
pub fn crc(&self) -> CrcHashU32 {
self.crc
}
pub(crate) fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
let mut counting_write = CountingWriter::wrap(&mut write);
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
let footer_payload_len = counting_write.written_bytes();
@@ -50,7 +42,6 @@ impl Footer {
Ok(())
}
/// Extracts the tantivy Footer from the file and returns the footer and the rest of the file
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
if file.len() < 4 {
return Err(io::Error::new(

View File

@@ -6,7 +6,7 @@ mod mmap_directory;
mod directory;
mod directory_lock;
mod file_watcher;
pub mod footer;
mod footer;
mod managed_directory;
mod ram_directory;
mod watch_event_router;

View File

@@ -1,19 +1,79 @@
use std::ops::Range;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::atomic::Ordering;
use std::sync::Arc;
use crate::Opstamp;
#[cfg(not(target_arch = "arm"))]
mod atomic_impl {
use std::sync::atomic::{AtomicU64, Ordering};
use crate::Opstamp;
#[derive(Default)]
pub struct AtomicU64Wrapper(AtomicU64);
impl AtomicU64Wrapper {
pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
AtomicU64Wrapper(AtomicU64::new(first_opstamp))
}
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val, order)
}
pub fn revert(&self, val: u64, order: Ordering) -> u64 {
self.0.store(val, order);
val
}
}
}
#[cfg(target_arch = "arm")]
mod atomic_impl {
/// Under other architecture, we rely on a mutex.
use std::sync::atomic::Ordering;
use std::sync::RwLock;
use crate::Opstamp;
#[derive(Default)]
pub struct AtomicU64Wrapper(RwLock<u64>);
impl AtomicU64Wrapper {
pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
AtomicU64Wrapper(RwLock::new(first_opstamp))
}
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
let previous_val = *lock;
*lock = previous_val + incr;
previous_val
}
pub fn revert(&self, val: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
*lock = val;
val
}
}
}
use self::atomic_impl::AtomicU64Wrapper;
/// Stamper provides Opstamps, which is just an auto-increment id to label
/// an operation.
///
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
pub struct Stamper(Arc<AtomicU64Wrapper>);
impl Stamper {
pub fn new(first_opstamp: Opstamp) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
Stamper(Arc::new(AtomicU64Wrapper::new(first_opstamp)))
}
pub fn stamp(&self) -> Opstamp {
@@ -32,8 +92,7 @@ impl Stamper {
/// Reverts the stamper to a given `Opstamp` value and returns it
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
self.0.store(to_opstamp, Ordering::SeqCst);
to_opstamp
self.0.revert(to_opstamp, Ordering::SeqCst)
}
}