From e7daf69de96d0aafa67c4a52ef3b04579cabc090 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 20 Feb 2025 14:44:43 +0100 Subject: [PATCH] use usize in bitpacker use usize in bitpacker to enable larger columns in the columnar store Godbolt comparison with u32 vs u64 for get access: https://godbolt.org/z/cjf7nenYP Add a mini-tool to inspect columnar files created by tantivy. (very basic functionality which can be extended later) --- bitpacker/src/bitpacker.rs | 6 +-- columnar/columnar-cli-inspect/Cargo.toml | 18 ++++++++ columnar/columnar-cli-inspect/src/main.rs | 54 +++++++++++++++++++++++ common/src/file_slice.rs | 7 +++ src/directory/footer.rs | 15 +++++-- src/directory/mod.rs | 2 +- 6 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 columnar/columnar-cli-inspect/Cargo.toml create mode 100644 columnar/columnar-cli-inspect/src/main.rs diff --git a/bitpacker/src/bitpacker.rs b/bitpacker/src/bitpacker.rs index d270647a9..021f01d82 100644 --- a/bitpacker/src/bitpacker.rs +++ b/bitpacker/src/bitpacker.rs @@ -94,14 +94,14 @@ impl BitUnpacker { #[inline] pub fn get(&self, idx: u32, data: &[u8]) -> u64 { - let addr_in_bits = idx * self.num_bits; - let addr = (addr_in_bits >> 3) as usize; + let addr_in_bits = idx as usize * self.num_bits as usize; + let addr = addr_in_bits >> 3; if addr + 8 > data.len() { if self.num_bits == 0 { return 0; } let bit_shift = addr_in_bits & 7; - return self.get_slow_path(addr, bit_shift, data); + return self.get_slow_path(addr, bit_shift as u32, data); } let bit_shift = addr_in_bits & 7; let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap(); diff --git a/columnar/columnar-cli-inspect/Cargo.toml b/columnar/columnar-cli-inspect/Cargo.toml new file mode 100644 index 000000000..f43bec1d2 --- /dev/null +++ b/columnar/columnar-cli-inspect/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "tantivy-columnar-inspect" +version = "0.1.0" +edition = "2021" +license = "MIT" + +[dependencies] +tantivy = {path="../..", 
package="tantivy"} +columnar = {path="../", package="tantivy-columnar"} +common = {path="../../common", package="tantivy-common"} + +[workspace] +members = [] + +[profile.release] +debug = true +#debug-assertions = true +#overflow-checks = true diff --git a/columnar/columnar-cli-inspect/src/main.rs b/columnar/columnar-cli-inspect/src/main.rs new file mode 100644 index 000000000..ee99cc058 --- /dev/null +++ b/columnar/columnar-cli-inspect/src/main.rs @@ -0,0 +1,54 @@ +use columnar::ColumnarReader; +use common::file_slice::{FileSlice, WrapFile}; +use std::io; +use std::path::Path; +use tantivy::directory::footer::Footer; + +fn main() -> io::Result<()> { + println!("Opens a columnar file written by tantivy and validates it."); + let path = std::env::args().nth(1).unwrap(); + + let path = Path::new(&path); + println!("Reading {:?}", path); + let _reader = open_and_validate_columnar(path.to_str().unwrap())?; + + Ok(()) +} + +pub fn validate_columnar_reader(reader: &ColumnarReader) { + let num_rows = reader.num_rows(); + println!("num_rows: {}", num_rows); + let columns = reader.list_columns().unwrap(); + println!("num columns: {:?}", columns.len()); + for (col_name, dynamic_column_handle) in columns { + let col = dynamic_column_handle.open().unwrap(); + match col { + columnar::DynamicColumn::Bool(_) + | columnar::DynamicColumn::I64(_) + | columnar::DynamicColumn::U64(_) + | columnar::DynamicColumn::F64(_) + | columnar::DynamicColumn::IpAddr(_) + | columnar::DynamicColumn::DateTime(_) + | columnar::DynamicColumn::Bytes(_) => {} + columnar::DynamicColumn::Str(str_column) => { + let num_vals = str_column.ords().values.num_vals(); + let num_terms_dict = str_column.num_terms() as u64; + let max_ord = str_column.ords().values.iter().max().unwrap_or_default(); + println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",); + for ord in str_column.ords().values.iter() { + assert!(ord < num_terms_dict); + } + } + } + } +} + +/// 
Opens a columnar file that was written by tantivy and validates it. +pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> { + let wrap_file = WrapFile::new(std::fs::File::open(path)?)?; + let slice = FileSlice::new(std::sync::Arc::new(wrap_file)); + let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap(); + let reader = ColumnarReader::open(slice).unwrap(); + validate_columnar_reader(&reader); + Ok(reader) +} diff --git a/common/src/file_slice.rs b/common/src/file_slice.rs index b730f8b39..0d5d1adcd 100644 --- a/common/src/file_slice.rs +++ b/common/src/file_slice.rs @@ -1,5 +1,6 @@ use std::fs::File; use std::ops::{Deref, Range, RangeBounds}; +use std::path::Path; use std::sync::Arc; use std::{fmt, io}; @@ -177,6 +178,12 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R) } impl FileSlice { + /// Creates a FileSlice from a path. + pub fn open(path: &Path) -> io::Result<FileSlice> { + let wrap_file = WrapFile::new(File::open(path)?)?; + Ok(FileSlice::new(Arc::new(wrap_file))) + } + /// Wraps a FileHandle. pub fn new(file_handle: Arc<dyn FileHandle>) -> Self { let num_bytes = file_handle.len(); diff --git a/src/directory/footer.rs b/src/directory/footer.rs index 70acceebd..bffa2f2cf 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -1,3 +1,9 @@ +//! The footer is a small metadata structure that is appended at the end of every file. +//! +//! The footer is used to store a checksum of the file content. +//! The footer also stores the version of the index format. +//! This version is used to detect incompatibility between the index and the library version. 
+ use std::io; use std::io::Write; @@ -20,20 +26,22 @@ type CrcHashU32 = u32; /// A Footer is appended to every file #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Footer { + /// The version of the index format pub version: Version, + /// The crc32 hash of the body pub crc: CrcHashU32, } impl Footer { - pub fn new(crc: CrcHashU32) -> Self { + pub(crate) fn new(crc: CrcHashU32) -> Self { let version = crate::VERSION.clone(); Footer { version, crc } } - pub fn crc(&self) -> CrcHashU32 { + pub(crate) fn crc(&self) -> CrcHashU32 { self.crc } - pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> { + pub(crate) fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> { let mut counting_write = CountingWriter::wrap(&mut write); counting_write.write_all(serde_json::to_string(&self)?.as_ref())?; let footer_payload_len = counting_write.written_bytes(); @@ -42,6 +50,7 @@ impl Footer { Ok(()) } + /// Extracts the tantivy Footer from the file and returns the footer and the rest of the file pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> { if file.len() < 4 { return Err(io::Error::new( diff --git a/src/directory/mod.rs b/src/directory/mod.rs index 93c922567..7fab7e051 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -6,7 +6,7 @@ mod mmap_directory; mod directory; mod directory_lock; mod file_watcher; -mod footer; +pub mod footer; mod managed_directory; mod ram_directory; mod watch_event_router;