use usize in bitpacker

use usize in bitpacker to enable larger columns in the columnar store

Godbolt comparison with u32 vs u64 for get access: https://godbolt.org/z/cjf7nenYP

Add a mini-tool to inspect columnar files created by tantivy (very basic functionality, which can be extended later).
This commit is contained in:
Pascal Seitz
2025-02-20 14:44:43 +01:00
parent 876a579e5d
commit e7daf69de9
6 changed files with 95 additions and 7 deletions

View File

@@ -94,14 +94,14 @@ impl BitUnpacker {
#[inline] #[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 { pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
let addr_in_bits = idx * self.num_bits; let addr_in_bits = idx as usize * self.num_bits as usize;
let addr = (addr_in_bits >> 3) as usize; let addr = addr_in_bits >> 3;
if addr + 8 > data.len() { if addr + 8 > data.len() {
if self.num_bits == 0 { if self.num_bits == 0 {
return 0; return 0;
} }
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
return self.get_slow_path(addr, bit_shift, data); return self.get_slow_path(addr, bit_shift as u32, data);
} }
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap(); let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();

View File

@@ -0,0 +1,18 @@
# Manifest of the stand-alone columnar inspection tool.
[package]
name = "tantivy-columnar-inspect"
version = "0.1.0"
edition = "2021"
license = "MIT"
# All dependencies are resolved by relative path from the surrounding repository.
[dependencies]
tantivy = {path="../..", package="tantivy"}
columnar = {path="../", package="tantivy-columnar"}
common = {path="../../common", package="tantivy-common"}
# An empty workspace table makes this crate its own workspace root, so it is
# not treated as a member of a workspace found in a parent directory.
[workspace]
members = []
# Keep debug information in release builds.
[profile.release]
debug = true
# Optional extra runtime checks for release builds; uncomment to enable.
#debug-assertions = true
#overflow-checks = true

View File

@@ -0,0 +1,54 @@
use columnar::ColumnarReader;
use common::file_slice::{FileSlice, WrapFile};
use std::io;
use std::path::Path;
use tantivy::directory::footer::Footer;
/// Entry point of the columnar inspection tool.
///
/// Expects the path of a columnar file written by tantivy as the first
/// command-line argument, then opens and validates that file.
///
/// # Errors
/// Returns an `InvalidInput` error when the path argument is missing, and
/// forwards any I/O error reported by `open_and_validate_columnar`.
fn main() -> io::Result<()> {
    println!("Opens a columnar file written by tantivy and validates it.");
    // A missing argument is a user error, not a bug: report it instead of panicking.
    let path = std::env::args().nth(1).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            "missing argument: expected the path to a columnar file",
        )
    })?;
    println!("Reading {:?}", Path::new(&path));
    // `path` is already a valid UTF-8 `String`, so it can be passed on directly.
    let _reader = open_and_validate_columnar(&path)?;
    Ok(())
}
/// Prints basic statistics about `reader` and checks its string columns.
///
/// The row count and the number of columns are printed first. Every column is
/// then opened; for each string column the number of values, the number of
/// dictionary terms and the largest term ordinal are printed, and the ordinals
/// are checked against the dictionary size. All other column types are only
/// opened.
///
/// # Panics
/// Panics if the columns cannot be listed, if a column cannot be opened, or if
/// a string column contains an ordinal that is not smaller than the number of
/// terms in its dictionary.
pub fn validate_columnar_reader(reader: &ColumnarReader) {
    let num_rows = reader.num_rows();
    println!("num_rows: {}", num_rows);
    let columns = reader
        .list_columns()
        .expect("failed to list the columns of the columnar file");
    println!("num columns: {:?}", columns.len());
    for (col_name, dynamic_column_handle) in columns {
        let col = dynamic_column_handle
            .open()
            .unwrap_or_else(|err| panic!("failed to open column {col_name:?}: {err:?}"));
        match col {
            columnar::DynamicColumn::Bool(_)
            | columnar::DynamicColumn::I64(_)
            | columnar::DynamicColumn::U64(_)
            | columnar::DynamicColumn::F64(_)
            | columnar::DynamicColumn::IpAddr(_)
            | columnar::DynamicColumn::DateTime(_)
            | columnar::DynamicColumn::Bytes(_) => {}
            columnar::DynamicColumn::Str(str_column) => {
                let num_vals = str_column.ords().values.num_vals();
                let num_terms_dict = str_column.num_terms() as u64;
                // Single pass over the ordinals: `None` means the column has no values.
                let largest_ord = str_column.ords().values.iter().max();
                let max_ord = largest_ord.unwrap_or_default();
                println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
                // Every ordinal is below the dictionary size exactly when the
                // largest one is, so the values need not be scanned a second time.
                if let Some(largest) = largest_ord {
                    assert!(
                        largest < num_terms_dict,
                        "column {col_name:?}: term ordinal {largest} is out of range, \
                         the dictionary only has {num_terms_dict} terms"
                    );
                }
            }
        }
    }
}
/// Opens a columnar file that was written by tantivy and validates it.
///
/// The file at `path` is opened, the tantivy footer is split off its end, and
/// the remaining bytes are opened as a columnar. The resulting reader is passed
/// to `validate_columnar_reader` and then returned.
///
/// # Errors
/// Returns an error if the file cannot be opened, if the footer cannot be
/// extracted, or if the remaining bytes cannot be opened as a columnar.
///
/// # Panics
/// Panics if `validate_columnar_reader` detects an inconsistency.
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
    let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
    let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
    // A truncated or corrupt file is reported as an error rather than a panic.
    let (_footer, body) = Footer::extract_footer(slice)?;
    let reader = ColumnarReader::open(body)?;
    validate_columnar_reader(&reader);
    Ok(reader)
}

View File

@@ -1,5 +1,6 @@
use std::fs::File; use std::fs::File;
use std::ops::{Deref, Range, RangeBounds}; use std::ops::{Deref, Range, RangeBounds};
use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};
@@ -177,6 +178,12 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
} }
impl FileSlice { impl FileSlice {
/// Opens the file at `path` and exposes it as a `FileSlice`.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be opened or if
/// wrapping it in a `WrapFile` fails.
pub fn open(path: &Path) -> io::Result<FileSlice> {
    let file = File::open(path)?;
    let handle = Arc::new(WrapFile::new(file)?);
    Ok(FileSlice::new(handle))
}
/// Wraps a FileHandle. /// Wraps a FileHandle.
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self { pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
let num_bytes = file_handle.len(); let num_bytes = file_handle.len();

View File

@@ -1,3 +1,9 @@
//! The footer is a small metadata structure that is appended at the end of every file.
//!
//! The footer is used to store a checksum of the file content.
//! The footer also stores the version of the index format.
//! This version is used to detect incompatibility between the index and the library version.
use std::io; use std::io;
use std::io::Write; use std::io::Write;
@@ -20,20 +26,22 @@ type CrcHashU32 = u32;
/// A Footer is appended to every file /// A Footer is appended to every file
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Footer { pub struct Footer {
/// The version of the index format
pub version: Version, pub version: Version,
/// The crc32 hash of the body
pub crc: CrcHashU32, pub crc: CrcHashU32,
} }
impl Footer { impl Footer {
pub fn new(crc: CrcHashU32) -> Self { pub(crate) fn new(crc: CrcHashU32) -> Self {
let version = crate::VERSION.clone(); let version = crate::VERSION.clone();
Footer { version, crc } Footer { version, crc }
} }
pub fn crc(&self) -> CrcHashU32 { pub(crate) fn crc(&self) -> CrcHashU32 {
self.crc self.crc
} }
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> { pub(crate) fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
let mut counting_write = CountingWriter::wrap(&mut write); let mut counting_write = CountingWriter::wrap(&mut write);
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?; counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
let footer_payload_len = counting_write.written_bytes(); let footer_payload_len = counting_write.written_bytes();
@@ -42,6 +50,7 @@ impl Footer {
Ok(()) Ok(())
} }
/// Extracts the tantivy Footer from the file and returns the footer and the rest of the file
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> { pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
if file.len() < 4 { if file.len() < 4 {
return Err(io::Error::new( return Err(io::Error::new(

View File

@@ -6,7 +6,7 @@ mod mmap_directory;
mod directory; mod directory;
mod directory_lock; mod directory_lock;
mod file_watcher; mod file_watcher;
mod footer; pub mod footer;
mod managed_directory; mod managed_directory;
mod ram_directory; mod ram_directory;
mod watch_event_router; mod watch_event_router;