mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
use usize in bitpacker
Use usize in the bitpacker to enable larger columns in the columnar store.
Godbolt comparison of u32 vs. u64 for `get` access: https://godbolt.org/z/cjf7nenYP
Add a mini-tool to inspect columnar files created by tantivy (very basic functionality that can be extended later).
This commit is contained in:
@@ -94,14 +94,14 @@ impl BitUnpacker {
|
|||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
|
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
|
||||||
let addr_in_bits = idx * self.num_bits;
|
let addr_in_bits = idx as usize * self.num_bits as usize;
|
||||||
let addr = (addr_in_bits >> 3) as usize;
|
let addr = addr_in_bits >> 3;
|
||||||
if addr + 8 > data.len() {
|
if addr + 8 > data.len() {
|
||||||
if self.num_bits == 0 {
|
if self.num_bits == 0 {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
let bit_shift = addr_in_bits & 7;
|
let bit_shift = addr_in_bits & 7;
|
||||||
return self.get_slow_path(addr, bit_shift, data);
|
return self.get_slow_path(addr, bit_shift as u32, data);
|
||||||
}
|
}
|
||||||
let bit_shift = addr_in_bits & 7;
|
let bit_shift = addr_in_bits & 7;
|
||||||
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
|
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
|
||||||
|
|||||||
18
columnar/columnar-cli-inspect/Cargo.toml
Normal file
18
columnar/columnar-cli-inspect/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[package]
|
||||||
|
name = "tantivy-columnar-inspect"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
tantivy = {path="../..", package="tantivy"}
|
||||||
|
columnar = {path="../", package="tantivy-columnar"}
|
||||||
|
common = {path="../../common", package="tantivy-common"}
|
||||||
|
|
||||||
|
[workspace]
|
||||||
|
members = []
|
||||||
|
|
||||||
|
[profile.release]
|
||||||
|
debug = true
|
||||||
|
#debug-assertions = true
|
||||||
|
#overflow-checks = true
|
||||||
54
columnar/columnar-cli-inspect/src/main.rs
Normal file
54
columnar/columnar-cli-inspect/src/main.rs
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
use columnar::ColumnarReader;
|
||||||
|
use common::file_slice::{FileSlice, WrapFile};
|
||||||
|
use std::io;
|
||||||
|
use std::path::Path;
|
||||||
|
use tantivy::directory::footer::Footer;
|
||||||
|
|
||||||
|
fn main() -> io::Result<()> {
|
||||||
|
println!("Opens a columnar file written by tantivy and validates it.");
|
||||||
|
let path = std::env::args().nth(1).unwrap();
|
||||||
|
|
||||||
|
let path = Path::new(&path);
|
||||||
|
println!("Reading {:?}", path);
|
||||||
|
let _reader = open_and_validate_columnar(path.to_str().unwrap())?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn validate_columnar_reader(reader: &ColumnarReader) {
|
||||||
|
let num_rows = reader.num_rows();
|
||||||
|
println!("num_rows: {}", num_rows);
|
||||||
|
let columns = reader.list_columns().unwrap();
|
||||||
|
println!("num columns: {:?}", columns.len());
|
||||||
|
for (col_name, dynamic_column_handle) in columns {
|
||||||
|
let col = dynamic_column_handle.open().unwrap();
|
||||||
|
match col {
|
||||||
|
columnar::DynamicColumn::Bool(_)
|
||||||
|
| columnar::DynamicColumn::I64(_)
|
||||||
|
| columnar::DynamicColumn::U64(_)
|
||||||
|
| columnar::DynamicColumn::F64(_)
|
||||||
|
| columnar::DynamicColumn::IpAddr(_)
|
||||||
|
| columnar::DynamicColumn::DateTime(_)
|
||||||
|
| columnar::DynamicColumn::Bytes(_) => {}
|
||||||
|
columnar::DynamicColumn::Str(str_column) => {
|
||||||
|
let num_vals = str_column.ords().values.num_vals();
|
||||||
|
let num_terms_dict = str_column.num_terms() as u64;
|
||||||
|
let max_ord = str_column.ords().values.iter().max().unwrap_or_default();
|
||||||
|
println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
|
||||||
|
for ord in str_column.ords().values.iter() {
|
||||||
|
assert!(ord < num_terms_dict);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Opens a columnar file that was written by tantivy and validates it.
|
||||||
|
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
|
||||||
|
let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
|
||||||
|
let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
|
||||||
|
let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap();
|
||||||
|
let reader = ColumnarReader::open(slice).unwrap();
|
||||||
|
validate_columnar_reader(&reader);
|
||||||
|
Ok(reader)
|
||||||
|
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::ops::{Deref, Range, RangeBounds};
|
use std::ops::{Deref, Range, RangeBounds};
|
||||||
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::{fmt, io};
|
use std::{fmt, io};
|
||||||
|
|
||||||
@@ -177,6 +178,12 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl FileSlice {
|
impl FileSlice {
|
||||||
|
/// Creates a FileSlice from a path.
|
||||||
|
pub fn open(path: &Path) -> io::Result<FileSlice> {
|
||||||
|
let wrap_file = WrapFile::new(File::open(path)?)?;
|
||||||
|
Ok(FileSlice::new(Arc::new(wrap_file)))
|
||||||
|
}
|
||||||
|
|
||||||
/// Wraps a FileHandle.
|
/// Wraps a FileHandle.
|
||||||
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
|
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
|
||||||
let num_bytes = file_handle.len();
|
let num_bytes = file_handle.len();
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
//! The footer is a small metadata structure that is appended at the end of every file.
|
||||||
|
//!
|
||||||
|
//! The footer is used to store a checksum of the file content.
|
||||||
|
//! The footer also stores the version of the index format.
|
||||||
|
//! This version is used to detect incompatibility between the index and the library version.
|
||||||
|
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
|
|
||||||
@@ -20,20 +26,22 @@ type CrcHashU32 = u32;
|
|||||||
/// A Footer is appended to every file
|
/// A Footer is appended to every file
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||||
pub struct Footer {
|
pub struct Footer {
|
||||||
|
/// The version of the index format
|
||||||
pub version: Version,
|
pub version: Version,
|
||||||
|
/// The crc32 hash of the body
|
||||||
pub crc: CrcHashU32,
|
pub crc: CrcHashU32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Footer {
|
impl Footer {
|
||||||
pub fn new(crc: CrcHashU32) -> Self {
|
pub(crate) fn new(crc: CrcHashU32) -> Self {
|
||||||
let version = crate::VERSION.clone();
|
let version = crate::VERSION.clone();
|
||||||
Footer { version, crc }
|
Footer { version, crc }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn crc(&self) -> CrcHashU32 {
|
pub(crate) fn crc(&self) -> CrcHashU32 {
|
||||||
self.crc
|
self.crc
|
||||||
}
|
}
|
||||||
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
|
pub(crate) fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
|
||||||
let mut counting_write = CountingWriter::wrap(&mut write);
|
let mut counting_write = CountingWriter::wrap(&mut write);
|
||||||
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
|
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
|
||||||
let footer_payload_len = counting_write.written_bytes();
|
let footer_payload_len = counting_write.written_bytes();
|
||||||
@@ -42,6 +50,7 @@ impl Footer {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extracts the tantivy Footer from the file and returns the footer and the rest of the file
|
||||||
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
|
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
|
||||||
if file.len() < 4 {
|
if file.len() < 4 {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ mod mmap_directory;
|
|||||||
mod directory;
|
mod directory;
|
||||||
mod directory_lock;
|
mod directory_lock;
|
||||||
mod file_watcher;
|
mod file_watcher;
|
||||||
mod footer;
|
pub mod footer;
|
||||||
mod managed_directory;
|
mod managed_directory;
|
||||||
mod ram_directory;
|
mod ram_directory;
|
||||||
mod watch_event_router;
|
mod watch_event_router;
|
||||||
|
|||||||
Reference in New Issue
Block a user