From 431c187a604f9c33e6d92a4a7f58cd588a27cb8d Mon Sep 17 00:00:00 2001 From: petr-tik Date: Sat, 14 Dec 2019 00:14:33 +0000 Subject: [PATCH] Make error handling richer in Footer::is_compatible (#724) * WIP implemented is_compatible hide Footer::from_bytes from public consumption - only found Footer::extract used outside the module Add a new error type for IncompatibleIndex add a prototypical call to footer.is_compatible() in ManagedDirectory::open_read to make sure we error before reading it further * Make error handling more ergonomic Add an error subtype for OpenReadError and converters to TantivyError * Remove an unnecessary assert it's follower by the same check that Errors instead of panicking * Correct the compatibility check logic Leave a defensive versioned footer check to make sure we add new logic handling when we add possible footer versions Restricted VersionedFooter::from_bytes to be used inside the crate only remove a half-baked test * WIP. * Return an error if index incompatible - closes #662 Enrich the error type with incompatibility Change return type to Result, instead of bool Add an Incompatibility enum that enriches the IncompatibleIndex error variant with information, which then allows us to generate a developer-friendly hint how to upgrade library version or switch feature flags for a different compression algorithm Updated changelog Change the signature of is_compatible Added documentation to the Incompatibility Added a conditional test on a Footer with lz4 erroring --- CHANGELOG.md | 4 +- src/directory/error.rs | 84 +++++- src/directory/footer.rs | 410 +++++++++++++++++------------ src/directory/managed_directory.rs | 14 +- src/directory/mod.rs | 7 + src/directory/read_only_source.rs | 6 + src/error.rs | 8 +- src/lib.rs | 68 ++++- src/positions/reader.rs | 7 +- src/store/compression_lz4.rs | 7 +- src/store/compression_snap.rs | 5 + src/store/mod.rs | 8 +- 12 files changed, 423 insertions(+), 205 deletions(-) mode change 100755 => 100644 src/lib.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 0db60c6ec..94be9ef3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Tantivy 0.11.0 - API change around `Box`. See detail in #629 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock) - Add footer with some metadata to index files. #605 (@fdb-hiroshima) +- Add a method to check the compatibility of the footer in the index with the running version of tantivy (@petr-tik) - TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock) - Added handling of pre-tokenized text fields (#642), which will enable users to load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara) @@ -16,10 +17,11 @@ Tantivy 0.11.0 ## How to update? +- The index format is changed. You are required to reindex your data to use tantivy 0.11. - `Box` has been replaced by a `BoxedTokenizer` struct. - Regex are now compiled when the `RegexQuery` instance is built. As a result, it can now return an error and handling the `Result` is required. - +- `tantivy::version()` now returns a `Version` object. This object implements `ToString()` Tantivy 0.10.2 ===================== diff --git a/src/directory/error.rs b/src/directory/error.rs index 4cc509443..9a3ff44eb 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -1,3 +1,4 @@ +use crate::Version; use std::error::Error as StdError; use std::fmt; use std::io; @@ -156,6 +157,65 @@ impl StdError for OpenWriteError { } } +/// Type of index incompatibility between the library and the index found on disk +/// Used to catch and provide a hint to solve this incompatibility issue +pub enum Incompatibility { + /// This library cannot decompress the index found on disk + CompressionMismatch { + /// Compression algorithm used by the current version of tantivy + library_compression_format: String, + /// Compression algorithm that was used to serialise the index + index_compression_format: String, + }, + /// The index format found on disk isn't supported by this version of the library + IndexMismatch { + /// Version used by the library + library_version: Version, + /// Version the index was built with + index_version: Version, + }, +} + +impl fmt::Debug for Incompatibility { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + match self { + Incompatibility::CompressionMismatch { + library_compression_format, + index_compression_format, + } => { + let err = format!( + "Library was compiled with {:?} compression, index was compressed with {:?}", + library_compression_format, index_compression_format + ); + let advice = format!( + "Change the feature flag to {:?} and rebuild the library", + index_compression_format + ); + write!(f, "{}. {}", err, advice)?; + } + Incompatibility::IndexMismatch { + library_version, + index_version, + } => { + let err = format!( + "Library version: {}, index version: {}", + library_version.index_format_version, index_version.index_format_version + ); + // TODO make a more useful error message + // include the version range that supports this index_format_version + let advice = format!( + "Change tantivy to a version compatible with index format {} (e.g. {}.{}.x) \ + and rebuild your project.", + index_version.index_format_version, index_version.major, index_version.minor + ); + write!(f, "{}. {}", err, advice)?; + } + } + + Ok(()) + } +} + /// Error that may occur when accessing a file read #[derive(Debug)] pub enum OpenReadError { @@ -164,6 +224,8 @@ pub enum OpenReadError { /// Any kind of IO error that happens when /// interacting with the underlying IO device. IOError(IOError), + /// This library doesn't support the index version found on disk + IncompatibleIndex(Incompatibility), } impl From for OpenReadError { @@ -183,19 +245,9 @@ impl fmt::Display for OpenReadError { "an io error occurred while opening a file for reading: '{}'", err ), - } - } -} - -impl StdError for OpenReadError { - fn description(&self) -> &str { - "error occurred while opening a file for reading" - } - - fn cause(&self) -> Option<&dyn StdError> { - match *self { - OpenReadError::FileDoesNotExist(_) => None, - OpenReadError::IOError(ref err) => Some(err), + OpenReadError::IncompatibleIndex(ref footer) => { + write!(f, "Incompatible index format: {:?}", footer) + } } } } @@ -216,6 +268,12 @@ impl From for DeleteError { } } +impl From for OpenReadError { + fn from(incompatibility: Incompatibility) -> Self { + OpenReadError::IncompatibleIndex(incompatibility) + } +} + impl fmt::Display for DeleteError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { diff --git a/src/directory/footer.rs b/src/directory/footer.rs index 1cfe911e0..c1f471788 100644 --- a/src/directory/footer.rs +++ b/src/directory/footer.rs @@ -1,181 +1,175 @@ +use crate::common::{BinarySerializable, CountingWriter, FixedSize, VInt}; +use crate::directory::error::Incompatibility; use crate::directory::read_only_source::ReadOnlySource; use crate::directory::{AntiCallToken, TerminatingWrite}; -use byteorder::{ByteOrder, LittleEndian}; +use crate::Version; +use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; use crc32fast::Hasher; use std::io; use std::io::Write; -const COMMON_FOOTER_SIZE: usize = 4 * 5; - type CrcHashU32 = u32; #[derive(Debug, Clone, PartialEq)] pub struct Footer { - pub tantivy_version: (u32, u32, u32), + pub version: Version, pub meta: String, pub versioned_footer: VersionedFooter, } +/// Serialises the footer to a byte-array +/// - versioned_footer_len : 4 bytes +///- versioned_footer: variable bytes +/// - meta_len: 4 bytes +/// - meta: variable bytes +/// - version_len: 4 bytes +/// - version json: variable bytes +impl BinarySerializable for Footer { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + BinarySerializable::serialize(&self.versioned_footer, writer)?; + BinarySerializable::serialize(&self.meta, writer)?; + let version_string = + serde_json::to_string(&self.version).map_err(|_err| io::ErrorKind::InvalidInput)?; + BinarySerializable::serialize(&version_string, writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let versioned_footer = VersionedFooter::deserialize(reader)?; + let meta = String::deserialize(reader)?; + let version_json = String::deserialize(reader)?; + let version = serde_json::from_str(&version_json)?; + Ok(Footer { + version, + meta, + versioned_footer, + }) + } +} + impl Footer { pub fn new(versioned_footer: VersionedFooter) -> Self { - let tantivy_version = ( - env!("CARGO_PKG_VERSION_MAJOR").parse().unwrap(), - env!("CARGO_PKG_VERSION_MINOR").parse().unwrap(), - env!("CARGO_PKG_VERSION_PATCH").parse().unwrap(), - ); + let version = crate::VERSION.clone(); + let meta = version.to_string(); Footer { - tantivy_version, - meta: format!( - "tantivy v{}.{}.{}, index_format v{}", - tantivy_version.0, - tantivy_version.1, - tantivy_version.2, - versioned_footer.version() - ), + version, + meta, versioned_footer, } } - /// Serialises the footer to a byte-array - /// [ versioned_footer | meta | common_footer ] - /// [ 0..8 | 8..32 | 32..52 ] - pub fn to_bytes(&self) -> Vec { - let mut res = self.versioned_footer.to_bytes(); - res.extend_from_slice(self.meta.as_bytes()); - let len = res.len(); - res.resize(len + COMMON_FOOTER_SIZE, 0); - let mut common_footer = &mut res[len..]; - LittleEndian::write_u32(&mut common_footer, self.meta.len() as u32); - LittleEndian::write_u32(&mut common_footer[4..], self.tantivy_version.0); - LittleEndian::write_u32(&mut common_footer[8..], self.tantivy_version.1); - LittleEndian::write_u32(&mut common_footer[12..], self.tantivy_version.2); - LittleEndian::write_u32(&mut common_footer[16..], (len + COMMON_FOOTER_SIZE) as u32); - res - } - - pub fn from_bytes(data: &[u8]) -> Result { - let len = data.len(); - if len < COMMON_FOOTER_SIZE + 4 { - // 4 bytes for index version, stored in versioned footer - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - format!("File corrupted. The footer len must be over 24, while the entire file len is {}", len) - ) - ); - } - - let size = LittleEndian::read_u32(&data[len - 4..]) as usize; - if len < size as usize { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - format!( - "The footer len is {}, while the entire file len is {}. \ - Your index is either corrupted or was built using a tantivy version\ - anterior to 0.11.", - size, len - ), - )); - } - let footer = &data[len - size as usize..]; - let meta_len = LittleEndian::read_u32(&footer[size - COMMON_FOOTER_SIZE..]) as usize; - let tantivy_major = LittleEndian::read_u32(&footer[size - 16..]); - let tantivy_minor = LittleEndian::read_u32(&footer[size - 12..]); - let tantivy_patch = LittleEndian::read_u32(&footer[size - 8..]); - Ok(Footer { - tantivy_version: (tantivy_major, tantivy_minor, tantivy_patch), - meta: String::from_utf8_lossy( - &footer[size - meta_len - COMMON_FOOTER_SIZE..size - COMMON_FOOTER_SIZE], - ) - .into_owned(), - versioned_footer: VersionedFooter::from_bytes( - &footer[..size - meta_len - COMMON_FOOTER_SIZE], - )?, - }) + pub fn append_footer(&self, mut write: &mut W) -> io::Result<()> { + let mut counting_write = CountingWriter::wrap(&mut write); + self.serialize(&mut counting_write)?; + let written_len = counting_write.written_bytes(); + write.write_u32::(written_len as u32)?; + Ok(()) } pub fn extract_footer(source: ReadOnlySource) -> Result<(Footer, ReadOnlySource), io::Error> { - let footer = Footer::from_bytes(source.as_slice())?; - let reader = source.slice_to(source.as_slice().len() - footer.size()); - Ok((footer, reader)) + if source.len() < 4 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "File corrupted. The file is smaller than 4 bytes (len={}).", + source.len() + ), + )); + } + let (body_footer, footer_len_bytes) = source.split_from_end(u32::SIZE_IN_BYTES); + let footer_len = LittleEndian::read_u32(footer_len_bytes.as_slice()) as usize; + let body_len = body_footer.len() - footer_len; + let (body, footer_data) = body_footer.split(body_len); + let mut cursor = footer_data.as_slice(); + let footer = Footer::deserialize(&mut cursor)?; + Ok((footer, body)) } - pub fn size(&self) -> usize { - self.versioned_footer.size() as usize + self.meta.len() + COMMON_FOOTER_SIZE + /// Confirms that the index will be read correctly by this version of tantivy + /// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory + pub fn is_compatible(&self) -> Result<(), Incompatibility> { + let library_version = crate::version(); + match &self.versioned_footer { + VersionedFooter::V1 { + crc32: _crc, + store_compression: compression, + } => { + if &library_version.store_compression != compression { + return Err(Incompatibility::CompressionMismatch { + library_compression_format: library_version.store_compression.to_string(), + index_compression_format: compression.to_string(), + }); + } + Ok(()) + } + VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch { + library_version: library_version.clone(), + index_version: self.version.clone(), + }), + } } } /// Footer that includes a crc32 hash that enables us to checksum files in the index #[derive(Debug, Clone, PartialEq)] pub enum VersionedFooter { - UnknownVersion { version: u32, size: u32 }, - V0(CrcHashU32), // crc + UnknownVersion, + V1 { + crc32: CrcHashU32, + store_compression: String, + }, +} + +impl BinarySerializable for VersionedFooter { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + let mut buf = Vec::new(); + match self { + VersionedFooter::V1 { + crc32, + store_compression: compression, + } => { + // Serializes a valid `VersionedFooter` or panics if the version is unknown + // [ version | crc_hash | compression_mode ] + // [ 0..4 | 4..8 | variable ] + BinarySerializable::serialize(&1u32, &mut buf)?; + BinarySerializable::serialize(crc32, &mut buf)?; + BinarySerializable::serialize(compression, &mut buf)?; + } + VersionedFooter::UnknownVersion => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Cannot serialize an unknown versioned footer ", + )); + } + } + BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?; + writer.write_all(&buf[..])?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let len = VInt::deserialize(reader)?.0 as usize; + let mut buf = vec![0u8; len]; + reader.read_exact(&mut buf[..])?; + let mut cursor = &buf[..]; + let version = u32::deserialize(&mut cursor)?; + if version == 1 { + let crc32 = u32::deserialize(&mut cursor)?; + let compression = String::deserialize(&mut cursor)?; + Ok(VersionedFooter::V1 { + crc32, + store_compression: compression, + }) + } else { + Ok(VersionedFooter::UnknownVersion) + } + } } impl VersionedFooter { - /// Serializes a valid `VersionedFooter` or panics if the version is unknown - /// [ version | crc_hash ] - /// [ 0..4 | 4..8 ] - pub fn to_bytes(&self) -> Vec { - match self { - VersionedFooter::V0(crc) => { - let mut buf = [0u8; 8]; - LittleEndian::write_u32(&mut buf[0..4], 0); - LittleEndian::write_u32(&mut buf[4..8], *crc); - buf.to_vec() - } - VersionedFooter::UnknownVersion { .. } => { - panic!("Unsupported index should never get serialized"); - } - } - } - - pub fn from_bytes(footer: &[u8]) -> Result { - assert!(footer.len() >= 4); - if footer.len() < 4 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "Footer should be more than 4 bytes.", - )); - } - let version = LittleEndian::read_u32(footer); - match version { - // the first 4 bytes should be zeroed out thus returning a `0` - 0 => { - if footer.len() != 8 { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - format!( - "File corrupted. The versioned footer len is {}, while it should be 8", - footer.len() - ), - )); - } - Ok(VersionedFooter::V0(LittleEndian::read_u32(&footer[4..]))) - } - version => Ok(VersionedFooter::UnknownVersion { - version, - size: footer.len() as u32, - }), - } - } - - pub fn size(&self) -> u32 { - match self { - VersionedFooter::V0(_) => 8, - VersionedFooter::UnknownVersion { size, .. } => *size, - } - } - - pub fn version(&self) -> u32 { - match self { - VersionedFooter::V0(_) => 0, - VersionedFooter::UnknownVersion { version, .. } => *version, - } - } - pub fn crc(&self) -> Option { match self { - VersionedFooter::V0(crc) => Some(*crc), + VersionedFooter::V1 { crc32, .. } => Some(*crc32), VersionedFooter::UnknownVersion { .. } => None, } } @@ -211,10 +205,13 @@ impl Write for FooterProxy { impl TerminatingWrite for FooterProxy { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { - let crc = self.hasher.take().unwrap().finalize(); - let footer = Footer::new(VersionedFooter::V0(crc)).to_bytes(); + let crc32 = self.hasher.take().unwrap().finalize(); + let footer = Footer::new(VersionedFooter::V1 { + crc32, + store_compression: crate::store::COMPRESSION.to_string(), + }); let mut writer = self.writer.take().unwrap(); - writer.write_all(&footer)?; + footer.append_footer(&mut writer)?; writer.terminate() } } @@ -222,56 +219,121 @@ impl TerminatingWrite for FooterProxy { #[cfg(test)] mod tests { + use super::CrcHashU32; + use super::FooterProxy; + use crate::common::BinarySerializable; use crate::directory::footer::{Footer, VersionedFooter}; + use crate::directory::TerminatingWrite; + use byteorder::{ByteOrder, LittleEndian}; use regex::Regex; + #[test] + fn test_versioned_footer() { + let mut vec = Vec::new(); + let footer_proxy = FooterProxy::new(&mut vec); + assert!(footer_proxy.terminate().is_ok()); + assert_eq!(vec.len(), 167); + let footer = Footer::deserialize(&mut &vec[..]).unwrap(); + if let VersionedFooter::V1 { + crc32: _, + store_compression, + } = footer.versioned_footer + { + assert_eq!(store_compression, crate::store::COMPRESSION); + } else { + panic!("Versioned footer should be V1."); + } + assert_eq!(&footer.version, crate::version()); + } + #[test] fn test_serialize_deserialize_footer() { - let crc = 123456; - let footer = Footer::new(VersionedFooter::V0(crc)); - let footer_bytes = footer.to_bytes(); - assert_eq!(Footer::from_bytes(&footer_bytes).unwrap(), footer); + let mut buffer = Vec::new(); + let crc32 = 123456u32; + let footer: Footer = Footer::new(VersionedFooter::V1 { + crc32, + store_compression: "lz4".to_string(), + }); + footer.serialize(&mut buffer).unwrap(); + let footer_deser = Footer::deserialize(&mut &buffer[..]).unwrap(); + assert_eq!(footer_deser, footer); } #[test] fn footer_length() { - // test to make sure the ascii art in the doc-strings is correct - let crc = 1111111 as u32; - let versioned_footer = VersionedFooter::V0(crc); - assert_eq!(versioned_footer.size(), 8); + let crc32 = 1111111u32; + let versioned_footer = VersionedFooter::V1 { + crc32, + store_compression: "lz4".to_string(), + }; + let mut buf = Vec::new(); + versioned_footer.serialize(&mut buf).unwrap(); + assert_eq!(buf.len(), 13); let footer = Footer::new(versioned_footer); let regex_ptn = Regex::new( "tantivy v[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.{0,10}, index_format v[0-9]{1,5}", ) .unwrap(); - assert!(regex_ptn.find(&footer.meta).is_some()); + assert!(regex_ptn.is_match(&footer.meta)); } #[test] fn versioned_footer_from_bytes() { - use byteorder::{ByteOrder, LittleEndian}; - let v_footer_bytes = vec![0, 0, 0, 0, 12, 35, 89, 18]; - let versioned_footer = VersionedFooter::from_bytes(&v_footer_bytes).unwrap(); - let expected_versioned_footer = - VersionedFooter::V0(LittleEndian::read_u32(&[12, 35, 89, 18])); - assert_eq!(versioned_footer, expected_versioned_footer); - - assert_eq!(versioned_footer.to_bytes(), v_footer_bytes); - } - - #[should_panic(expected = "Unsupported index should never get serialized")] - #[test] - fn versioned_footer_panic() { - use byteorder::{ByteOrder, LittleEndian}; - let v_footer_bytes = vec![1; 8]; - let versioned_footer = VersionedFooter::from_bytes(&v_footer_bytes).unwrap(); - let expected_version = LittleEndian::read_u32(&[1, 1, 1, 1]); - let expected_versioned_footer = VersionedFooter::UnknownVersion { - version: expected_version, - size: v_footer_bytes.len() as u32, + let v_footer_bytes = vec![ + // versionned footer length + 12 | 128, + // index format version + 1, + 0, + 0, + 0, + // crc 32 + 12, + 35, + 89, + 18, + // compression format + 3 | 128, + b'l', + b'z', + b'4', + ]; + let mut cursor = &v_footer_bytes[..]; + let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap(); + assert!(cursor.is_empty()); + let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32; + let expected_versioned_footer: VersionedFooter = VersionedFooter::V1 { + crc32: expected_crc, + store_compression: "lz4".to_string(), }; assert_eq!(versioned_footer, expected_versioned_footer); + let mut buffer = Vec::new(); + assert!(versioned_footer.serialize(&mut buffer).is_ok()); + assert_eq!(&v_footer_bytes[..], &buffer[..]); + } - versioned_footer.to_bytes(); + #[test] + fn versioned_footer_panic() { + let v_footer_bytes = vec![6u8 | 128u8, 3u8, 0u8, 0u8, 1u8, 0u8, 0u8]; + let mut b = &v_footer_bytes[..]; + let versioned_footer = VersionedFooter::deserialize(&mut b).unwrap(); + assert!(b.is_empty()); + let expected_versioned_footer = VersionedFooter::UnknownVersion; + assert_eq!(versioned_footer, expected_versioned_footer); + let mut buf = Vec::new(); + assert!(versioned_footer.serialize(&mut buf).is_err()); + } + + #[test] + #[cfg(not(feature = "lz4"))] + fn compression_mismatch() { + let crc32 = 1111111u32; + let versioned_footer = VersionedFooter::V1 { + crc32, + store_compression: "lz4".to_string(), + }; + let footer = Footer::new(versioned_footer); + let res = footer.is_compatible(); + assert!(res.is_err()); } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 8351136f7..1874119d5 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -9,7 +9,7 @@ use crate::directory::{ReadOnlySource, WritePtr}; use crate::directory::{WatchCallback, WatchHandle}; use crate::error::DataCorruption; use crate::Directory; -use crate::Result; + use crc32fast::Hasher; use serde_json; use std::collections::HashSet; @@ -65,7 +65,7 @@ fn save_managed_paths( impl ManagedDirectory { /// Wraps a directory as managed directory. - pub fn wrap(directory: Dir) -> Result { + pub fn wrap(directory: Dir) -> crate::Result { match directory.atomic_read(&MANAGED_FILEPATH) { Ok(data) => { let managed_files_json = String::from_utf8_lossy(&data); @@ -88,6 +88,11 @@ impl ManagedDirectory { meta_informations: Arc::default(), }), Err(OpenReadError::IOError(e)) => Err(From::from(e)), + Err(OpenReadError::IncompatibleIndex(incompatibility)) => { + // For the moment, this should never happen `meta.json` + // do not have any footer and cannot detect incompatibility. + Err(crate::TantivyError::IncompatibleIndex(incompatibility)) + } } } @@ -261,8 +266,9 @@ impl ManagedDirectory { impl Directory for ManagedDirectory { fn open_read(&self, path: &Path) -> result::Result { let read_only_source = self.directory.open_read(path)?; - let (_footer, reader) = Footer::extract_footer(read_only_source) + let (footer, reader) = Footer::extract_footer(read_only_source) .map_err(|err| IOError::with_path(path.to_path_buf(), err))?; + footer.is_compatible()?; Ok(reader) } @@ -409,6 +415,8 @@ mod tests_mmap_specific { write.write_all(&[3u8, 4u8, 5u8]).unwrap(); write.terminate().unwrap(); + let read_source = managed_directory.open_read(test_path2).unwrap(); + assert_eq!(read_source.as_slice(), &[3u8, 4u8, 5u8]); assert!(managed_directory.list_damaged().unwrap().is_empty()); let mut corrupted_path = tempdir_path.clone(); diff --git a/src/directory/mod.rs b/src/directory/mod.rs index ceabbc3cc..c7a836909 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -81,6 +81,13 @@ impl TerminatingWrite for BufWriter { } } +#[cfg(test)] +impl<'a> TerminatingWrite for &'a mut Vec { + fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> { + self.flush() + } +} + /// Write object for Directory. /// /// `WritePtr` are required to implement both Write diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index 1a49fc0aa..9949b9e77 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -70,6 +70,12 @@ impl ReadOnlySource { (left, right) } + /// Splits into 2 `ReadOnlySource`, at the offset `end - right_len`. + pub fn split_from_end(self, right_len: usize) -> (ReadOnlySource, ReadOnlySource) { + let left_len = self.len() - right_len; + self.split(left_len) + } + /// Creates a ReadOnlySource that is just a /// view over a slice of the data. /// diff --git a/src/error.rs b/src/error.rs index ac6d96216..c4752141b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,8 +2,8 @@ use std::io; -use crate::directory::error::LockError; use crate::directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError}; +use crate::directory::error::{Incompatibility, LockError}; use crate::fastfield::FastFieldNotAvailableError; use crate::query; use crate::schema; @@ -80,6 +80,9 @@ pub enum TantivyError { /// System error. (e.g.: We failed spawning a new thread) #[fail(display = "System error.'{}'", _0)] SystemError(String), + /// Index incompatible with current version of tantivy + #[fail(display = "{:?}", _0)] + IncompatibleIndex(Incompatibility), } impl From for TantivyError { @@ -129,6 +132,9 @@ impl From for TantivyError { match error { OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath), OpenReadError::IOError(io_error) => TantivyError::IOError(io_error), + OpenReadError::IncompatibleIndex(incompatibility) => { + TantivyError::IncompatibleIndex(incompatibility) + } } } } diff --git a/src/lib.rs b/src/lib.rs old mode 100755 new mode 100644 index e0328d200..de3f2705a --- a/src/lib.rs +++ b/src/lib.rs @@ -160,7 +160,6 @@ pub use self::snippet::{Snippet, SnippetGenerator}; mod docset; pub use self::docset::{DocSet, SkipResult}; - pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64}; pub use crate::core::SegmentComponent; pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta}; @@ -170,11 +169,58 @@ pub use crate::indexer::IndexWriter; pub use crate::postings::Postings; pub use crate::reader::LeasedItem; pub use crate::schema::{Document, Term}; +use std::fmt; -/// Expose the current version of tantivy, as well -/// whether it was compiled with the simd compression. -pub fn version() -> &'static str { - env!("CARGO_PKG_VERSION") +use once_cell::sync::Lazy; + +/// Index format version. +const INDEX_FORMAT_VERSION: u32 = 1; + +/// Structure version for the index. +#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Version { + major: u32, + minor: u32, + patch: u32, + index_format_version: u32, + store_compression: String, +} + +impl fmt::Debug for Version { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.to_string()) + } +} + +static VERSION: Lazy = Lazy::new(|| Version { + major: env!("CARGO_PKG_VERSION_MAJOR").parse().unwrap(), + minor: env!("CARGO_PKG_VERSION_MINOR").parse().unwrap(), + patch: env!("CARGO_PKG_VERSION_PATCH").parse().unwrap(), + index_format_version: INDEX_FORMAT_VERSION, + store_compression: crate::store::COMPRESSION.to_string(), +}); + +impl ToString for Version { + fn to_string(&self) -> String { + format!( + "tantivy v{}.{}.{}, index_format v{}, store_compression: {}", + self.major, self.minor, self.patch, self.index_format_version, self.store_compression + ) + } +} + +static VERSION_STRING: Lazy = Lazy::new(|| VERSION.to_string()); + +/// Expose the current version of tantivy as found in Cargo.toml during compilation. +/// eg. "0.11.0" as well as the compression scheme used in the docstore. +pub fn version() -> &'static Version { + &VERSION +} + +/// Exposes the complete version of tantivy as found in Cargo.toml during compilation as a string. +/// eg. "tantivy v0.11.0, index_format v1, store_compression: lz4". +pub fn version_string() -> &'static str { + VERSION_STRING.as_str() } /// Defines tantivy's merging strategy @@ -287,6 +333,18 @@ mod tests { sample_with_seed(n, ratio, 4) } + #[test] + #[cfg(not(feature = "lz4"))] + fn test_version_string() { + use regex::Regex; + let regex_ptn = Regex::new( + "tantivy v[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.{0,10}, index_format v[0-9]{1,5}", + ) + .unwrap(); + let version = super::version().to_string(); + assert!(regex_ptn.find(&version).is_some()); + } + #[test] #[cfg(feature = "mmap")] fn test_indexing() { diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 5c845c0b3..6737f4bbc 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -36,11 +36,10 @@ struct Positions { impl Positions { pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions { - let skip_len = skip_source.len(); - let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES); + let (body, footer) = skip_source.split_from_end(u32::SIZE_IN_BYTES); let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted"); - let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize); - let (skip_source, long_skip_source) = body.split(body_split); + let (skip_source, long_skip_source) = + body.split_from_end(u64::SIZE_IN_BYTES * (num_long_skips as usize)); Positions { bit_packer: BitPacker4x::new(), skip_source, diff --git a/src/store/compression_lz4.rs b/src/store/compression_lz4.rs index 533985327..07a1c9127 100644 --- a/src/store/compression_lz4.rs +++ b/src/store/compression_lz4.rs @@ -1,7 +1,10 @@ -extern crate lz4; - use std::io::{self, Read, Write}; +/// Name of the compression scheme used in the doc store. +/// +/// This name is appended to the version string of tantivy. +pub const COMPRESSION: &'static str = "lz4"; + pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { compressed.clear(); let mut encoder = lz4::EncoderBuilder::new().build(compressed)?; diff --git a/src/store/compression_snap.rs b/src/store/compression_snap.rs index b5cc2ded9..6a3182997 100644 --- a/src/store/compression_snap.rs +++ b/src/store/compression_snap.rs @@ -2,6 +2,11 @@ use snap; use std::io::{self, Read, Write}; +/// Name of the compression scheme used in the doc store. +/// +/// This name is appended to the version string of tantivy. +pub const COMPRESSION: &str = "snappy"; + pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { compressed.clear(); let mut encoder = snap::Writer::new(compressed); diff --git a/src/store/mod.rs b/src/store/mod.rs index 4f1347654..bb15301b7 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -42,12 +42,16 @@ pub use self::writer::StoreWriter; #[cfg(feature = "lz4")] mod compression_lz4; #[cfg(feature = "lz4")] -use self::compression_lz4::*; +pub use self::compression_lz4::COMPRESSION; +#[cfg(feature = "lz4")] +use self::compression_lz4::{compress, decompress}; #[cfg(not(feature = "lz4"))] mod compression_snap; #[cfg(not(feature = "lz4"))] -use self::compression_snap::*; +pub use self::compression_snap::COMPRESSION; +#[cfg(not(feature = "lz4"))] +use self::compression_snap::{compress, decompress}; #[cfg(test)] pub mod tests {