mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 09:12:55 +00:00
Compare commits
14 Commits
petr-tik-n
...
0.11.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
87120acf7c | ||
|
|
401f74f7ae | ||
|
|
03d31f6713 | ||
|
|
a57faf07f6 | ||
|
|
562ea9a839 | ||
|
|
cf92cc1ada | ||
|
|
f6000aece7 | ||
|
|
2b3fe3a2b5 | ||
|
|
0fde90faac | ||
|
|
5838644b03 | ||
|
|
c0011edd05 | ||
|
|
431c187a60 | ||
|
|
392abec420 | ||
|
|
dfbe337fe2 |
15
CHANGELOG.md
15
CHANGELOG.md
@@ -1,3 +1,17 @@
|
||||
Tantivy 0.11.3
|
||||
=======================
|
||||
- Fixed DateTime as a fast field (#735)
|
||||
|
||||
Tantivy 0.11.2
|
||||
=======================
|
||||
- The future returned by `IndexWriter::merge` does not borrow `self` mutably anymore (#732)
|
||||
- Exposing a constructor for `WatchHandle` (#731)
|
||||
|
||||
Tantivy 0.11.1
|
||||
=====================
|
||||
- Bug fix #729
|
||||
|
||||
|
||||
Tantivy 0.11.0
|
||||
=====================
|
||||
|
||||
@@ -9,6 +23,7 @@ Tantivy 0.11.0
|
||||
- API change around `Box<BoxableTokenizer>`. See detail in #629
|
||||
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
||||
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
|
||||
- Add a method to check the compatibility of the footer in the index with the running version of tantivy (@petr-tik)
|
||||
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
|
||||
- Added handling of pre-tokenized text fields (#642), which will enable users to
|
||||
load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.11.0"
|
||||
version = "0.11.3"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -33,7 +33,6 @@ fs2={version="0.4", optional=true}
|
||||
itertools = "0.8"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
notify = {version="4", optional=true}
|
||||
bit-set = "0.5"
|
||||
uuid = { version = "0.8", features = ["v4", "serde"] }
|
||||
crossbeam = "0.7"
|
||||
futures = {version = "0.3", features=["thread-pool"] }
|
||||
@@ -41,7 +40,7 @@ owning_ref = "0.4"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = { version="1.0" }
|
||||
tantivy-query-grammar = { path="./query-grammar" }
|
||||
tantivy-query-grammar = { version="0.11", path="./query-grammar" }
|
||||
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
|
||||
census = "0.4"
|
||||
fnv = "1.0.6"
|
||||
|
||||
3
query-grammar/README.md
Normal file
3
query-grammar/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
# Tantivy Query Grammar
|
||||
|
||||
This crate is used by tantivy to parse queries.
|
||||
@@ -119,7 +119,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Specifically, subsequent writes or flushes should
|
||||
/// have no effect on the returned `ReadOnlySource` object.
|
||||
///
|
||||
/// You should only use this to read files create with [`open_write`]
|
||||
/// You should only use this to read files create with [Directory::open_write].
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
|
||||
|
||||
/// Removes a file
|
||||
@@ -160,7 +160,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
///
|
||||
/// This should only be used for small files.
|
||||
///
|
||||
/// You should only use this to read files create with [`atomic_write`]
|
||||
/// You should only use this to read files create with [Directory::atomic_write].
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
|
||||
|
||||
/// Atomically replace the content of a file with data.
|
||||
@@ -197,7 +197,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Registers a callback that will be called whenever a change on the `meta.json`
|
||||
/// using the `atomic_write` API is detected.
|
||||
///
|
||||
/// The behavior when using `.watch()` on a file using `.open_write(...)` is, on the other
|
||||
/// The behavior when using `.watch()` on a file using [Directory::open_write] is, on the other
|
||||
/// hand, undefined.
|
||||
///
|
||||
/// The file will be watched for the lifetime of the returned `WatchHandle`. The caller is
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::directory::footer::Footer;
|
||||
use crate::Version;
|
||||
use std::error::Error as StdError;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
@@ -157,6 +157,65 @@ impl StdError for OpenWriteError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Type of index incompatibility between the library and the index found on disk
|
||||
/// Used to catch and provide a hint to solve this incompatibility issue
|
||||
pub enum Incompatibility {
|
||||
/// This library cannot decompress the index found on disk
|
||||
CompressionMismatch {
|
||||
/// Compression algorithm used by the current version of tantivy
|
||||
library_compression_format: String,
|
||||
/// Compression algorithm that was used to serialise the index
|
||||
index_compression_format: String,
|
||||
},
|
||||
/// The index format found on disk isn't supported by this version of the library
|
||||
IndexMismatch {
|
||||
/// Version used by the library
|
||||
library_version: Version,
|
||||
/// Version the index was built with
|
||||
index_version: Version,
|
||||
},
|
||||
}
|
||||
|
||||
impl fmt::Debug for Incompatibility {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match self {
|
||||
Incompatibility::CompressionMismatch {
|
||||
library_compression_format,
|
||||
index_compression_format,
|
||||
} => {
|
||||
let err = format!(
|
||||
"Library was compiled with {:?} compression, index was compressed with {:?}",
|
||||
library_compression_format, index_compression_format
|
||||
);
|
||||
let advice = format!(
|
||||
"Change the feature flag to {:?} and rebuild the library",
|
||||
index_compression_format
|
||||
);
|
||||
write!(f, "{}. {}", err, advice)?;
|
||||
}
|
||||
Incompatibility::IndexMismatch {
|
||||
library_version,
|
||||
index_version,
|
||||
} => {
|
||||
let err = format!(
|
||||
"Library version: {}, index version: {}",
|
||||
library_version.index_format_version, index_version.index_format_version
|
||||
);
|
||||
// TODO make a more useful error message
|
||||
// include the version range that supports this index_format_version
|
||||
let advice = format!(
|
||||
"Change tantivy to a version compatible with index format {} (e.g. {}.{}.x) \
|
||||
and rebuild your project.",
|
||||
index_version.index_format_version, index_version.major, index_version.minor
|
||||
);
|
||||
write!(f, "{}. {}", err, advice)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when accessing a file read
|
||||
#[derive(Debug)]
|
||||
pub enum OpenReadError {
|
||||
@@ -165,8 +224,8 @@ pub enum OpenReadError {
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
IOError(IOError),
|
||||
/// The version of tantivy trying to read the index doesn't support its format
|
||||
IncompatibleIndex(Footer),
|
||||
/// This library doesn't support the index version found on disk
|
||||
IncompatibleIndex(Incompatibility),
|
||||
}
|
||||
|
||||
impl From<IOError> for OpenReadError {
|
||||
@@ -193,20 +252,6 @@ impl fmt::Display for OpenReadError {
|
||||
}
|
||||
}
|
||||
|
||||
impl StdError for OpenReadError {
|
||||
fn description(&self) -> &str {
|
||||
"error occurred while opening a file for reading"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&dyn StdError> {
|
||||
match *self {
|
||||
OpenReadError::FileDoesNotExist(_) => None,
|
||||
OpenReadError::IOError(ref err) => Some(err),
|
||||
OpenReadError::IncompatibleIndex(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when trying to delete a file
|
||||
#[derive(Debug)]
|
||||
pub enum DeleteError {
|
||||
@@ -223,6 +268,12 @@ impl From<IOError> for DeleteError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Incompatibility> for OpenReadError {
|
||||
fn from(incompatibility: Incompatibility) -> Self {
|
||||
OpenReadError::IncompatibleIndex(incompatibility)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeleteError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::common::{BinarySerializable, CountingWriter, FixedSize, VInt};
|
||||
use crate::directory::error::Incompatibility;
|
||||
use crate::directory::read_only_source::ReadOnlySource;
|
||||
use crate::directory::{AntiCallToken, TerminatingWrite};
|
||||
use crate::Version;
|
||||
@@ -17,11 +18,11 @@ pub struct Footer {
|
||||
}
|
||||
|
||||
/// Serialises the footer to a byte-array
|
||||
/// - versioned_footer_len : 4bytes
|
||||
/// - versioned_footer_len : 4 bytes
|
||||
///- versioned_footer: variable bytes
|
||||
/// - meta_len: 4 bytes
|
||||
/// - meta: variable bytes
|
||||
/// - version_len: 4bytes
|
||||
/// - version_len: 4 bytes
|
||||
/// - version json: variable bytes
|
||||
impl BinarySerializable for Footer {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
@@ -86,16 +87,25 @@ impl Footer {
|
||||
|
||||
/// Confirms that the index will be read correctly by this version of tantivy
|
||||
/// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory
|
||||
pub fn is_compatible(&self) -> bool {
|
||||
let version = &*crate::VERSION;
|
||||
pub fn is_compatible(&self) -> Result<(), Incompatibility> {
|
||||
let library_version = crate::version();
|
||||
match &self.versioned_footer {
|
||||
VersionedFooter::V1 {
|
||||
crc32: _,
|
||||
compression,
|
||||
crc32: _crc,
|
||||
store_compression: compression,
|
||||
} => {
|
||||
return compression == &version.store_compression;
|
||||
if &library_version.store_compression != compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::UnknownVersion { version: _ } => false,
|
||||
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
|
||||
library_version: library_version.clone(),
|
||||
index_version: self.version.clone(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,31 +113,32 @@ impl Footer {
|
||||
/// Footer that includes a crc32 hash that enables us to checksum files in the index
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum VersionedFooter {
|
||||
UnknownVersion {
|
||||
version: u32,
|
||||
},
|
||||
UnknownVersion,
|
||||
V1 {
|
||||
crc32: CrcHashU32,
|
||||
compression: String,
|
||||
store_compression: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl BinarySerializable for VersionedFooter {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let mut buf = Vec::new();
|
||||
BinarySerializable::serialize(&self.version(), &mut buf)?;
|
||||
match self {
|
||||
VersionedFooter::V1 { crc32, compression } => {
|
||||
VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression: compression,
|
||||
} => {
|
||||
// Serializes a valid `VersionedFooter` or panics if the version is unknown
|
||||
// [ version | crc_hash | compression_mode ]
|
||||
// [ 0..4 | 4..8 | variable ]
|
||||
// [ 0..4 | 4..8 | variable ]
|
||||
BinarySerializable::serialize(&1u32, &mut buf)?;
|
||||
BinarySerializable::serialize(crc32, &mut buf)?;
|
||||
BinarySerializable::serialize(compression, &mut buf)?;
|
||||
}
|
||||
VersionedFooter::UnknownVersion { version: _ } => {
|
||||
VersionedFooter::UnknownVersion => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"Cannot serialize an unknown versionned footer ",
|
||||
"Cannot serialize an unknown versioned footer ",
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -145,30 +156,20 @@ impl BinarySerializable for VersionedFooter {
|
||||
if version == 1 {
|
||||
let crc32 = u32::deserialize(&mut cursor)?;
|
||||
let compression = String::deserialize(&mut cursor)?;
|
||||
Ok(VersionedFooter::V1 { crc32, compression })
|
||||
Ok(VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression: compression,
|
||||
})
|
||||
} else {
|
||||
Ok(VersionedFooter::UnknownVersion { version })
|
||||
Ok(VersionedFooter::UnknownVersion)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VersionedFooter {
|
||||
pub fn version(&self) -> u32 {
|
||||
match self {
|
||||
VersionedFooter::V1 {
|
||||
crc32: _,
|
||||
compression: _,
|
||||
} => 1u32,
|
||||
VersionedFooter::UnknownVersion { version, .. } => *version,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn crc(&self) -> Option<CrcHashU32> {
|
||||
match self {
|
||||
VersionedFooter::V1 {
|
||||
crc32,
|
||||
compression: _,
|
||||
} => Some(*crc32),
|
||||
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::UnknownVersion { .. } => None,
|
||||
}
|
||||
}
|
||||
@@ -207,7 +208,7 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
let crc32 = self.hasher.take().unwrap().finalize();
|
||||
let footer = Footer::new(VersionedFooter::V1 {
|
||||
crc32,
|
||||
compression: crate::store::COMPRESSION.to_string(),
|
||||
store_compression: crate::store::COMPRESSION.to_string(),
|
||||
});
|
||||
let mut writer = self.writer.take().unwrap();
|
||||
footer.append_footer(&mut writer)?;
|
||||
@@ -227,16 +228,21 @@ mod tests {
|
||||
use regex::Regex;
|
||||
|
||||
#[test]
|
||||
fn test_footer_version() {
|
||||
fn test_versioned_footer() {
|
||||
let mut vec = Vec::new();
|
||||
let footer_proxy = FooterProxy::new(&mut vec);
|
||||
assert!(footer_proxy.terminate().is_ok());
|
||||
assert_eq!(vec.len(), 167);
|
||||
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
|
||||
assert_eq!(
|
||||
footer.versioned_footer.version(),
|
||||
crate::INDEX_FORMAT_VERSION
|
||||
);
|
||||
if let VersionedFooter::V1 {
|
||||
crc32: _,
|
||||
store_compression,
|
||||
} = footer.versioned_footer
|
||||
{
|
||||
assert_eq!(store_compression, crate::store::COMPRESSION);
|
||||
} else {
|
||||
panic!("Versioned footer should be V1.");
|
||||
}
|
||||
assert_eq!(&footer.version, crate::version());
|
||||
}
|
||||
|
||||
@@ -246,7 +252,7 @@ mod tests {
|
||||
let crc32 = 123456u32;
|
||||
let footer: Footer = Footer::new(VersionedFooter::V1 {
|
||||
crc32,
|
||||
compression: "lz4".to_string(),
|
||||
store_compression: "lz4".to_string(),
|
||||
});
|
||||
footer.serialize(&mut buffer).unwrap();
|
||||
let footer_deser = Footer::deserialize(&mut &buffer[..]).unwrap();
|
||||
@@ -255,11 +261,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn footer_length() {
|
||||
// test to make sure the ascii art in the doc-strings is correct
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V1 {
|
||||
crc32,
|
||||
compression: "lz4".to_string(),
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
let mut buf = Vec::new();
|
||||
versioned_footer.serialize(&mut buf).unwrap();
|
||||
@@ -299,7 +304,7 @@ mod tests {
|
||||
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
|
||||
let expected_versioned_footer: VersionedFooter = VersionedFooter::V1 {
|
||||
crc32: expected_crc,
|
||||
compression: "lz4".to_string(),
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
assert_eq!(versioned_footer, expected_versioned_footer);
|
||||
let mut buffer = Vec::new();
|
||||
@@ -313,11 +318,22 @@ mod tests {
|
||||
let mut b = &v_footer_bytes[..];
|
||||
let versioned_footer = VersionedFooter::deserialize(&mut b).unwrap();
|
||||
assert!(b.is_empty());
|
||||
let expected_versioned_footer = VersionedFooter::UnknownVersion {
|
||||
version: 16_777_219u32,
|
||||
};
|
||||
let expected_versioned_footer = VersionedFooter::UnknownVersion;
|
||||
assert_eq!(versioned_footer, expected_versioned_footer);
|
||||
let mut buf = Vec::new();
|
||||
assert!(versioned_footer.serialize(&mut buf).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "lz4"))]
|
||||
fn compression_mismatch() {
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
let footer = Footer::new(versioned_footer);
|
||||
let res = footer.is_compatible();
|
||||
assert!(res.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use crate::directory::{ReadOnlySource, WritePtr};
|
||||
use crate::directory::{WatchCallback, WatchHandle};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::Directory;
|
||||
use crate::Result;
|
||||
|
||||
use crc32fast::Hasher;
|
||||
use serde_json;
|
||||
@@ -66,7 +65,7 @@ fn save_managed_paths(
|
||||
|
||||
impl ManagedDirectory {
|
||||
/// Wraps a directory as managed directory.
|
||||
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
|
||||
pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
|
||||
match directory.atomic_read(&MANAGED_FILEPATH) {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
@@ -89,8 +88,10 @@ impl ManagedDirectory {
|
||||
meta_informations: Arc::default(),
|
||||
}),
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
Err(OpenReadError::IncompatibleIndex(footer)) => {
|
||||
Err(crate::Error::IncompatibleIndex(format!("{:?}", footer)))
|
||||
Err(OpenReadError::IncompatibleIndex(incompatibility)) => {
|
||||
// For the moment, this should never happen `meta.json`
|
||||
// do not have any footer and cannot detect incompatibility.
|
||||
Err(crate::TantivyError::IncompatibleIndex(incompatibility))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -267,9 +268,7 @@ impl Directory for ManagedDirectory {
|
||||
let read_only_source = self.directory.open_read(path)?;
|
||||
let (footer, reader) = Footer::extract_footer(read_only_source)
|
||||
.map_err(|err| IOError::with_path(path.to_path_buf(), err))?;
|
||||
if !footer.is_compatible() {
|
||||
return Err(OpenReadError::IncompatibleIndex(footer));
|
||||
}
|
||||
footer.is_compatible()?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
|
||||
@@ -23,11 +23,9 @@ pub use self::directory::{Directory, DirectoryClone};
|
||||
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub(crate) use self::watch_event_router::WatchCallbackList;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchHandle};
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Outcome of the Garbage collection
|
||||
pub struct GarbageCollectionResult {
|
||||
/// List of files that were deleted in this cycle
|
||||
@@ -48,6 +46,9 @@ pub use self::mmap_directory::MmapDirectory;
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
|
||||
@@ -24,13 +24,20 @@ pub struct WatchCallbackList {
|
||||
#[derive(Clone)]
|
||||
pub struct WatchHandle(Arc<WatchCallback>);
|
||||
|
||||
impl WatchHandle {
|
||||
/// Create a WatchHandle handle.
|
||||
pub fn new(watch_callback: Arc<WatchCallback>) -> WatchHandle {
|
||||
WatchHandle(watch_callback)
|
||||
}
|
||||
}
|
||||
|
||||
impl WatchCallbackList {
|
||||
/// Suscribes a new callback and returns a handle that controls the lifetime of the callback.
|
||||
pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
let watch_callback_arc = Arc::new(watch_callback);
|
||||
let watch_callback_weak = Arc::downgrade(&watch_callback_arc);
|
||||
self.router.write().unwrap().push(watch_callback_weak);
|
||||
WatchHandle(watch_callback_arc)
|
||||
WatchHandle::new(watch_callback_arc)
|
||||
}
|
||||
|
||||
fn list_callback(&self) -> Vec<Arc<WatchCallback>> {
|
||||
|
||||
13
src/error.rs
13
src/error.rs
@@ -2,8 +2,8 @@
|
||||
|
||||
use std::io;
|
||||
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::error::{Incompatibility, LockError};
|
||||
use crate::fastfield::FastFieldNotAvailableError;
|
||||
use crate::query;
|
||||
use crate::schema;
|
||||
@@ -81,11 +81,8 @@ pub enum TantivyError {
|
||||
#[fail(display = "System error.'{}'", _0)]
|
||||
SystemError(String),
|
||||
/// Index incompatible with current version of tantivy
|
||||
#[fail(
|
||||
display = "Current version of tantivy is incompatible with index version: '{}'",
|
||||
_0
|
||||
)]
|
||||
IncompatibleIndex(String),
|
||||
#[fail(display = "{:?}", _0)]
|
||||
IncompatibleIndex(Incompatibility),
|
||||
}
|
||||
|
||||
impl From<DataCorruption> for TantivyError {
|
||||
@@ -135,8 +132,8 @@ impl From<OpenReadError> for TantivyError {
|
||||
match error {
|
||||
OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath),
|
||||
OpenReadError::IOError(io_error) => TantivyError::IOError(io_error),
|
||||
OpenReadError::IncompatibleIndex(tantivy_err) => {
|
||||
TantivyError::IncompatibleIndex(format!("{:?}", tantivy_err))
|
||||
OpenReadError::IncompatibleIndex(incompatibility) => {
|
||||
TantivyError::IncompatibleIndex(incompatibility)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use crate::common::HasLen;
|
||||
use crate::common::{BitSet, HasLen};
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
use bit_set::BitSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -17,7 +16,7 @@ pub fn write_delete_bitset(
|
||||
) -> io::Result<()> {
|
||||
let mut byte = 0u8;
|
||||
let mut shift = 0u8;
|
||||
for doc in 0..(max_doc as usize) {
|
||||
for doc in 0..max_doc {
|
||||
if delete_bitset.contains(doc) {
|
||||
byte |= 1 << shift;
|
||||
}
|
||||
@@ -32,7 +31,7 @@ pub fn write_delete_bitset(
|
||||
if max_doc % 8 > 0 {
|
||||
writer.write_all(&[byte])?;
|
||||
}
|
||||
writer.flush()
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Set of deleted `DocId`s.
|
||||
@@ -86,7 +85,6 @@ impl HasLen for DeleteBitSet {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::directory::*;
|
||||
use bit_set::BitSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
|
||||
@@ -95,27 +93,26 @@ mod tests {
|
||||
{
|
||||
let mut writer = directory.open_write(&*test_path).unwrap();
|
||||
write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
|
||||
writer.terminate().unwrap();
|
||||
}
|
||||
{
|
||||
let source = directory.open_read(&test_path).unwrap();
|
||||
let delete_bitset = DeleteBitSet::open(source);
|
||||
for doc in 0..max_doc as usize {
|
||||
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
|
||||
}
|
||||
assert_eq!(delete_bitset.len(), bitset.len());
|
||||
let source = directory.open_read(&test_path).unwrap();
|
||||
let delete_bitset = DeleteBitSet::open(source);
|
||||
for doc in 0..max_doc {
|
||||
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
|
||||
}
|
||||
assert_eq!(delete_bitset.len(), bitset.len());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_bitset() {
|
||||
{
|
||||
let mut bitset = BitSet::with_capacity(10);
|
||||
let mut bitset = BitSet::with_max_value(10);
|
||||
bitset.insert(1);
|
||||
bitset.insert(9);
|
||||
test_delete_bitset_helper(&bitset, 10);
|
||||
}
|
||||
{
|
||||
let mut bitset = BitSet::with_capacity(8);
|
||||
let mut bitset = BitSet::with_max_value(8);
|
||||
bitset.insert(1);
|
||||
bitset.insert(2);
|
||||
bitset.insert(3);
|
||||
|
||||
@@ -33,6 +33,7 @@ pub use self::reader::FastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::chrono::{NaiveDateTime, Utc};
|
||||
use crate::common;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
@@ -49,7 +50,7 @@ mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
|
||||
pub trait FastValue: Default + Clone + Copy + Send + Sync + PartialOrd {
|
||||
pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
@@ -69,6 +70,12 @@ pub trait FastValue: Default + Clone + Copy + Send + Sync + PartialOrd {
|
||||
/// Cast value to `u64`.
|
||||
/// The value is just reinterpreted in memory.
|
||||
fn as_u64(&self) -> u64;
|
||||
|
||||
/// Build a default value. This default value is never used, so the value does not
|
||||
/// really matter.
|
||||
fn make_zero() -> Self {
|
||||
Self::from_u64(0i64.to_u64())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for u64 {
|
||||
@@ -135,11 +142,34 @@ impl FastValue for f64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for crate::DateTime {
|
||||
fn from_u64(timestamp_u64: u64) -> Self {
|
||||
let timestamp_i64 = i64::from_u64(timestamp_u64);
|
||||
crate::DateTime::from_utc(NaiveDateTime::from_timestamp(timestamp_i64, 0), Utc)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
self.timestamp().to_u64()
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::Date(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.timestamp().as_u64()
|
||||
}
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match *value {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
Value::F64(ref val) => common::f64_to_u64(*val),
|
||||
Value::Date(ref datetime) => common::i64_to_u64(datetime.timestamp()),
|
||||
_ => panic!("Expected a u64/i64/f64 field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
@@ -151,10 +181,12 @@ mod tests {
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Document;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::schema::{Document, IntOptions};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
@@ -178,6 +210,12 @@ mod tests {
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_i64_u64() {
|
||||
let datetime = crate::DateTime::from_utc(NaiveDateTime::from_timestamp(0i64, 0), Utc);
|
||||
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_small() {
|
||||
let path = Path::new("test");
|
||||
@@ -429,6 +467,93 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_missing_date_fast_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!());
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let segment_ids: Vec<SegmentId> = reader
|
||||
.searcher()
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.map(SegmentReader::segment_id)
|
||||
.collect();
|
||||
assert_eq!(segment_ids.len(), 2);
|
||||
let merge_future = index_writer.merge(&segment_ids[..]);
|
||||
let merge_res = futures::executor::block_on(merge_future);
|
||||
assert!(merge_res.is_ok());
|
||||
assert!(reader.reload().is_ok());
|
||||
assert_eq!(reader.searcher().segment_readers().len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_datetime() {
|
||||
assert_eq!(crate::DateTime::make_zero().timestamp(), 0i64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_datefastfield() {
|
||||
use crate::fastfield::FastValue;
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
let multi_date_field = schema_builder.add_date_field(
|
||||
"multi_date",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
date_field => crate::DateTime::from_u64(1i64.to_u64()),
|
||||
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
|
||||
multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
date_field => crate::DateTime::from_u64(4i64.to_u64())
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
|
||||
multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let date_fast_field = fast_fields.date(date_field).unwrap();
|
||||
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
|
||||
let mut dates = vec![];
|
||||
{
|
||||
assert_eq!(date_fast_field.get(0u32).timestamp(), 1i64);
|
||||
dates_fast_field.get_vals(0u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].timestamp(), 2i64);
|
||||
assert_eq!(dates[1].timestamp(), 3i64);
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(1u32).timestamp(), 4i64);
|
||||
dates_fast_field.get_vals(1u32, &mut dates);
|
||||
assert!(dates.is_empty());
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(2u32).timestamp(), 0i64);
|
||||
dates_fast_field.get_vals(2u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].timestamp(), 5i64);
|
||||
assert_eq!(dates[1].timestamp(), 6i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
|
||||
@@ -45,7 +45,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let (start, stop) = self.range(doc);
|
||||
let len = (stop - start) as usize;
|
||||
vals.resize(len, Item::default());
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
||||
}
|
||||
|
||||
|
||||
@@ -15,9 +15,11 @@ pub struct FastFieldReaders {
|
||||
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||
fast_field_f64: HashMap<Field, FastFieldReader<f64>>,
|
||||
fast_field_date: HashMap<Field, FastFieldReader<crate::DateTime>>,
|
||||
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||
fast_field_f64s: HashMap<Field, MultiValueIntFastFieldReader<f64>>,
|
||||
fast_field_dates: HashMap<Field, MultiValueIntFastFieldReader<crate::DateTime>>,
|
||||
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
@@ -26,6 +28,7 @@ enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
F64,
|
||||
Date,
|
||||
}
|
||||
|
||||
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
@@ -39,6 +42,9 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
|
||||
FieldType::F64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::F64, cardinality)),
|
||||
FieldType::Date(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Date, cardinality)),
|
||||
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
_ => None,
|
||||
}
|
||||
@@ -53,9 +59,11 @@ impl FastFieldReaders {
|
||||
fast_field_i64: Default::default(),
|
||||
fast_field_u64: Default::default(),
|
||||
fast_field_f64: Default::default(),
|
||||
fast_field_date: Default::default(),
|
||||
fast_field_i64s: Default::default(),
|
||||
fast_field_u64s: Default::default(),
|
||||
fast_field_f64s: Default::default(),
|
||||
fast_field_dates: Default::default(),
|
||||
fast_bytes: Default::default(),
|
||||
fast_fields_composite: fast_fields_composite.clone(),
|
||||
};
|
||||
@@ -95,6 +103,12 @@ impl FastFieldReaders {
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
FastType::Date => {
|
||||
fast_field_readers.fast_field_date.insert(
|
||||
field,
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
@@ -130,6 +144,14 @@ impl FastFieldReaders {
|
||||
.fast_field_f64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
FastType::Date => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_dates
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
@@ -156,8 +178,6 @@ impl FastFieldReaders {
|
||||
/// If the field is a i64-fast field, return the associated u64 reader. Values are
|
||||
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping. ///
|
||||
///
|
||||
///TODO should it also be lenient with f64?
|
||||
///
|
||||
/// This method is useful when merging segment reader.
|
||||
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
if let Some(u64_ff_reader) = self.u64(field) {
|
||||
@@ -166,6 +186,12 @@ impl FastFieldReaders {
|
||||
if let Some(i64_ff_reader) = self.i64(field) {
|
||||
return Some(i64_ff_reader.into_u64_reader());
|
||||
}
|
||||
if let Some(f64_ff_reader) = self.f64(field) {
|
||||
return Some(f64_ff_reader.into_u64_reader());
|
||||
}
|
||||
if let Some(date_ff_reader) = self.date(field) {
|
||||
return Some(date_ff_reader.into_u64_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
@@ -176,6 +202,13 @@ impl FastFieldReaders {
|
||||
self.fast_field_i64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||
pub fn date(&self, field: Field) -> Option<FastFieldReader<crate::DateTime>> {
|
||||
self.fast_field_date.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `f64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns `None`.
|
||||
@@ -202,6 +235,9 @@ impl FastFieldReaders {
|
||||
if let Some(i64s_ff_reader) = self.i64s(field) {
|
||||
return Some(i64s_ff_reader.into_u64s_reader());
|
||||
}
|
||||
if let Some(f64s_ff_reader) = self.f64s(field) {
|
||||
return Some(f64s_ff_reader.into_u64s_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
@@ -219,6 +255,13 @@ impl FastFieldReaders {
|
||||
self.fast_field_f64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns a `crate::DateTime` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a `crate::DateTime` multi-valued fast field, this method returns `None`.
|
||||
pub fn dates(&self, field: Field) -> Option<MultiValueIntFastFieldReader<crate::DateTime>> {
|
||||
self.fast_field_dates.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `bytes` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns `None`.
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldType, Schema};
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use fnv::FnvHashMap;
|
||||
use std::collections::HashMap;
|
||||
@@ -17,6 +17,14 @@ pub struct FastFieldsWriter {
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
}
|
||||
|
||||
fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(_) | FieldType::Date(_) => common::i64_to_u64(0i64),
|
||||
FieldType::F64(_) => common::f64_to_u64(0.0f64),
|
||||
_ => 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
@@ -25,18 +33,15 @@ impl FastFieldsWriter {
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
for (field, field_entry) in schema.fields() {
|
||||
let default_value = match *field_entry.field_type() {
|
||||
FieldType::I64(_) => common::i64_to_u64(0i64),
|
||||
FieldType::F64(_) => common::f64_to_u64(0.0f64),
|
||||
_ => 0u64,
|
||||
};
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(ref int_options)
|
||||
| FieldType::U64(ref int_options)
|
||||
| FieldType::F64(ref int_options) => {
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Date(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
let default_value = fast_field_default_value(field_entry);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use crate::common::BitSet;
|
||||
use crate::core::Index;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
@@ -23,7 +24,6 @@ use crate::schema::Document;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::Opstamp;
|
||||
use bit_set::BitSet;
|
||||
use crossbeam::channel;
|
||||
use futures::executor::block_on;
|
||||
use futures::future::Future;
|
||||
@@ -115,7 +115,7 @@ fn compute_deleted_bitset(
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc as usize);
|
||||
delete_bitset.insert(deleted_doc);
|
||||
might_have_changed = true;
|
||||
}
|
||||
}
|
||||
@@ -126,51 +126,60 @@ fn compute_deleted_bitset(
|
||||
Ok(might_have_changed)
|
||||
}
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
/// Advance delete for the given segment up to the target opstamp.
|
||||
///
|
||||
/// Note that there are no guarantee that the resulting `segment_entry` delete_opstamp
|
||||
/// is `==` target_opstamp.
|
||||
/// For instance, there was no delete operation between the state of the `segment_entry` and
|
||||
/// the `target_opstamp`, `segment_entry` is not updated.
|
||||
pub(crate) fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: Opstamp,
|
||||
) -> crate::Result<()> {
|
||||
{
|
||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||
// We are already up-to-date here.
|
||||
return Ok(());
|
||||
}
|
||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||
// We are already up-to-date here.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
|
||||
// There has been no `DeleteOperation` between the segment status and `target_opstamp`.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_capacity(max_doc as usize),
|
||||
};
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
|
||||
let delete_cursor = segment_entry.delete_cursor();
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
)?;
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_max_value(max_doc),
|
||||
};
|
||||
|
||||
// TODO optimize
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
segment_entry.delete_cursor(),
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
)?;
|
||||
|
||||
// TODO optimize
|
||||
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
|
||||
for doc in 0u32..max_doc {
|
||||
if segment_reader.is_deleted(doc) {
|
||||
delete_bitset.insert(doc as usize);
|
||||
if seg_delete_bitset.is_deleted(doc) {
|
||||
delete_bitset.insert(doc);
|
||||
}
|
||||
}
|
||||
|
||||
let num_deleted_docs = delete_bitset.len();
|
||||
if num_deleted_docs > 0 {
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
}
|
||||
|
||||
let num_deleted_docs = delete_bitset.len();
|
||||
if num_deleted_docs > 0 {
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
Ok(())
|
||||
}
|
||||
@@ -236,7 +245,7 @@ fn apply_deletes(
|
||||
mut delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &[Opstamp],
|
||||
last_docstamp: Opstamp,
|
||||
) -> crate::Result<Option<BitSet<u32>>> {
|
||||
) -> crate::Result<Option<BitSet>> {
|
||||
if delete_cursor.get().is_none() {
|
||||
// if there are no delete operation in the queue, no need
|
||||
// to even open the segment.
|
||||
@@ -246,7 +255,7 @@ fn apply_deletes(
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let mut deleted_bitset = BitSet::with_capacity(max_doc as usize);
|
||||
let mut deleted_bitset = BitSet::with_max_value(max_doc);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
@@ -505,9 +514,13 @@ impl IndexWriter {
|
||||
/// Merges a given list of segments
|
||||
///
|
||||
/// `segment_ids` is required to be non-empty.
|
||||
pub async fn merge(&mut self, segment_ids: &[SegmentId]) -> crate::Result<SegmentMeta> {
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Output = crate::Result<SegmentMeta>> {
|
||||
let merge_operation = self.segment_updater.make_merge_operation(segment_ids);
|
||||
self.segment_updater.start_merge(merge_operation)?.await
|
||||
let segment_updater = self.segment_updater.clone();
|
||||
async move { segment_updater.start_merge(merge_operation)?.await }
|
||||
}
|
||||
|
||||
/// Closes the current document channel send.
|
||||
|
||||
@@ -1492,4 +1492,46 @@ mod tests {
|
||||
assert_eq!(&vals, &[20]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn merges_f64_fast_fields_correctly() -> crate::Result<()> {
|
||||
let mut builder = schema::SchemaBuilder::new();
|
||||
|
||||
let fast_multi = IntOptions::default().set_fast(Cardinality::MultiValues);
|
||||
|
||||
let field = builder.add_f64_field("f64", schema::FAST);
|
||||
let multi_field = builder.add_f64_field("f64s", fast_multi);
|
||||
|
||||
let index = Index::create_in_ram(builder.build());
|
||||
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
|
||||
// Make sure we'll attempt to merge every created segment
|
||||
let mut policy = crate::indexer::LogMergePolicy::default();
|
||||
policy.set_min_merge_size(2);
|
||||
writer.set_merge_policy(Box::new(policy));
|
||||
|
||||
for i in 0..100 {
|
||||
let mut doc = Document::new();
|
||||
doc.add_f64(field, 42.0);
|
||||
|
||||
doc.add_f64(multi_field, 0.24);
|
||||
doc.add_f64(multi_field, 0.27);
|
||||
|
||||
writer.add_document(doc);
|
||||
|
||||
if i % 5 == 0 {
|
||||
writer.commit()?;
|
||||
}
|
||||
}
|
||||
|
||||
writer.commit()?;
|
||||
writer.wait_merging_threads()?;
|
||||
|
||||
// If a merging thread fails, we should end up with more
|
||||
// than one segment here
|
||||
assert_eq!(1, index.searchable_segments()?.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ pub type DefaultMergePolicy = LogMergePolicy;
|
||||
mod tests {
|
||||
use crate::schema::{self, Schema};
|
||||
use crate::{Index, Term};
|
||||
|
||||
#[test]
|
||||
fn test_advance_delete_bug() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::common::BitSet;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use bit_set::BitSet;
|
||||
use std::fmt;
|
||||
|
||||
/// A segment entry describes the state of
|
||||
|
||||
@@ -214,6 +214,10 @@ impl SegmentUpdater {
|
||||
self.pool.spawn_ok(async move {
|
||||
let _ = sender.send(f.await);
|
||||
});
|
||||
} else {
|
||||
let _ = sender.send(Err(crate::TantivyError::SystemError(
|
||||
"Segment updater killed".to_string(),
|
||||
)));
|
||||
}
|
||||
receiver.unwrap_or_else(|_| {
|
||||
let err_msg =
|
||||
@@ -326,13 +330,11 @@ impl SegmentUpdater {
|
||||
) -> impl Future<Output = crate::Result<()>> {
|
||||
let segment_updater: SegmentUpdater = self.clone();
|
||||
self.schedule_future(async move {
|
||||
if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp)?;
|
||||
segment_updater.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp, payload)?;
|
||||
let _ = garbage_collect_files(segment_updater.clone()).await;
|
||||
segment_updater.consider_merge_options().await;
|
||||
}
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp)?;
|
||||
segment_updater.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp, payload)?;
|
||||
let _ = garbage_collect_files(segment_updater.clone()).await;
|
||||
segment_updater.consider_merge_options().await;
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
70
src/lib.rs
70
src/lib.rs
@@ -173,6 +173,7 @@ use std::fmt;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 1;
|
||||
|
||||
/// Structure version for the index.
|
||||
@@ -939,4 +940,73 @@ mod tests {
|
||||
assert_eq!(fast_field_reader.get(0), 4f64)
|
||||
}
|
||||
}
|
||||
|
||||
// motivated by #729
|
||||
#[test]
|
||||
fn test_update_via_delete_insert() {
|
||||
use crate::collector::Count;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::AllQuery;
|
||||
use crate::SegmentId;
|
||||
use futures::executor::block_on;
|
||||
|
||||
const DOC_COUNT: u64 = 2u64;
|
||||
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let id = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let index_reader = index.reader().unwrap();
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
|
||||
for doc_id in 0u64..DOC_COUNT {
|
||||
index_writer.add_document(doc!(id => doc_id));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
index_reader.reload().unwrap();
|
||||
let searcher = index_reader.searcher();
|
||||
|
||||
assert_eq!(
|
||||
searcher.search(&AllQuery, &Count).unwrap(),
|
||||
DOC_COUNT as usize
|
||||
);
|
||||
|
||||
// update the 10 elements by deleting and re-adding
|
||||
for doc_id in 0u64..DOC_COUNT {
|
||||
index_writer.delete_term(Term::from_field_u64(id, doc_id));
|
||||
index_writer.commit().unwrap();
|
||||
index_reader.reload().unwrap();
|
||||
let doc = doc!(id => doc_id);
|
||||
index_writer.add_document(doc);
|
||||
index_writer.commit().unwrap();
|
||||
index_reader.reload().unwrap();
|
||||
let searcher = index_reader.searcher();
|
||||
// The number of document should be stable.
|
||||
assert_eq!(
|
||||
searcher.search(&AllQuery, &Count).unwrap(),
|
||||
DOC_COUNT as usize
|
||||
);
|
||||
}
|
||||
|
||||
index_reader.reload().unwrap();
|
||||
let searcher = index_reader.searcher();
|
||||
let segment_ids: Vec<SegmentId> = searcher
|
||||
.segment_readers()
|
||||
.into_iter()
|
||||
.map(|reader| reader.segment_id())
|
||||
.collect();
|
||||
block_on(index_writer.merge(&segment_ids)).unwrap();
|
||||
|
||||
index_reader.reload().unwrap();
|
||||
let searcher = index_reader.searcher();
|
||||
|
||||
assert_eq!(
|
||||
searcher.search(&AllQuery, &Count).unwrap(),
|
||||
DOC_COUNT as usize
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,8 +209,8 @@ impl From<f64> for Value {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DateTime> for Value {
|
||||
fn from(date_time: DateTime) -> Value {
|
||||
impl From<crate::DateTime> for Value {
|
||||
fn from(date_time: crate::DateTime) -> Value {
|
||||
Value::Date(date_time)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::io::{self, Read, Write};
|
||||
/// Name of the compression scheme used in the doc store.
|
||||
///
|
||||
/// This name is appended to the version string of tantivy.
|
||||
pub const COMPRESSION: &'static str = "snappy";
|
||||
pub const COMPRESSION: &str = "snappy";
|
||||
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
|
||||
Reference in New Issue
Block a user