diff --git a/Cargo.toml b/Cargo.toml index 3c0133d7d..4c27f3090 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ rust-stemmers = "0.1.0" downcast = { version="0.9", features = ["nightly"]} matches = "0.1" snap = "0.2" -bitpacking = {path="../bitpacking", default-features=false} +bitpacking = {path = "../bitpacking"} [target.'cfg(windows)'.dependencies] winapi = "0.2" @@ -57,8 +57,7 @@ debug-assertions = false [features] -default = ["mmap", "simdcompression"] -simdcompression = ["bitpacking/sse3"] +default = ["mmap"] streamdict = [] mmap = ["fst/mmap", "atomicwrites"] @@ -69,3 +68,12 @@ travis-ci = { repository = "tantivy-search/tantivy" } [[example]] name = "simple_search" required-features = ["mmap"] + + +[[bin]] +name = "convert_to_static" +path = "./bin/convert_to_static.rs" + +[[bin]] +name = "test_static_dir" +path = "./bin/test_static_dir.rs" \ No newline at end of file diff --git a/bin/convert_to_static.rs b/bin/convert_to_static.rs new file mode 100644 index 000000000..7643b1b7b --- /dev/null +++ b/bin/convert_to_static.rs @@ -0,0 +1,20 @@ +use std::env; +use std::path::PathBuf; +use std::fs::File; +use std::io::Write; +extern crate tantivy; +use tantivy::directory::write_static_from_directory; + +fn main() { + // Prints each argument on a separate line + let mut args = env::args(); + args.next().unwrap(); + let directory_path= args.next().expect("Expect 2 args. "); + let output_path = args.next().expect("Expect 2 args. "); + println!("{} => {}", directory_path, output_path); + let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap(); + println!("Read all"); + let mut output = File::create(output_path).unwrap(); + output.write_all(&buffer[..]).unwrap(); + output.flush().unwrap(); +} \ No newline at end of file diff --git a/bin/test_static_dir.rs b/bin/test_static_dir.rs new file mode 100644 index 000000000..a55f2ebf0 --- /dev/null +++ b/bin/test_static_dir.rs @@ -0,0 +1,51 @@ +use std::env; +use std::path::PathBuf; +use std::fs::File; +use std::io::Write; +extern crate tantivy; +use tantivy::directory::{StaticDirectory, write_static_from_directory}; +use tantivy::Index; +use tantivy::query::QueryParser; +use tantivy::collector::TopCollector; + + +static DATA: &'static [u8] = include_bytes!("output.bin"); + +fn run() -> tantivy::Result<()> { + // Prints each argument on a separate line + let directory = StaticDirectory::open(DATA).unwrap(); + let index = Index::open_directory(directory).unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + + let schema = index.schema(); + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); + + let query_parser = QueryParser::for_index(&index, vec![title, body]); + let query = query_parser.parse_query("sea whale")?; + + let mut top_collector = TopCollector::with_limit(10); + + searcher.search(&*query, &mut top_collector)?; + + let doc_addresses = top_collector.docs(); + + // The actual documents still need to be + // retrieved from Tantivy's store. + // + // Since the body field was not configured as stored, + // the document returned will only contain + // a title. + + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } + Ok(()) +} + + +fn main() { + run().unwrap(); +} diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 04524013a..0d36e4d41 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -3,6 +3,7 @@ use std::io; use common::serialize::BinarySerializable; use std::mem; use std::ops::Deref; +use std::ptr; pub(crate) struct BitPacker { mini_buffer: u64, @@ -105,18 +106,18 @@ where addr + 8 <= data.len(), "The fast field field should have been padded with 7 bytes." ); - let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; val_shifted & mask } else { let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() { - unsafe { *(data[addr..].as_ptr() as *const u64) } + unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) } } else { let mut buffer = [0u8; 8]; for i in addr..data.len() { buffer[i - addr] += data[i]; } - unsafe { *(buffer[..].as_ptr() as *const u64) } + unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) } }; let val_shifted = val_unshifted_unmasked >> (bit_shift as u64); val_shifted & mask @@ -140,7 +141,7 @@ where for output_val in output.iter_mut() { let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; - let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) }; + let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; *output_val = val_shifted & mask; addr_in_bits += num_bits; diff --git a/src/compression/mod.rs b/src/compression/mod.rs index fea9587a1..c4f5294a9 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -7,20 +7,10 @@ pub const COMPRESSION_BLOCK_SIZE: usize = 128; const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1; pub use self::stream::CompressedIntStream; -use std::cmp; -use bitpacking::BitPacker; +use bitpacking::{BitPacker, BitPacker4x}; -#[cfg(not(feature = "simdcompression"))] -pub use bitpacking::ScalarBitPacker as BitPackerImpl; -#[cfg(not(feature = "simdcompression"))] -const MINI_BLOCK: usize = 4; - -#[cfg(feature = "simdcompression")] -pub use bitpacking::SSE3BitPacker as BitPackerImpl; -#[cfg(feature = "simdcompression")] -const MINI_BLOCK: usize = 1; /// Returns the size in bytes of a compressed block, given `num_bits`. pub fn compressed_block_size(num_bits: u8) -> usize { @@ -28,6 +18,7 @@ pub fn compressed_block_size(num_bits: u8) -> usize { } pub struct BlockEncoder { + bitpacker: BitPacker4x, pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE], pub output_len: usize, } @@ -35,52 +26,30 @@ pub struct BlockEncoder { impl BlockEncoder { pub fn new() -> BlockEncoder { BlockEncoder { + bitpacker: BitPacker4x::new(), output: [0u8; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, } } - pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] { - assert_eq!(vals.len(), COMPRESSION_BLOCK_SIZE); - let mut num_bits = 0; - let mut offsets = [offset; MINI_BLOCK]; - for i in 1..MINI_BLOCK { - offsets[i] = vals[(i * BitPackerImpl::BLOCK_LEN) - 1]; - } - for i in 0..MINI_BLOCK { - let block = &vals[i * BitPackerImpl::BLOCK_LEN.. (i + 1)*BitPackerImpl::BLOCK_LEN]; - num_bits = cmp::max(BitPackerImpl::num_bits_sorted(offsets[i], block), num_bits); - } + pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] { + let num_bits = self.bitpacker.num_bits_sorted(offset, block); self.output[0] = num_bits; - let compressed_chunk_len = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8; - let mut written_size = 1; - for i in 0..MINI_BLOCK { - let block = &vals[i * BitPackerImpl::BLOCK_LEN.. (i + 1)*BitPackerImpl::BLOCK_LEN]; - BitPackerImpl::compress_sorted(offsets[i], block, &mut self.output[written_size..], num_bits); - written_size += compressed_chunk_len; - } + let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits); &self.output[..written_size] } - pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] { - assert_eq!(vals.len(), COMPRESSION_BLOCK_SIZE); - let num_bits = vals.chunks(BitPackerImpl::BLOCK_LEN) - .map(|chunk| BitPackerImpl::num_bits(chunk)) - .max() - .unwrap_or(0u8); + pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] { + let num_bits = self.bitpacker.num_bits(block); self.output[0] = num_bits; - let mut written_size = 1; - let compressed_chunk_len = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8; - for chunk in vals.chunks(BitPackerImpl::BLOCK_LEN) { - BitPackerImpl::compress(chunk, &mut self.output[written_size..], num_bits); - written_size += compressed_chunk_len; - } + let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits); &self.output[..written_size] } } pub struct BlockDecoder { + bitpacker: BitPacker4x, pub output: [u32; COMPRESSION_BLOCK_SIZE + 1], pub output_len: usize, } @@ -94,34 +63,22 @@ impl BlockDecoder { let mut output = [val; COMPRESSION_BLOCK_SIZE + 1]; output[COMPRESSION_BLOCK_SIZE] = 0u32; BlockDecoder { + bitpacker: BitPacker4x::new(), output, output_len: 0, } } - pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], mut offset: u32) -> usize { + pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize { let num_bits = compressed_data[0]; - let mut read_size: usize = 1; - let chunk_size: usize = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8; - for i in 0..MINI_BLOCK { - BitPackerImpl::decompress_sorted(offset, &compressed_data[read_size..], &mut self.output[i*BitPackerImpl::BLOCK_LEN..], num_bits); - offset = self.output[(i + 1)*BitPackerImpl::BLOCK_LEN - 1]; - read_size += chunk_size; - } self.output_len = COMPRESSION_BLOCK_SIZE; - read_size + 1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits) } pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize { let num_bits = compressed_data[0]; - let mut read_size: usize = 1; - let chunk_size: usize = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8; - for i in 0..MINI_BLOCK { - BitPackerImpl::decompress(&compressed_data[read_size..], &mut self.output[i*BitPackerImpl::BLOCK_LEN..], num_bits); - read_size += chunk_size; - } self.output_len = COMPRESSION_BLOCK_SIZE; - read_size + 1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits) } #[inline] diff --git a/src/core/index.rs b/src/core/index.rs index 7659c0fb8..ea6abf2df 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -122,6 +122,13 @@ impl Index { Index::create_from_metas(directory, &metas) } + pub fn open_directory(directory: TDirectory) -> Result { + let directory = ManagedDirectory::new(directory)?; + let metas = load_metas(&directory)?; + Index::create_from_metas(directory, &metas) + } + + /// Reads the index meta file from the directory. pub fn load_metas(&self) -> Result { load_metas(self.directory()) diff --git a/src/directory/mod.rs b/src/directory/mod.rs index e2a7d670e..75d4cce6d 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -12,12 +12,15 @@ mod directory; mod read_only_source; mod shared_vec_slice; mod managed_directory; +mod static_directory; /// Errors specific to the directory module. pub mod error; use std::io::{BufWriter, Seek, Write}; +pub use self::static_directory::StaticDirectory; +pub use self::static_directory::write_static_from_directory; pub use self::read_only_source::ReadOnlySource; pub use self::directory::Directory; pub use self::ram_directory::RAMDirectory; diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index fe17742d7..3c36015c0 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -7,6 +7,8 @@ use std::slice; use std::io::{self, Read}; use stable_deref_trait::{CloneStableDeref, StableDeref}; +const EMPTY_SLICE: [u8; 0] = []; + /// Read object that represents files in tantivy. /// /// These read objects are only in charge to deliver @@ -19,6 +21,8 @@ pub enum ReadOnlySource { Mmap(MmapReadOnly), /// Wrapping a `Vec` Anonymous(SharedVecSlice), + /// Wrapping a static slice + Static(&'static [u8]) } unsafe impl StableDeref for ReadOnlySource {} @@ -35,7 +39,7 @@ impl Deref for ReadOnlySource { impl ReadOnlySource { /// Creates an empty ReadOnlySource pub fn empty() -> ReadOnlySource { - ReadOnlySource::Anonymous(SharedVecSlice::empty()) + ReadOnlySource::Static(&EMPTY_SLICE) } /// Returns the data underlying the ReadOnlySource object. @@ -44,6 +48,7 @@ impl ReadOnlySource { #[cfg(feature="mmap")] ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() }, ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(), + ReadOnlySource::Static(data) => data, } } @@ -76,6 +81,9 @@ impl ReadOnlySource { ReadOnlySource::Anonymous(ref shared_vec) => { ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset)) } + ReadOnlySource::Static(data) => { + ReadOnlySource::Static(&data[from_offset..to_offset]) + } } } @@ -116,6 +124,12 @@ impl From> for ReadOnlySource { } } +impl From<&'static [u8]> for ReadOnlySource { + fn from(data: &'static [u8]) -> ReadOnlySource { + ReadOnlySource::Static(data) + } +} + /// Acts as a owning cursor over the data backed up by a `ReadOnlySource` pub(crate) struct SourceRead { _data_owner: ReadOnlySource, diff --git a/src/directory/static_directory.rs b/src/directory/static_directory.rs new file mode 100644 index 000000000..96261035a --- /dev/null +++ b/src/directory/static_directory.rs @@ -0,0 +1,123 @@ +use std::collections::HashMap; +use Directory; +use std::path::PathBuf; +use directory::ReadOnlySource; +use std::io::BufWriter; +use directory::error::{DeleteError, OpenReadError, OpenWriteError}; +use std::path::Path; +use std::fmt::{Formatter, Debug, self}; +use Result as TantivyResult; +use directory::SeekableWrite; +use std::io; +use std::fs; +use common::Endianness; +use common::BinarySerializable; +use common::VInt; +use byteorder::ByteOrder; +use std::str; +use std::fs::File; +use std::io::{Read, Write}; +use std::ffi::OsString; + +#[derive(Clone)] +pub struct StaticDirectory { + files: HashMap, +} + +impl Debug for StaticDirectory { + fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { + write!(f, "StaticDirectory[{} files]", self.files.len())?; + Ok(()) + } +} + +impl StaticDirectory { + pub fn open(mut data: &'static [u8]) -> TantivyResult { + assert!(data.len() > 8); + let footer_len_offset = data.len() - 8; + let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize; + let mut body = &data[..body_len]; + let mut footer = &data[body_len..footer_len_offset]; + let num_files = VInt::deserialize(&mut footer)?.0 as usize; + let mut files = HashMap::new(); + for _ in 0..num_files { + let filename_len = VInt::deserialize(&mut footer)?.0 as usize; + let filename = &footer[..filename_len]; + footer = &footer[filename_len..]; + let data_len = VInt::deserialize(&mut footer)?.0 as usize; + let file_data = &body[..data_len]; + body = &body[data_len..]; + let filename_str = str::from_utf8(filename).expect("Invalid UTF8"); + let filename = PathBuf::from(filename_str); + println!("{:?} {:?}", filename, data_len); + files.insert(filename, file_data); + } + Ok(StaticDirectory { + files + }) + } +} + +impl Directory for StaticDirectory { + fn open_read(&self, path: &Path) -> Result { + if let Some(static_data) = self.files.get(path) { + Ok(ReadOnlySource::from(*static_data)) + } else { + Err(OpenReadError::FileDoesNotExist(path.to_owned())) + } + } + + fn delete(&self, path: &Path) -> Result<(), DeleteError> { + unimplemented!("Static directory is read-only !") + } + + fn exists(&self, path: &Path) -> bool { + self.files.contains_key(path) + } + + fn open_write(&mut self, path: &Path) -> Result>, OpenWriteError> { + unimplemented!("Static directory is read-only !") + } + + fn atomic_read(&self, path: &Path) -> Result, OpenReadError> { + if let Some(static_data) = self.files.get(path) { + Ok(static_data.to_vec()) + } else { + Err(OpenReadError::FileDoesNotExist(path.to_owned())) + } + } + + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + unimplemented!("Static directory is read-only !") + } + + fn box_clone(&self) -> Box { + box self.clone() + } +} + +pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult> { + assert!(directory_path.is_dir()); + let mut file_data: Vec<(OsString, usize)> = Vec::new(); + let mut write: Vec = Vec::new(); + for entry in fs::read_dir(directory_path)? { + let entry = entry?; + let path = entry.path(); + if path.is_file() { + info!("Appending {}", path.to_string_lossy()); + let mut open_file = File::open(&path)?; + let file_len = open_file.read_to_end(&mut write)?; + file_data.push((entry.file_name(), file_len)); + } + } + // write footer + let body_len = write.len(); + VInt(file_data.len() as u64).serialize(&mut write)?; + for (filename, filelen) in file_data { + VInt(filename.len() as u64).serialize(&mut write)?; + write.write_all(filename.to_string_lossy().as_bytes())?; + VInt(filelen as u64).serialize(&mut write)?; + } + (body_len as u64).serialize(&mut write)?; + Ok(write) +} \ No newline at end of file diff --git a/src/termdict/fstdict/termdict.rs b/src/termdict/fstdict/termdict.rs index 5f71b40fc..9295ca1f9 100644 --- a/src/termdict/fstdict/termdict.rs +++ b/src/termdict/fstdict/termdict.rs @@ -86,6 +86,9 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map { ReadOnlySource::Anonymous(data) => { Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted") } + ReadOnlySource::Static(bytes) => { + Fst::from_static_slice(bytes).expect("FST data is corrupted") + } #[cfg(feature="mmap")] ReadOnlySource::Mmap(mmap_readonly) => { Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")