Added convert to static [u8]

Paul Masurel
2018-04-10 21:18:32 +09:00
10 changed files with 252 additions and 65 deletions

View File

@@ -39,7 +39,7 @@ rust-stemmers = "0.1.0"
downcast = { version="0.9", features = ["nightly"]}
matches = "0.1"
snap = "0.2"
-bitpacking = {path="../bitpacking", default-features=false}
+bitpacking = {path = "../bitpacking"}
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -57,8 +57,7 @@ debug-assertions = false
[features]
default = ["mmap", "simdcompression"]
simdcompression = ["bitpacking/sse3"]
default = ["mmap"]
streamdict = []
mmap = ["fst/mmap", "atomicwrites"]
@@ -69,3 +68,12 @@ travis-ci = { repository = "tantivy-search/tantivy" }
[[example]]
name = "simple_search"
required-features = ["mmap"]
+
+[[bin]]
+name = "convert_to_static"
+path = "./bin/convert_to_static.rs"
+
+[[bin]]
+name = "test_static_dir"
+path = "./bin/test_static_dir.rs"

bin/convert_to_static.rs Normal file
View File

@@ -0,0 +1,20 @@
extern crate tantivy;

use std::env;
use std::fs::File;
use std::io::Write;
use std::path::PathBuf;

use tantivy::directory::write_static_from_directory;

fn main() {
    // Skip the binary name, then read the two positional arguments.
    let mut args = env::args();
    args.next().unwrap();
    let directory_path = args.next().expect("Expected 2 args: <directory_path> <output_file>");
    let output_path = args.next().expect("Expected 2 args: <directory_path> <output_file>");
    println!("{} => {}", directory_path, output_path);
    // Pack every file of the directory into one contiguous buffer.
    let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
    println!("Read all");
    let mut output = File::create(output_path).unwrap();
    output.write_all(&buffer[..]).unwrap();
    output.flush().unwrap();
}
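
As the `expect` messages indicate, the tool takes two positional arguments, `<directory_path>` and `<output_file>`: it packs every file of an existing index directory into a single blob and writes it out, producing the `output.bin` that `test_static_dir.rs` below embeds.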

bin/test_static_dir.rs Normal file
View File

@@ -0,0 +1,51 @@
extern crate tantivy;

use tantivy::directory::StaticDirectory;
use tantivy::Index;
use tantivy::query::QueryParser;
use tantivy::collector::TopCollector;

// The blob produced by `convert_to_static`, embedded at compile time.
static DATA: &'static [u8] = include_bytes!("output.bin");

fn run() -> tantivy::Result<()> {
    // Open the index directly over the embedded static slice.
    let directory = StaticDirectory::open(DATA)?;
    let index = Index::open_directory(directory)?;
    index.load_searchers()?;
    let searcher = index.searcher();
    let schema = index.schema();
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("sea whale")?;
    let mut top_collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut top_collector)?;
    let doc_addresses = top_collector.docs();
    // The actual documents still need to be
    // retrieved from Tantivy's store.
    //
    // Since the body field was not configured as stored,
    // the document returned will only contain a title.
    for doc_address in doc_addresses {
        let retrieved_doc = searcher.doc(&doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }
    Ok(())
}

fn main() {
    run().unwrap();
}
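
Note that `include_bytes!("output.bin")` embeds the packed index into the executable at compile time, so this binary only builds once `convert_to_static` has produced an `output.bin` next to the source file; after that, the search runs entirely from the static slice, with no filesystem access.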

View File

@@ -3,6 +3,7 @@ use std::io;
use common::serialize::BinarySerializable;
use std::mem;
use std::ops::Deref;
+use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -105,18 +106,18 @@ where
addr + 8 <= data.len(),
"The fast field should have been padded with 7 bytes."
);
-let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
+let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
} else {
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
-unsafe { *(data[addr..].as_ptr() as *const u64) }
+unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
} else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] += data[i];
}
-unsafe { *(buffer[..].as_ptr() as *const u64) }
+unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
};
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
val_shifted & mask
@@ -140,7 +141,7 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
-let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
+let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
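
These replacements are a soundness fix: `addr` is a byte offset, so `data[addr..].as_ptr()` is generally not 8-byte aligned, and dereferencing a misaligned `*const u64` is undefined behavior even on x86, where it usually happens to work. `ptr::read_unaligned` performs a byte-wise copy instead, placing no alignment requirement on the source pointer. A minimal standalone sketch of the pattern (names are illustrative, not from the commit):

use std::ptr;

// Reads 8 bytes starting at an arbitrary *byte* offset.
fn read_u64_at(data: &[u8], addr: usize) -> u64 {
    assert!(addr + 8 <= data.len());
    unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
}

fn main() {
    let bytes: Vec<u8> = (0u8..16).collect();
    // Offset 3 is deliberately misaligned.
    println!("{:#018x}", read_u64_at(&bytes, 3));
}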

View File

@@ -7,20 +7,10 @@ pub const COMPRESSION_BLOCK_SIZE: usize = 128;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
pub use self::stream::CompressedIntStream;
use std::cmp;
-use bitpacking::BitPacker;
+use bitpacking::{BitPacker, BitPacker4x};
-#[cfg(not(feature = "simdcompression"))]
-pub use bitpacking::ScalarBitPacker as BitPackerImpl;
-#[cfg(not(feature = "simdcompression"))]
-const MINI_BLOCK: usize = 4;
-#[cfg(feature = "simdcompression")]
-pub use bitpacking::SSE3BitPacker as BitPackerImpl;
-#[cfg(feature = "simdcompression")]
-const MINI_BLOCK: usize = 1;
/// Returns the size in bytes of a compressed block, given `num_bits`.
pub fn compressed_block_size(num_bits: u8) -> usize {
@@ -28,6 +18,7 @@ pub fn compressed_block_size(num_bits: u8) -> usize {
}
pub struct BlockEncoder {
+bitpacker: BitPacker4x,
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
}
@@ -35,52 +26,30 @@ pub struct BlockEncoder {
impl BlockEncoder {
pub fn new() -> BlockEncoder {
BlockEncoder {
+bitpacker: BitPacker4x::new(),
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
-pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
-assert_eq!(vals.len(), COMPRESSION_BLOCK_SIZE);
-let mut num_bits = 0;
-let mut offsets = [offset; MINI_BLOCK];
-for i in 1..MINI_BLOCK {
-offsets[i] = vals[(i * BitPackerImpl::BLOCK_LEN) - 1];
-}
-for i in 0..MINI_BLOCK {
-let block = &vals[i * BitPackerImpl::BLOCK_LEN.. (i + 1)*BitPackerImpl::BLOCK_LEN];
-num_bits = cmp::max(BitPackerImpl::num_bits_sorted(offsets[i], block), num_bits);
-}
+pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
+let num_bits = self.bitpacker.num_bits_sorted(offset, block);
self.output[0] = num_bits;
-let compressed_chunk_len = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8;
-let mut written_size = 1;
-for i in 0..MINI_BLOCK {
-let block = &vals[i * BitPackerImpl::BLOCK_LEN.. (i + 1)*BitPackerImpl::BLOCK_LEN];
-BitPackerImpl::compress_sorted(offsets[i], block, &mut self.output[written_size..], num_bits);
-written_size += compressed_chunk_len;
-}
+let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
-pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
-assert_eq!(vals.len(), COMPRESSION_BLOCK_SIZE);
-let num_bits = vals.chunks(BitPackerImpl::BLOCK_LEN)
-.map(|chunk| BitPackerImpl::num_bits(chunk))
-.max()
-.unwrap_or(0u8);
+pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
+let num_bits = self.bitpacker.num_bits(block);
self.output[0] = num_bits;
-let mut written_size = 1;
-let compressed_chunk_len = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8;
-for chunk in vals.chunks(BitPackerImpl::BLOCK_LEN) {
-BitPackerImpl::compress(chunk, &mut self.output[written_size..], num_bits);
-written_size += compressed_chunk_len;
-}
+let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
}
pub struct BlockDecoder {
+bitpacker: BitPacker4x,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
pub output_len: usize,
}
@@ -94,34 +63,22 @@ impl BlockDecoder {
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
output[COMPRESSION_BLOCK_SIZE] = 0u32;
BlockDecoder {
+bitpacker: BitPacker4x::new(),
output,
output_len: 0,
}
}
-pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], mut offset: u32) -> usize {
+pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let num_bits = compressed_data[0];
-let mut read_size: usize = 1;
-let chunk_size: usize = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8;
-for i in 0..MINI_BLOCK {
-BitPackerImpl::decompress_sorted(offset, &compressed_data[read_size..], &mut self.output[i*BitPackerImpl::BLOCK_LEN..], num_bits);
-offset = self.output[(i + 1)*BitPackerImpl::BLOCK_LEN - 1];
-read_size += chunk_size;
-}
self.output_len = COMPRESSION_BLOCK_SIZE;
-read_size
+1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
-let mut read_size: usize = 1;
-let chunk_size: usize = (num_bits as usize) * BitPackerImpl::BLOCK_LEN / 8;
-for i in 0..MINI_BLOCK {
-BitPackerImpl::decompress(&compressed_data[read_size..], &mut self.output[i*BitPackerImpl::BLOCK_LEN..], num_bits);
-read_size += chunk_size;
-}
self.output_len = COMPRESSION_BLOCK_SIZE;
-read_size
+1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
}
#[inline]
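
The rewrite above replaces the `MINI_BLOCK` loops with single calls on a `BitPacker4x` instance, which packs a whole 128-integer block at once (`BitPacker4x::BLOCK_LEN == COMPRESSION_BLOCK_SIZE == 128`). A hedged round-trip sketch of the `bitpacking` API as the new code uses it — the exact signatures are assumed to match the path-local crate at this commit, with `compress` returning the number of bytes written:

extern crate bitpacking;

use bitpacking::{BitPacker, BitPacker4x};

fn main() {
    // One full block of 128 values.
    let vals: Vec<u32> = (0..BitPacker4x::BLOCK_LEN as u32).map(|i| i % 99).collect();
    let bitpacker = BitPacker4x::new();

    // The widest value in the block determines the bit width.
    let num_bits: u8 = bitpacker.num_bits(&vals);
    let mut compressed = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];
    let written: usize = bitpacker.compress(&vals, &mut compressed[..], num_bits);

    let mut decompressed = vec![0u32; BitPacker4x::BLOCK_LEN];
    bitpacker.decompress(&compressed[..written], &mut decompressed[..], num_bits);
    assert_eq!(decompressed, vals);
}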

View File

@@ -122,6 +122,13 @@ impl Index {
Index::create_from_metas(directory, &metas)
}
+pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
+let directory = ManagedDirectory::new(directory)?;
+let metas = load_metas(&directory)?;
+Index::create_from_metas(directory, &metas)
+}
/// Reads the index meta file from the directory.
pub fn load_metas(&self) -> Result<IndexMeta> {
load_metas(self.directory())

View File

@@ -12,12 +12,15 @@ mod directory;
mod read_only_source;
mod shared_vec_slice;
mod managed_directory;
+mod static_directory;
/// Errors specific to the directory module.
pub mod error;
use std::io::{BufWriter, Seek, Write};
+pub use self::static_directory::StaticDirectory;
+pub use self::static_directory::write_static_from_directory;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;

View File

@@ -7,6 +7,8 @@ use std::slice;
use std::io::{self, Read};
use stable_deref_trait::{CloneStableDeref, StableDeref};
+const EMPTY_SLICE: [u8; 0] = [];
/// Read object that represents files in tantivy.
///
/// These read objects are only in charge to deliver
@@ -19,6 +21,8 @@ pub enum ReadOnlySource {
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
+/// Wrapping a static slice
+Static(&'static [u8])
}
unsafe impl StableDeref for ReadOnlySource {}
@@ -35,7 +39,7 @@ impl Deref for ReadOnlySource {
impl ReadOnlySource {
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
-ReadOnlySource::Anonymous(SharedVecSlice::empty())
+ReadOnlySource::Static(&EMPTY_SLICE)
}
/// Returns the data underlying the ReadOnlySource object.
@@ -44,6 +48,7 @@ impl ReadOnlySource {
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
+ReadOnlySource::Static(data) => data,
}
}
@@ -76,6 +81,9 @@ impl ReadOnlySource {
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
+ReadOnlySource::Static(data) => {
+ReadOnlySource::Static(&data[from_offset..to_offset])
+}
}
}
@@ -116,6 +124,12 @@ impl From<Vec<u8>> for ReadOnlySource {
}
}
+impl From<&'static [u8]> for ReadOnlySource {
+fn from(data: &'static [u8]) -> ReadOnlySource {
+ReadOnlySource::Static(data)
+}
+}
/// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
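
Taken together, the new variant, the `Static` match arms, and the `From` impl make wrapping and sub-slicing a `&'static [u8]` zero-copy, and let `empty()` hand out a static empty slice instead of allocating a `SharedVecSlice`. A small illustrative sketch (the `slice` method and the `Deref` to `[u8]` are the existing `ReadOnlySource` API shown above):

extern crate tantivy;

use tantivy::directory::ReadOnlySource;

static BYTES: &'static [u8] = b"hello, static world";

fn main() {
    // `From<&'static [u8]>` wraps the slice without copying.
    let source = ReadOnlySource::from(BYTES);
    // `ReadOnlySource` derefs to `[u8]`, so slice indexing works directly.
    assert_eq!(&source[..5], &b"hello"[..]);
    // Sub-slicing stays in the `Static` variant: still zero-copy.
    let sub = source.slice(7, 13);
    assert_eq!(&*sub, &b"static"[..]);
}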

View File

@@ -0,0 +1,123 @@
use std::collections::HashMap;
use Directory;
use std::path::PathBuf;
use directory::ReadOnlySource;
use std::io::BufWriter;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use std::path::Path;
use std::fmt::{self, Debug, Formatter};
use Result as TantivyResult;
use directory::SeekableWrite;
use std::io;
use std::fs;
use common::Endianness;
use common::BinarySerializable;
use common::VInt;
use byteorder::ByteOrder;
use std::str;
use std::fs::File;
use std::io::{Read, Write};
use std::ffi::OsString;

#[derive(Clone)]
pub struct StaticDirectory {
    files: HashMap<PathBuf, &'static [u8]>,
}

impl Debug for StaticDirectory {
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "StaticDirectory[{} files]", self.files.len())?;
        Ok(())
    }
}

impl StaticDirectory {
    pub fn open(data: &'static [u8]) -> TantivyResult<StaticDirectory> {
        assert!(data.len() > 8);
        // The last 8 bytes hold the length of the body.
        let footer_len_offset = data.len() - 8;
        let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
        let mut body = &data[..body_len];
        let mut footer = &data[body_len..footer_len_offset];
        let num_files = VInt::deserialize(&mut footer)?.0 as usize;
        let mut files = HashMap::new();
        for _ in 0..num_files {
            // Footer entry: filename length, filename bytes, file length.
            let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
            let filename = &footer[..filename_len];
            footer = &footer[filename_len..];
            let data_len = VInt::deserialize(&mut footer)?.0 as usize;
            // The matching bytes are consumed from the front of the body.
            let file_data = &body[..data_len];
            body = &body[data_len..];
            let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
            let filename = PathBuf::from(filename_str);
            println!("{:?} {:?}", filename, data_len);
            files.insert(filename, file_data);
        }
        Ok(StaticDirectory { files })
    }
}

impl Directory for StaticDirectory {
    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
        if let Some(static_data) = self.files.get(path) {
            Ok(ReadOnlySource::from(*static_data))
        } else {
            Err(OpenReadError::FileDoesNotExist(path.to_owned()))
        }
    }

    fn delete(&self, _path: &Path) -> Result<(), DeleteError> {
        unimplemented!("Static directory is read-only!")
    }

    fn exists(&self, path: &Path) -> bool {
        self.files.contains_key(path)
    }

    fn open_write(&mut self, _path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
        unimplemented!("Static directory is read-only!")
    }

    fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
        if let Some(static_data) = self.files.get(path) {
            Ok(static_data.to_vec())
        } else {
            Err(OpenReadError::FileDoesNotExist(path.to_owned()))
        }
    }

    fn atomic_write(&mut self, _path: &Path, _data: &[u8]) -> io::Result<()> {
        unimplemented!("Static directory is read-only!")
    }

    fn box_clone(&self) -> Box<Directory> {
        box self.clone()
    }
}

pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
    assert!(directory_path.is_dir());
    let mut file_data: Vec<(OsString, usize)> = Vec::new();
    let mut write: Vec<u8> = Vec::new();
    // Body: the concatenated content of every file in the directory.
    for entry in fs::read_dir(directory_path)? {
        let entry = entry?;
        let path = entry.path();
        if path.is_file() {
            info!("Appending {}", path.to_string_lossy());
            let mut open_file = File::open(&path)?;
            let file_len = open_file.read_to_end(&mut write)?;
            file_data.push((entry.file_name(), file_len));
        }
    }
    // Footer: number of files, then one (name_len, name, file_len) entry per file.
    let body_len = write.len();
    VInt(file_data.len() as u64).serialize(&mut write)?;
    for (filename, filelen) in file_data {
        let filename_lossy = filename.to_string_lossy();
        let filename_bytes = filename_lossy.as_bytes();
        // Serialize the UTF-8 byte length (not the `OsString` length),
        // so that `open` slices back exactly the bytes written below.
        VInt(filename_bytes.len() as u64).serialize(&mut write)?;
        write.write_all(filename_bytes)?;
        VInt(filelen as u64).serialize(&mut write)?;
    }
    // Trailer: the body length on 8 bytes, letting `open` locate the footer.
    (body_len as u64).serialize(&mut write)?;
    Ok(write)
}
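
For reference, here is the byte layout that `write_static_from_directory` produces and `StaticDirectory::open` parses back, reconstructed from the code above (the trailer's byte order is whatever tantivy's `Endianness` alias selects):

body      file_0 bytes ++ file_1 bytes ++ ...    (concatenated, in `fs::read_dir` order)
footer    VInt(num_files)
          then, per file: VInt(name_len) ++ name bytes ++ VInt(file_len)
trailer   body_len as u64                        (fixed 8 bytes; located from the end by `open`)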

View File

@@ -86,6 +86,9 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
}
+ReadOnlySource::Static(bytes) => {
+Fst::from_static_slice(bytes).expect("FST data is corrupted")
+}
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")