mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-27 20:42:54 +00:00
Compare commits
3 Commits
column-rea
...
wasm
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
232ca5c06c | ||
|
|
e78af20375 | ||
|
|
30637f7a7f |
15
Cargo.toml
15
Cargo.toml
@@ -18,7 +18,6 @@ tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = {version="0.2", default-features=false}
|
||||
atomicwrites = {version="0.1", optional=true}
|
||||
tempfile = "2.1"
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
tempdir = "0.3"
|
||||
@@ -27,7 +26,6 @@ serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
lz4 = "1.20"
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
@@ -40,13 +38,15 @@ stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9", features = ["nightly"]}
|
||||
matches = "0.1"
|
||||
bitpacking = "0.3"
|
||||
snap = "0.2"
|
||||
bitpacking = {path = "../bitpacking"}
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.3"
|
||||
tempfile = "2.1"
|
||||
env_logger = "0.4"
|
||||
|
||||
[profile.release]
|
||||
@@ -68,3 +68,12 @@ travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
[[example]]
|
||||
name = "simple_search"
|
||||
required-features = ["mmap"]
|
||||
|
||||
|
||||
[[bin]]
|
||||
name = "convert_to_static"
|
||||
path = "./bin/convert_to_static.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "test_static_dir"
|
||||
path = "./bin/test_static_dir.rs"
|
||||
20
bin/convert_to_static.rs
Normal file
20
bin/convert_to_static.rs
Normal file
@@ -0,0 +1,20 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
extern crate tantivy;
|
||||
use tantivy::directory::write_static_from_directory;
|
||||
|
||||
fn main() {
|
||||
// Prints each argument on a separate line
|
||||
let mut args = env::args();
|
||||
args.next().unwrap();
|
||||
let directory_path= args.next().expect("Expect 2 args.<directory_path> <outputfile>");
|
||||
let output_path = args.next().expect("Expect 2 args.<directory_path> <outputfile>");
|
||||
println!("{} => {}", directory_path, output_path);
|
||||
let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
|
||||
println!("Read all");
|
||||
let mut output = File::create(output_path).unwrap();
|
||||
output.write_all(&buffer[..]).unwrap();
|
||||
output.flush().unwrap();
|
||||
}
|
||||
51
bin/test_static_dir.rs
Normal file
51
bin/test_static_dir.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
extern crate tantivy;
|
||||
use tantivy::directory::{StaticDirectory, write_static_from_directory};
|
||||
use tantivy::Index;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::collector::TopCollector;
|
||||
|
||||
|
||||
static DATA: &'static [u8] = include_bytes!("output.bin");
|
||||
|
||||
fn run() -> tantivy::Result<()> {
|
||||
// Prints each argument on a separate line
|
||||
let directory = StaticDirectory::open(DATA).unwrap();
|
||||
let index = Index::open_directory(directory).unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let schema = index.schema();
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
let query = query_parser.parse_query("sea whale")?;
|
||||
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
fn main() {
|
||||
run().unwrap();
|
||||
}
|
||||
@@ -86,6 +86,7 @@ impl Index {
|
||||
/// The temp directory is only used for testing the `MmapDirectory`.
|
||||
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
|
||||
#[cfg(feature="mmap")]
|
||||
#[cfg(test)]
|
||||
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir()?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
@@ -121,6 +122,13 @@ impl Index {
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
|
||||
@@ -12,12 +12,15 @@ mod directory;
|
||||
mod read_only_source;
|
||||
mod shared_vec_slice;
|
||||
mod managed_directory;
|
||||
mod static_directory;
|
||||
|
||||
/// Errors specific to the directory module.
|
||||
pub mod error;
|
||||
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
|
||||
pub use self::static_directory::StaticDirectory;
|
||||
pub use self::static_directory::write_static_from_directory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub use self::directory::Directory;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
|
||||
@@ -7,6 +7,8 @@ use std::slice;
|
||||
use std::io::{self, Read};
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
|
||||
const EMPTY_SLICE: [u8; 0] = [];
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
/// These read objects are only in charge to deliver
|
||||
@@ -19,6 +21,8 @@ pub enum ReadOnlySource {
|
||||
Mmap(MmapReadOnly),
|
||||
/// Wrapping a `Vec<u8>`
|
||||
Anonymous(SharedVecSlice),
|
||||
/// Wrapping a static slice
|
||||
Static(&'static [u8])
|
||||
}
|
||||
|
||||
unsafe impl StableDeref for ReadOnlySource {}
|
||||
@@ -35,7 +39,7 @@ impl Deref for ReadOnlySource {
|
||||
impl ReadOnlySource {
|
||||
/// Creates an empty ReadOnlySource
|
||||
pub fn empty() -> ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::empty())
|
||||
ReadOnlySource::Static(&EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Returns the data underlying the ReadOnlySource object.
|
||||
@@ -44,6 +48,7 @@ impl ReadOnlySource {
|
||||
#[cfg(feature="mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
ReadOnlySource::Static(data) => data,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,6 +81,9 @@ impl ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
|
||||
}
|
||||
ReadOnlySource::Static(data) => {
|
||||
ReadOnlySource::Static(&data[from_offset..to_offset])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,6 +124,12 @@ impl From<Vec<u8>> for ReadOnlySource {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&'static [u8]> for ReadOnlySource {
|
||||
fn from(data: &'static [u8]) -> ReadOnlySource {
|
||||
ReadOnlySource::Static(data)
|
||||
}
|
||||
}
|
||||
|
||||
/// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
|
||||
pub(crate) struct SourceRead {
|
||||
_data_owner: ReadOnlySource,
|
||||
|
||||
123
src/directory/static_directory.rs
Normal file
123
src/directory/static_directory.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
use std::collections::HashMap;
|
||||
use Directory;
|
||||
use std::path::PathBuf;
|
||||
use directory::ReadOnlySource;
|
||||
use std::io::BufWriter;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use std::path::Path;
|
||||
use std::fmt::{Formatter, Debug, self};
|
||||
use Result as TantivyResult;
|
||||
use directory::SeekableWrite;
|
||||
use std::io;
|
||||
use std::fs;
|
||||
use common::Endianness;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use byteorder::ByteOrder;
|
||||
use std::str;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Write};
|
||||
use std::ffi::OsString;
|
||||
|
||||
/// A read-only `Directory` whose files all live inside a single
/// `&'static [u8]` buffer (e.g. data embedded via `include_bytes!`).
#[derive(Clone)]
pub struct StaticDirectory {
    // Maps each file path to its slice within the static buffer.
    files: HashMap<PathBuf, &'static [u8]>,
}
|
||||
|
||||
impl Debug for StaticDirectory {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
write!(f, "StaticDirectory[{} files]", self.files.len())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl StaticDirectory {
|
||||
pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
|
||||
assert!(data.len() > 8);
|
||||
let footer_len_offset = data.len() - 8;
|
||||
let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
|
||||
let mut body = &data[..body_len];
|
||||
let mut footer = &data[body_len..footer_len_offset];
|
||||
let num_files = VInt::deserialize(&mut footer)?.0 as usize;
|
||||
let mut files = HashMap::new();
|
||||
for _ in 0..num_files {
|
||||
let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
|
||||
let filename = &footer[..filename_len];
|
||||
footer = &footer[filename_len..];
|
||||
let data_len = VInt::deserialize(&mut footer)?.0 as usize;
|
||||
let file_data = &body[..data_len];
|
||||
body = &body[data_len..];
|
||||
let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
|
||||
let filename = PathBuf::from(filename_str);
|
||||
println!("{:?} {:?}", filename, data_len);
|
||||
files.insert(filename, file_data);
|
||||
}
|
||||
Ok(StaticDirectory {
|
||||
files
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for StaticDirectory {
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
if let Some(static_data) = self.files.get(path) {
|
||||
Ok(ReadOnlySource::from(*static_data))
|
||||
} else {
|
||||
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
|
||||
unimplemented!("Static directory is read-only !")
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.files.contains_key(path)
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
|
||||
unimplemented!("Static directory is read-only !")
|
||||
}
|
||||
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||
if let Some(static_data) = self.files.get(path) {
|
||||
Ok(static_data.to_vec())
|
||||
} else {
|
||||
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
unimplemented!("Static directory is read-only !")
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
box self.clone()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
|
||||
assert!(directory_path.is_dir());
|
||||
let mut file_data: Vec<(OsString, usize)> = Vec::new();
|
||||
let mut write: Vec<u8> = Vec::new();
|
||||
for entry in fs::read_dir(directory_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_file() {
|
||||
info!("Appending {}", path.to_string_lossy());
|
||||
let mut open_file = File::open(&path)?;
|
||||
let file_len = open_file.read_to_end(&mut write)?;
|
||||
file_data.push((entry.file_name(), file_len));
|
||||
}
|
||||
}
|
||||
// write footer
|
||||
let body_len = write.len();
|
||||
VInt(file_data.len() as u64).serialize(&mut write)?;
|
||||
for (filename, filelen) in file_data {
|
||||
VInt(filename.len() as u64).serialize(&mut write)?;
|
||||
write.write_all(filename.to_string_lossy().as_bytes())?;
|
||||
VInt(filelen as u64).serialize(&mut write)?;
|
||||
}
|
||||
(body_len as u64).serialize(&mut write)?;
|
||||
Ok(write)
|
||||
}
|
||||
@@ -1,15 +1,15 @@
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
pub struct Stamper(Arc<AtomicUsize>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst)
|
||||
self.0.fetch_add(1, Ordering::SeqCst) as u64
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,7 +140,7 @@ extern crate fst;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate itertools;
|
||||
extern crate lz4;
|
||||
extern crate snap;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
@@ -149,6 +149,7 @@ extern crate serde;
|
||||
extern crate serde_json;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tempdir;
|
||||
#[cfg(test)]
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
extern crate bitpacking;
|
||||
|
||||
@@ -110,7 +110,6 @@ mod tests {
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&*searcher, false).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
println!("{:?}", scorer.type_name());
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use std::mem::size_of;
|
||||
use std::io::{self, Read};
|
||||
use common::VInt;
|
||||
use datastruct::SkipList;
|
||||
use lz4;
|
||||
use snap;
|
||||
|
||||
/// Reads document off tantivy's [`Store`](./index.html)
|
||||
#[derive(Clone)]
|
||||
@@ -61,9 +61,9 @@ impl StoreReader {
|
||||
let mut current_block_mut = self.current_block.borrow_mut();
|
||||
current_block_mut.clear();
|
||||
let compressed_block = self.compressed_block(block_offset);
|
||||
let mut lz4_decoder = lz4::Decoder::new(compressed_block)?;
|
||||
let mut snap_decoder = snap::Reader::new(compressed_block);
|
||||
*self.current_block_offset.borrow_mut() = usize::max_value();
|
||||
lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())?;
|
||||
snap_decoder.read_to_end(&mut current_block_mut).map(|_| ())?;
|
||||
*self.current_block_offset.borrow_mut() = block_offset;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -3,7 +3,7 @@ use DocId;
|
||||
use common::{BinarySerializable, VInt};
|
||||
use std::io::{self, Write};
|
||||
use super::StoreReader;
|
||||
use lz4;
|
||||
use snap;
|
||||
use datastruct::SkipListBuilder;
|
||||
use common::CountingWriter;
|
||||
use schema::Document;
|
||||
@@ -88,10 +88,9 @@ impl StoreWriter {
|
||||
fn write_and_compress_block(&mut self) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
{
|
||||
let mut encoder = lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer)?;
|
||||
let mut encoder = snap::Writer::new(&mut self.intermediary_buffer);
|
||||
encoder.write_all(&self.current_block)?;
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
encoder_result?;
|
||||
encoder.flush()?;
|
||||
}
|
||||
(self.intermediary_buffer.len() as u32).serialize(&mut self.writer)?;
|
||||
self.writer.write_all(&self.intermediary_buffer)?;
|
||||
|
||||
@@ -86,6 +86,9 @@ fn open_fst_index(source: ReadOnlySource) -> fst::Map {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
|
||||
}
|
||||
ReadOnlySource::Static(bytes) => {
|
||||
Fst::from_static_slice(bytes).expect("FST data is corrupted")
|
||||
}
|
||||
#[cfg(feature="mmap")]
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
|
||||
|
||||
Reference in New Issue
Block a user