mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-06 01:02:55 +00:00

Compare commits (2 commits: 3d06639531, edcafb69bb)
Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.7.2"
+version = "0.7.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -72,6 +72,7 @@ default = ["mmap", "no_fail"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
 no_fail = ["fail/no_fail"]
+unstable = [] # useful for benches.

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
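The added `unstable` feature exists to gate the nightly-only benchmark modules that appear later in this comparison (the `#[cfg(all(test, feature = "unstable"))]` guard). A minimal sketch of that gating pattern; the crate layout and the placeholder bench body are illustrative, not taken from tantivy:

```rust
// src/lib.rs of a crate with `unstable = []` under [features].
// The bench module compiles only via: cargo +nightly bench --features unstable
#![cfg_attr(all(test, feature = "unstable"), feature(test))]

#[cfg(all(test, feature = "unstable"))]
extern crate test;

#[cfg(all(test, feature = "unstable"))]
mod bench {
    use test::Bencher;

    #[bench]
    fn bench_noop(b: &mut Bencher) {
        b.iter(|| (0..100u32).sum::<u32>()); // placeholder workload
    }
}
```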
doc/src/SUMMARY.md
@@ -4,8 +4,9 @@

 [Avant Propos](./avant-propos.md)

+- [Schema](./schema.md)
+- [Indexing](./indexing.md)
 - [Segments](./basis.md)
-- [Defining your schema](./schema.md)
 - [Facetting](./facetting.md)
 - [Innerworkings](./innerworkings.md)
 - [Inverted index](./inverted_index.md)
@@ -31,4 +31,3 @@ relevancy, collapsing, highlighting, spatial search.
 index from a different format.

 Tantivy exposes a lot of low level API to do all of these things.
-
doc/src/indexing.md (new empty file)

doc/src/schema.md
@@ -1 +1,50 @@
-# Defining your schema
+# Schema
+
+When starting a new project using tantivy, your first step will be to define your schema. Be aware that changing it will probably require you to reindex all of your data.
+It is strongly recommended that you keep the means to iterate through your original data when this happens.
+
+Unless specified otherwise, tantivy does not keep a raw version of your data,
+so good practice is to rely on a separate storage for your
+raw documents.
+
+The schema defines both the types of the fields you are indexing and the type of indexing you want to apply to them. The set of search operations you will be able to perform depends on how you set up your schema.
+
+Here is what defining your schema could look like.
+
+```rust
+use tantivy::schema::{SchemaBuilder, TEXT, STORED, INT_INDEXED};
+
+let mut schema_builder = SchemaBuilder::default();
+let text_field = schema_builder.add_text_field("name", TEXT | STORED);
+let tag_field = schema_builder.add_facet_field("tags");
+let timestamp_field = schema_builder.add_u64_field("timestamp", INT_INDEXED);
+let schema = schema_builder.build();
+```
+
+Notice how adding a new field to your schema builder
+follows this pattern:
+
+```verbatim
+schema_builder.add_<fieldtype>_field("<fieldname>", <field_configuration>);
+```
+
+This method returns a `Field` handle that will be used for all kinds of operations involving this field.
+
+# Field types
+
+Tantivy currently supports only 4 field types.
+
+- `text` (think `&str`)
+- `u64` and `i64`
+- `HierarchicalFacet`
+
+Let's go over their specifics.
+
+# Text
+
+Full-text search is the bread and butter of search engines.
+The key idea is fairly simple. Your text is broken apart into tokens (that's
+what we call tokenization). Tantivy then keeps track of the list of the documents containing each token.
+
+In order to increase recall you might want to normalize tokens. For instance,
+you most likely want to lowercase your tokens so that documents match the query `cat` regardless of whether they contain the token `cat` or `Cat`.
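To make the tokenization description above concrete, here is a toy sketch of the mechanism in plain Rust, not tantivy's own tokenizer API: text is split into lowercased tokens, and an inverted index maps each token to the documents containing it.

```rust
use std::collections::HashMap;

// Toy inverted index: token -> sorted list of doc ids containing it.
fn build_inverted_index(docs: &[&str]) -> HashMap<String, Vec<u32>> {
    let mut index: HashMap<String, Vec<u32>> = HashMap::new();
    for (doc_id, text) in docs.iter().enumerate() {
        // Naive whitespace tokenization plus lowercasing normalization.
        for token in text.split_whitespace() {
            let token = token.to_lowercase();
            let postings = index.entry(token).or_insert_with(Vec::new);
            // Avoid recording the same doc twice for repeated tokens.
            if postings.last() != Some(&(doc_id as u32)) {
                postings.push(doc_id as u32);
            }
        }
    }
    index
}

fn main() {
    let docs = ["The Cat sat", "my cat is grumpy", "no pets here"];
    let index = build_inverted_index(&docs);
    // "Cat" and "cat" were both normalized, so the query `cat` matches docs 0 and 1.
    assert_eq!(index["cat"], vec![0, 1]);
}
```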
@@ -266,21 +266,20 @@ pub mod tests {
 mod bench {

     use super::*;
-    use rand::Rng;
     use rand::SeedableRng;
-    use rand::XorShiftRng;
+    use rand::{Rng, XorShiftRng};
     use test::Bencher;

-    fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
-        let seed: &[u32; 4] = &[1, 2, 3, seed_val];
+    fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
+        let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
         let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
-        (0..u32::max_value())
-            .filter(|_| rng.next_f32() < ratio)
+        (0u32..)
+            .filter(|_| rng.gen_bool(ratio))
            .take(n)
            .collect()
     }

-    pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
+    pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
         generate_array_with_seed(n, ratio, 4)
     }
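This hunk tracks the rand 0.4 to 0.5 API migration: `XorShiftRng`'s seed changes from `[u32; 4]` to `[u8; 16]`, and the old `next_f32() < ratio` idiom becomes `gen_bool(ratio)` with an `f64` probability, hence the `f32` to `f64` signature changes. A small self-contained sketch of the new-style usage, assuming rand 0.5 as the diff does:

```rust
extern crate rand; // rand = "0.5"

use rand::{Rng, SeedableRng, XorShiftRng};

fn main() {
    // rand 0.5: SeedableRng::Seed for XorShiftRng is [u8; 16], not [u32; 4].
    let seed: [u8; 16] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4];
    let mut rng = XorShiftRng::from_seed(seed);
    // gen_bool(p) returns true with probability p (an f64),
    // replacing the old `rng.next_f32() < ratio` pattern.
    let kept: Vec<u32> = (0u32..).filter(|_| rng.gen_bool(0.1)).take(5).collect();
    println!("{:?}", kept);
}
```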
@@ -297,24 +296,23 @@ mod bench {
     fn bench_uncompress(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
-        let (_, compressed) = encoder.compress_block_sorted(&data, 0u32);
+        let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
         let mut decoder = BlockDecoder::new();
         b.iter(|| {
-            decoder.uncompress_block_sorted(compressed, 0u32);
+            decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
         });
     }

     #[test]
     fn test_all_docs_compression_numbits() {
-        for num_bits in 0..33 {
+        for expected_num_bits in 0u8.. {
             let mut data = [0u32; 128];
-            if num_bits > 0 {
-                data[0] = 1 << (num_bits - 1);
+            if expected_num_bits > 0 {
+                data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
             }
             let mut encoder = BlockEncoder::new();
-            let compressed = encoder.compress_block_unsorted(&data);
-            assert_eq!(compressed[0] as usize, num_bits);
-            assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
+            let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
+            assert_eq!(compressed.len(), compressed_block_size(num_bits));
         }
     }
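The encoder now returns the bit width alongside the compressed bytes instead of storing it in the block's first byte (the old assertions read it back from `compressed[0]`), so the decoder must receive `num_bits` explicitly. As a hedged illustration of the size arithmetic the updated assertion checks: `compressed_block_size` is tantivy's own function, and the body below is only a plausible guess assuming plain bitpacking of `COMPRESSION_BLOCK_SIZE` = 128 values.

```rust
const COMPRESSION_BLOCK_SIZE: usize = 128;

// Hypothetical body: 128 values at num_bits bits each, in whole bytes.
fn compressed_block_size(num_bits: u8) -> usize {
    COMPRESSION_BLOCK_SIZE * (num_bits as usize) / 8
}

fn main() {
    assert_eq!(compressed_block_size(0), 0); // a constant block packs to nothing
    assert_eq!(compressed_block_size(1), 16); // 128 bits == 16 bytes
    assert_eq!(compressed_block_size(32), 512); // worst case: raw u32s
}
```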
@@ -654,7 +654,7 @@ mod bench {
         });
     }

-    fn bench_skip_next(p: f32, b: &mut Bencher) {
+    fn bench_skip_next(p: f64, b: &mut Bencher) {
         let searcher = INDEX.searcher();
         let segment_reader = searcher.segment_reader(0);
         let docs = tests::sample(segment_reader.num_docs(), p);
@@ -126,6 +126,7 @@ impl SegmentPostings {
 fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
     let mut start = 0;
     let end = arr.len();
+    debug_assert!(target >= arr[start]);
     debug_assert!(target <= arr[end - 1]);
     let mut jump = 1;
     loop {
@@ -215,10 +216,11 @@ impl DocSet for SegmentPostings {

         // we're in the right block now, start with an exponential search
         let block_docs = self.block_cursor.docs();

+        debug_assert!(target >= self.doc());
         let new_cur = self
             .cur
             .wrapping_add(search_within_block(&block_docs[self.cur..], target));

         if need_positions {
             sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
                 .iter()
@@ -630,10 +632,8 @@ mod tests {
 use schema::IndexRecordOption;
 use schema::SchemaBuilder;
 use schema::Term;
-use super::exponential_search;
 use schema::INT_INDEXED;
 use DocId;
-use SkipResult;

 #[test]
 fn test_empty_segment_postings() {
@@ -661,13 +661,6 @@ mod tests {
         .0
     }

-    #[test]
-    fn test_exponentiel_search() {
-        assert_eq!(exponential_search(0, &[1,2]), (0, 1));
-        assert_eq!(exponential_search(1, &[1,2]), (0, 1));
-        assert_eq!(exponential_search(7, &[1,2,3,4,5,6,7,8,9,10,11]), (3,7));
-    }
-
     fn util_test_search_within_block(block: &[u32], target: u32) {
         assert_eq!(
             search_within_block(block, target),
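For the record, the removed test pinned down what `exponential_search` returns: a `(start, end)` bracket around the target in a sorted slice, found by doubling a jump width from the current position. A hedged sketch that is consistent with the removed assertions; tantivy's actual implementation may differ in its details:

```rust
// Returns a (start, end) range of `arr` guaranteed to contain `target`,
// which a bounded binary search can then finish off.
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    let mut start = 0;
    let mut jump = 1;
    loop {
        let candidate = start + jump;
        if candidate >= end {
            return (start, end); // ran off the array: search [start, end)
        }
        if arr[candidate] > target {
            return (start, candidate); // overshot: target lies in [start, candidate)
        }
        start = candidate;
        jump *= 2; // doubling keeps the scan logarithmic in the skip distance
    }
}

fn main() {
    // Same cases as the removed test.
    assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
    assert_eq!(exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), (3, 7));
}
```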
@@ -699,7 +692,7 @@ mod tests {

     #[test]
     fn test_block_segment_postings() {
-        let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
+        let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
         let mut offset: u32 = 0u32;
         // checking that the block before calling advance is empty
         assert!(block_segments.docs().is_empty());
@@ -713,45 +706,14 @@ mod tests {
         }
     }

-    #[test]
-    fn test_skip_right_at_new_block() {
-        let mut doc_ids = (0..128).collect::<Vec<u32>>();
-        doc_ids.push(129);
-        doc_ids.push(130);
-        {
-            let block_segments = build_block_postings(&doc_ids);
-            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
-            assert_eq!(docset.skip_next(128), SkipResult::OverStep);
-            assert_eq!(docset.doc(), 129);
-            assert!(docset.advance());
-            assert_eq!(docset.doc(), 130);
-            assert!(!docset.advance());
-        }
-        {
-            let block_segments = build_block_postings(&doc_ids);
-            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
-            assert_eq!(docset.skip_next(129), SkipResult::Reached);
-            assert_eq!(docset.doc(), 129);
-            assert!(docset.advance());
-            assert_eq!(docset.doc(), 130);
-            assert!(!docset.advance());
-        }
-        {
-            let block_segments = build_block_postings(&doc_ids);
-            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
-            assert_eq!(docset.skip_next(131), SkipResult::End);
-        }
-    }
-
-    fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
+    fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
         let mut schema_builder = SchemaBuilder::default();
         let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
         let mut last_doc = 0u32;
-        for &doc in docs {
+        for doc in docs {
             for _ in last_doc..doc {
                 index_writer.add_document(doc!(int_field=>1u64));
             }
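The deleted test was also the clearest statement of the `skip_next` contract: skipping to an absent doc id lands on the next greater one (`OverStep`), skipping to a present one lands exactly on it (`Reached`), and skipping past the last doc exhausts the set (`End`). Below is a toy model of that contract over a plain sorted vector, not tantivy's block-based implementation:

```rust
#[derive(Debug, PartialEq)]
enum SkipResult {
    Reached,
    OverStep,
    End,
}

// Linear stand-in for DocSet::skip_next over a sorted doc id list.
fn skip_next(docs: &[u32], cursor: &mut usize, target: u32) -> SkipResult {
    while *cursor < docs.len() {
        if docs[*cursor] >= target {
            return if docs[*cursor] == target {
                SkipResult::Reached
            } else {
                SkipResult::OverStep
            };
        }
        *cursor += 1;
    }
    SkipResult::End
}

fn main() {
    // Same doc ids as the removed test: 0..=127, then 129 and 130 (128 is absent).
    let docs: Vec<u32> = (0..128).chain(vec![129, 130]).collect();
    let mut cur = 0;
    assert_eq!(skip_next(&docs, &mut cur, 128), SkipResult::OverStep);
    assert_eq!(docs[cur], 129);
    assert_eq!(skip_next(&docs, &mut cur, 129), SkipResult::Reached);
    assert_eq!(skip_next(&docs, &mut cur, 131), SkipResult::End);
}
```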
@@ -771,7 +733,7 @@ mod tests {
     #[test]
     fn test_block_segment_postings_skip() {
         for i in 0..4 {
-            let mut block_postings = build_block_postings(&[3]);
+            let mut block_postings = build_block_postings(vec![3]);
             assert_eq!(
                 block_postings.skip_to(i),
                 BlockSegmentPostingsSkipResult::Success(0u32)
@@ -781,7 +743,7 @@ mod tests {
                 BlockSegmentPostingsSkipResult::Terminated
             );
         }
-        let mut block_postings = build_block_postings(&[3]);
+        let mut block_postings = build_block_postings(vec![3]);
         assert_eq!(
             block_postings.skip_to(4u32),
             BlockSegmentPostingsSkipResult::Terminated
@@ -794,7 +756,7 @@ mod tests {
         for i in 0..1300 {
             docs.push((i * i / 100) + i);
         }
-        let mut block_postings = build_block_postings(&docs[..]);
+        let mut block_postings = build_block_postings(docs.clone());
         for i in vec![0, 424, 10000] {
             assert_eq!(
                 block_postings.skip_to(i),
@@ -175,7 +175,7 @@ mod tests {
 #[cfg(all(test, feature = "unstable"))]
 mod bench {
     use super::ExpUnrolledLinkedList;
-    use tantivy_memory_arena::MemoryArena;
+    use super::super::MemoryArena;
     use test::Bencher;

     const NUM_STACK: usize = 10_000;
@@ -199,20 +199,19 @@ mod bench {

     #[bench]
     fn bench_push_stack(bench: &mut Bencher) {
-        let heap = MemoryArena::new();
         bench.iter(|| {
+            let mut heap = MemoryArena::new();
             let mut stacks = Vec::with_capacity(100);
             for _ in 0..NUM_STACK {
-                let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
+                let mut stack = ExpUnrolledLinkedList::new(&mut heap);
                 stacks.push(stack);
             }
             for s in 0..NUM_STACK {
                 for i in 0u32..STACK_SIZE {
                     let t = s * 392017 % NUM_STACK;
-                    stacks[t].push(i, &heap);
+                    stacks[t].push(i, &mut heap);
                 }
             }
-            heap.clear();
         });
     }
 }
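One design note on this last hunk: instead of one long-lived arena that is `clear()`ed at the end, a fresh `MemoryArena` is now built inside the closure passed to `bench.iter`, so each measured iteration starts from identical state and teardown is just a drop. A generic sketch of that setup-inside-the-iteration pattern, with a `Vec` standing in for the arena:

```rust
// Stand-in for Bencher::iter: runs the closure once per measured iteration.
fn iter_like<F: FnMut()>(iterations: u32, mut body: F) {
    for _ in 0..iterations {
        body();
    }
}

fn main() {
    iter_like(10, || {
        // Fresh "arena" per iteration; it is dropped when the closure returns,
        // so no explicit clear() is needed and no state leaks across runs.
        let mut arena: Vec<u32> = Vec::new();
        for i in 0..1_000u32 {
            arena.push(i);
        }
        assert_eq!(arena.len(), 1_000);
    });
}
```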