mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-07-03 15:50:44 +00:00
We ([ParadeDB](https://github.com/paradedb/paradedb)) have restored and been using the removed [index sorting](https://github.com/quickwit-oss/tantivy/issues/2352) feature in our Tantivy fork. Our use case is sorting the index by Postgres' internal `ctid` identifier. Results returned from Tantivy must be checked against Postgres' visibility map, and checking them in ctid order is much more cache friendly, resulting in up to 80% speedups for certain queries. This PR is split into 5 commits, corresponding to the index sorting reversal plus bug fixes we uncovered during our usage of index sorting. | Commit | Maps to | What it does | |---|---|---| | `2aea0ad9f` | foundation ([#104](https://github.com/paradedb/tantivy/pull/104)) | Restore `SegmentComponent::TempStore` (revert of upstream #2815). Subsumes fork PR [#104](https://github.com/paradedb/tantivy/pull/104)'s CI fix. | | `9205bcb0c` | [#92](https://github.com/paradedb/tantivy/pull/92) | Restore sort-by-field (single-segment + merge paths). | | `39c790f0f` | [#101](https://github.com/paradedb/tantivy/pull/101) | Enable `sort_by` for `Str`/`Bytes` fast fields. | | `9c4341a87` | [#105](https://github.com/paradedb/tantivy/pull/105) | Native typed numeric sort-key comparison (precision/NULL fix). | | `2d9ba2418` | [#106](https://github.com/paradedb/tantivy/pull/106) | Preserve NULL ordering in numeric segment merges. | We have discussed with the Tantivy maintainers and they indicated they would be open to this PR. Another motivation for landing this PR is we are planning on contributing a significant refactor that makes Tantivy's segment components extensible, and landing that without index sorting leads to too many conflicts.
207 lines
7.4 KiB
Rust
207 lines
7.4 KiB
Rust
use std::collections::HashSet;
|
|
|
|
use rand::{rng, Rng};
|
|
|
|
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
|
use crate::schema::*;
|
|
#[allow(deprecated)]
|
|
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
|
|
|
|
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
|
assert!(searcher.segment_readers().len() < 20);
|
|
assert_eq!(searcher.num_docs() as usize, vals.len());
|
|
for segment_reader in searcher.segment_readers() {
|
|
let store_reader = segment_reader.get_store_reader(1)?;
|
|
for doc_id in 0..segment_reader.max_doc() {
|
|
let _doc: TantivyDocument = store_reader.get(doc_id)?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
#[ignore]
|
|
fn test_functional_store() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
|
|
let id_field = schema_builder.add_u64_field("id", INDEXED | STORED);
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_in_ram(schema);
|
|
let reader = index.reader()?;
|
|
|
|
let mut rng = rng();
|
|
|
|
let mut index_writer: IndexWriter =
|
|
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
|
|
|
|
let mut doc_set: Vec<u64> = Vec::new();
|
|
|
|
let mut doc_id = 0u64;
|
|
for _iteration in 0..get_num_iterations() {
|
|
let num_docs: usize = rng.random_range(0..4);
|
|
if !doc_set.is_empty() {
|
|
let doc_to_remove_id = rng.random_range(0..doc_set.len());
|
|
let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
|
|
index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
|
|
}
|
|
for _ in 0..num_docs {
|
|
doc_set.push(doc_id);
|
|
index_writer.add_document(doc!(id_field=>doc_id))?;
|
|
doc_id += 1;
|
|
}
|
|
index_writer.commit()?;
|
|
reader.reload()?;
|
|
let searcher = reader.searcher();
|
|
check_index_content(&searcher, &doc_set)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn get_num_iterations() -> usize {
|
|
std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
|
|
.map(|str| str.parse().unwrap())
|
|
.unwrap_or(2000)
|
|
}
|
|
#[test]
|
|
#[ignore]
|
|
fn test_functional_indexing_sorted() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
|
|
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
|
|
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
|
|
let text_field_options = TextOptions::default()
|
|
.set_indexing_options(
|
|
TextFieldIndexing::default()
|
|
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
|
)
|
|
.set_stored();
|
|
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
|
let schema = schema_builder.build();
|
|
|
|
let mut index_builder = Index::builder().schema(schema);
|
|
index_builder = index_builder.settings(IndexSettings {
|
|
sort_by_field: Some(IndexSortByField {
|
|
field: "id".to_string(),
|
|
order: Order::Desc,
|
|
}),
|
|
..Default::default()
|
|
});
|
|
let index = index_builder.create_from_tempdir().unwrap();
|
|
|
|
let reader = index.reader()?;
|
|
|
|
let mut rng = rng();
|
|
|
|
let mut index_writer: IndexWriter =
|
|
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
|
|
|
|
let mut committed_docs: HashSet<u64> = HashSet::new();
|
|
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
|
|
|
for _ in 0..get_num_iterations() {
|
|
let random_val = rng.random_range(0..20);
|
|
if random_val == 0 {
|
|
index_writer.commit()?;
|
|
committed_docs.extend(&uncommitted_docs);
|
|
uncommitted_docs.clear();
|
|
reader.reload()?;
|
|
let searcher = reader.searcher();
|
|
// check that everything is correct.
|
|
check_index_content(
|
|
&searcher,
|
|
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
|
|
)?;
|
|
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
|
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
|
index_writer.delete_term(doc_id_term);
|
|
} else {
|
|
uncommitted_docs.insert(random_val);
|
|
let mut doc = TantivyDocument::new();
|
|
doc.add_u64(id_field, random_val);
|
|
for i in 1u64..10u64 {
|
|
doc.add_u64(multiples_field, random_val * i);
|
|
}
|
|
doc.add_text(text_field, get_text());
|
|
index_writer.add_document(doc)?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
|
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \
|
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo \
|
|
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse \
|
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat \
|
|
non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
|
|
fn get_text() -> String {
|
|
use rand::seq::IndexedRandom;
|
|
let mut rng = rng();
|
|
let tokens: Vec<_> = LOREM.split(' ').collect();
|
|
let random_val = rng.random_range(0..20);
|
|
|
|
(0..random_val)
|
|
.map(|_| tokens.choose(&mut rng).unwrap())
|
|
.cloned()
|
|
.collect::<Vec<_>>()
|
|
.join(" ")
|
|
}
|
|
|
|
#[test]
|
|
#[ignore]
|
|
fn test_functional_indexing_unsorted() -> crate::Result<()> {
|
|
let mut schema_builder = Schema::builder();
|
|
|
|
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
|
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
|
|
let text_field_options = TextOptions::default()
|
|
.set_indexing_options(
|
|
TextFieldIndexing::default()
|
|
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
|
)
|
|
.set_stored();
|
|
let text_field = schema_builder.add_text_field("text_field", text_field_options);
|
|
let schema = schema_builder.build();
|
|
|
|
let index = Index::create_from_tempdir(schema)?;
|
|
let reader = index.reader()?;
|
|
|
|
let mut rng = rng();
|
|
|
|
let mut index_writer: IndexWriter =
|
|
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
|
|
|
|
let mut committed_docs: HashSet<u64> = HashSet::new();
|
|
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
|
|
|
for _ in 0..get_num_iterations() {
|
|
let random_val = rng.random_range(0..20);
|
|
if random_val == 0 {
|
|
index_writer.commit()?;
|
|
committed_docs.extend(&uncommitted_docs);
|
|
uncommitted_docs.clear();
|
|
reader.reload()?;
|
|
let searcher = reader.searcher();
|
|
// check that everything is correct.
|
|
check_index_content(
|
|
&searcher,
|
|
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
|
|
)?;
|
|
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
|
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
|
index_writer.delete_term(doc_id_term);
|
|
} else {
|
|
uncommitted_docs.insert(random_val);
|
|
let mut doc = TantivyDocument::new();
|
|
doc.add_u64(id_field, random_val);
|
|
for i in 1u64..10u64 {
|
|
doc.add_u64(multiples_field, random_val * i);
|
|
}
|
|
doc.add_text(text_field, get_text());
|
|
index_writer.add_document(doc)?;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|