Compare commits

...

1 Commits

Author SHA1 Message Date
BubbleCal
4b24e28539 Add skip_merge option for FTS indexes 2026-01-20 21:32:41 +08:00
9 changed files with 40 additions and 2 deletions

View File

@@ -558,6 +558,12 @@ export interface FtsOptions {
* Whether to index only the prefix of each token when using the ngram tokenizer.
*/
prefixOnly?: boolean;
/**
* Whether to skip the partition merge stage after indexing.
* Useful for distributed indexing where merges are handled separately.
*/
skipMerge?: boolean;
}
export class Index {
@@ -726,6 +732,7 @@ export class Index {
options?.ngramMinLength,
options?.ngramMaxLength,
options?.prefixOnly,
options?.skipMerge,
),
);
}

View File

@@ -157,6 +157,7 @@ impl Index {
ngram_min_length: Option<u32>,
ngram_max_length: Option<u32>,
prefix_only: Option<bool>,
skip_merge: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
@@ -192,6 +193,9 @@ impl Index {
if let Some(prefix_only) = prefix_only {
opts = opts.ngram_prefix_only(prefix_only);
}
if let Some(skip_merge) = skip_merge {
opts = opts.skip_merge(skip_merge);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),

View File

@@ -127,6 +127,9 @@ class FTS:
ascii_folding : bool, default True
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
skip_merge : bool, default False
Whether to skip the partition merge stage after indexing. This can be
useful for distributed indexing where merges are handled separately.
"""
with_position: bool = False
@@ -140,6 +143,7 @@ class FTS:
ngram_min_length: int = 3
ngram_max_length: int = 3
prefix_only: bool = False
skip_merge: bool = False
@dataclass

View File

@@ -176,6 +176,7 @@ class RemoteTable(Table):
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
skip_merge: bool = False,
name: Optional[str] = None,
):
config = FTS(
@@ -190,6 +191,7 @@ class RemoteTable(Table):
ngram_min_length=ngram_min_length,
ngram_max_length=ngram_max_length,
prefix_only=prefix_only,
skip_merge=skip_merge,
)
LOOP.run(
self._table.create_index(

View File

@@ -892,6 +892,7 @@ class Table(ABC):
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
skip_merge: bool = False,
wait_timeout: Optional[timedelta] = None,
name: Optional[str] = None,
):
@@ -956,6 +957,9 @@ class Table(ABC):
The maximum length of an n-gram.
prefix_only: bool, default False
Whether to only index the prefix of the token for ngram tokenizer.
skip_merge: bool, default False
Only available with use_tantivy=False.
If True, skip the partition merge stage after indexing.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
name: str, optional
@@ -2259,6 +2263,7 @@ class LanceTable(Table):
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
skip_merge: bool = False,
name: Optional[str] = None,
):
if not use_tantivy:
@@ -2282,6 +2287,8 @@ class LanceTable(Table):
else:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
tokenizer_configs["skip_merge"] = skip_merge
config = FTS(
**tokenizer_configs,
)

View File

@@ -405,7 +405,10 @@ def test_table_create_indices():
# Test create_fts_index with custom name
table.create_fts_index(
"text", wait_timeout=timedelta(seconds=2), name="custom_fts_idx"
"text",
wait_timeout=timedelta(seconds=2),
name="custom_fts_idx",
skip_merge=True,
)
# Test create_index with custom name
@@ -427,6 +430,7 @@ def test_table_create_indices():
fts_req = received_requests[1]
assert "name" in fts_req
assert fts_req["name"] == "custom_fts_idx"
assert fts_req["skip_merge"] is True
# Check vector index request has custom name
vector_req = received_requests[2]

View File

@@ -50,7 +50,8 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
.ascii_folding(params.ascii_folding)
.ngram_min_length(params.ngram_min_length)
.ngram_max_length(params.ngram_max_length)
.ngram_prefix_only(params.prefix_only);
.ngram_prefix_only(params.prefix_only)
.skip_merge(params.skip_merge);
Ok(LanceDbIndex::FTS(inner_opts))
},
"IvfFlat" => {
@@ -179,6 +180,7 @@ struct FtsParams {
ngram_min_length: u32,
ngram_max_length: u32,
prefix_only: bool,
skip_merge: bool,
}
#[derive(FromPyObject)]

View File

@@ -53,5 +53,8 @@ pub struct LabelListIndexBuilder {}
pub use lance_index::scalar::inverted::query::*;
pub use lance_index::scalar::FullTextSearchQuery;
/// Builder for full text search (FTS) index parameters.
///
/// Use [`FtsIndexBuilder::skip_merge`] to skip the partition merge stage after indexing.
pub use lance_index::scalar::InvertedIndexParams as FtsIndexBuilder;
pub use lance_index::scalar::InvertedIndexParams;

View File

@@ -2624,6 +2624,11 @@ mod tests {
serde_json::to_value(InvertedIndexParams::default()).unwrap(),
Index::FTS(Default::default()),
),
(
"FTS",
serde_json::to_value(InvertedIndexParams::default().skip_merge(true)).unwrap(),
Index::FTS(InvertedIndexParams::default().skip_merge(true)),
),
];
for (index_type, expected_body, index) in cases {