mirror of
https://github.com/lancedb/lancedb.git
synced 2026-03-26 02:20:40 +00:00
Compare commits
1 Commits
python-v0.
...
yang/fts-s
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4b24e28539 |
@@ -558,6 +558,12 @@ export interface FtsOptions {
|
||||
* Whether to only index the prefix of the token for the ngram tokenizer.
|
||||
*/
|
||||
prefixOnly?: boolean;
|
||||
|
||||
/**
|
||||
* Whether to skip the partition merge stage after indexing.
|
||||
* Useful for distributed indexing where merges are handled separately.
|
||||
*/
|
||||
skipMerge?: boolean;
|
||||
}
|
||||
|
||||
export class Index {
|
||||
@@ -726,6 +732,7 @@ export class Index {
|
||||
options?.ngramMinLength,
|
||||
options?.ngramMaxLength,
|
||||
options?.prefixOnly,
|
||||
options?.skipMerge,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -157,6 +157,7 @@ impl Index {
|
||||
ngram_min_length: Option<u32>,
|
||||
ngram_max_length: Option<u32>,
|
||||
prefix_only: Option<bool>,
|
||||
skip_merge: Option<bool>,
|
||||
) -> Self {
|
||||
let mut opts = FtsIndexBuilder::default();
|
||||
if let Some(with_position) = with_position {
|
||||
@@ -192,6 +193,9 @@ impl Index {
|
||||
if let Some(prefix_only) = prefix_only {
|
||||
opts = opts.ngram_prefix_only(prefix_only);
|
||||
}
|
||||
if let Some(skip_merge) = skip_merge {
|
||||
opts = opts.skip_merge(skip_merge);
|
||||
}
|
||||
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||
|
||||
@@ -127,6 +127,9 @@ class FTS:
|
||||
ascii_folding : bool, default True
|
||||
Whether to fold ASCII characters. This converts accented characters to
|
||||
their ASCII equivalent. For example, "café" would be converted to "cafe".
|
||||
skip_merge : bool, default False
|
||||
Whether to skip the partition merge stage after indexing. This can be
|
||||
useful for distributed indexing where merges are handled separately.
|
||||
"""
|
||||
|
||||
with_position: bool = False
|
||||
@@ -140,6 +143,7 @@ class FTS:
|
||||
ngram_min_length: int = 3
|
||||
ngram_max_length: int = 3
|
||||
prefix_only: bool = False
|
||||
skip_merge: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -176,6 +176,7 @@ class RemoteTable(Table):
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
skip_merge: bool = False,
|
||||
name: Optional[str] = None,
|
||||
):
|
||||
config = FTS(
|
||||
@@ -190,6 +191,7 @@ class RemoteTable(Table):
|
||||
ngram_min_length=ngram_min_length,
|
||||
ngram_max_length=ngram_max_length,
|
||||
prefix_only=prefix_only,
|
||||
skip_merge=skip_merge,
|
||||
)
|
||||
LOOP.run(
|
||||
self._table.create_index(
|
||||
|
||||
@@ -892,6 +892,7 @@ class Table(ABC):
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
skip_merge: bool = False,
|
||||
wait_timeout: Optional[timedelta] = None,
|
||||
name: Optional[str] = None,
|
||||
):
|
||||
@@ -956,6 +957,9 @@ class Table(ABC):
|
||||
The maximum length of an n-gram.
|
||||
prefix_only: bool, default False
|
||||
Whether to only index the prefix of the token for the ngram tokenizer.
|
||||
skip_merge: bool, default False
|
||||
Only available with use_tantivy=False.
|
||||
If True, skip the partition merge stage after indexing.
|
||||
wait_timeout: timedelta, optional
|
||||
The timeout to wait if indexing is asynchronous.
|
||||
name: str, optional
|
||||
@@ -2259,6 +2263,7 @@ class LanceTable(Table):
|
||||
ngram_min_length: int = 3,
|
||||
ngram_max_length: int = 3,
|
||||
prefix_only: bool = False,
|
||||
skip_merge: bool = False,
|
||||
name: Optional[str] = None,
|
||||
):
|
||||
if not use_tantivy:
|
||||
@@ -2282,6 +2287,8 @@ class LanceTable(Table):
|
||||
else:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
|
||||
tokenizer_configs["skip_merge"] = skip_merge
|
||||
|
||||
config = FTS(
|
||||
**tokenizer_configs,
|
||||
)
|
||||
|
||||
@@ -405,7 +405,10 @@ def test_table_create_indices():
|
||||
|
||||
# Test create_fts_index with custom name
|
||||
table.create_fts_index(
|
||||
"text", wait_timeout=timedelta(seconds=2), name="custom_fts_idx"
|
||||
"text",
|
||||
wait_timeout=timedelta(seconds=2),
|
||||
name="custom_fts_idx",
|
||||
skip_merge=True,
|
||||
)
|
||||
|
||||
# Test create_index with custom name
|
||||
@@ -427,6 +430,7 @@ def test_table_create_indices():
|
||||
fts_req = received_requests[1]
|
||||
assert "name" in fts_req
|
||||
assert fts_req["name"] == "custom_fts_idx"
|
||||
assert fts_req["skip_merge"] is True
|
||||
|
||||
# Check vector index request has custom name
|
||||
vector_req = received_requests[2]
|
||||
|
||||
@@ -50,7 +50,8 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
|
||||
.ascii_folding(params.ascii_folding)
|
||||
.ngram_min_length(params.ngram_min_length)
|
||||
.ngram_max_length(params.ngram_max_length)
|
||||
.ngram_prefix_only(params.prefix_only);
|
||||
.ngram_prefix_only(params.prefix_only)
|
||||
.skip_merge(params.skip_merge);
|
||||
Ok(LanceDbIndex::FTS(inner_opts))
|
||||
},
|
||||
"IvfFlat" => {
|
||||
@@ -179,6 +180,7 @@ struct FtsParams {
|
||||
ngram_min_length: u32,
|
||||
ngram_max_length: u32,
|
||||
prefix_only: bool,
|
||||
skip_merge: bool,
|
||||
}
|
||||
|
||||
#[derive(FromPyObject)]
|
||||
|
||||
@@ -53,5 +53,8 @@ pub struct LabelListIndexBuilder {}
|
||||
|
||||
pub use lance_index::scalar::inverted::query::*;
|
||||
pub use lance_index::scalar::FullTextSearchQuery;
|
||||
/// Builder for full text search (FTS) index parameters.
|
||||
///
|
||||
/// Use [`FtsIndexBuilder::skip_merge`] to skip the partition merge stage after indexing.
|
||||
pub use lance_index::scalar::InvertedIndexParams as FtsIndexBuilder;
|
||||
pub use lance_index::scalar::InvertedIndexParams;
|
||||
|
||||
@@ -2624,6 +2624,11 @@ mod tests {
|
||||
serde_json::to_value(InvertedIndexParams::default()).unwrap(),
|
||||
Index::FTS(Default::default()),
|
||||
),
|
||||
(
|
||||
"FTS",
|
||||
serde_json::to_value(InvertedIndexParams::default().skip_merge(true)).unwrap(),
|
||||
Index::FTS(InvertedIndexParams::default().skip_merge(true)),
|
||||
),
|
||||
];
|
||||
|
||||
for (index_type, expected_body, index) in cases {
|
||||
|
||||
Reference in New Issue
Block a user