Mirror of https://github.com/lancedb/lancedb.git
feat!: upgrade lance to v0.28.0 (#2404)
This introduces some breaking changes to the Rust API for creating FTS indexes, and the default index params have changed.

## Summary by CodeRabbit

- **New Features**
  - Updated default settings for full-text search (FTS) index creation: stemming, stop-word removal, and ASCII folding are now enabled by default, while token position storage is disabled by default.
- **Refactor**
  - Simplified and streamlined the configuration and handling of FTS index parameters for improved maintainability and consistency across interfaces.
  - Enhanced serialization and request construction for FTS index parameters to reduce manual handling and improve code clarity.
  - Improved test coverage by explicitly enabling positional indexing in FTS tests to support phrase queries.
- **Chores**
  - Upgraded all internal dependencies related to FTS indexing to the latest version for enhanced compatibility and performance.
  - Updated package versions for Node.js, Python, and Rust components to the latest beta releases.
  - Improved CI workflows by adding Rust toolchain setup with formatting and linting tools.

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
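For Python users, the practical upshot of the new defaults is that phrase queries now require opting back into positional indexing, while stemming, stop-word removal, and ASCII folding happen automatically. A minimal migration sketch against the synchronous API touched in the diffs below (the database path and table name are placeholders):

```python
import lancedb

db = lancedb.connect("data/sample-lancedb")  # placeholder path
table = db.open_table("my_table")            # placeholder table name

# Positions are no longer stored by default; re-enable them explicitly
# if phrase queries such as '"puppy runs"' are needed.
table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)

# Stemming, stop-word removal, and ASCII folding are now on by default; pass
# stem=False, remove_stop_words=False, ascii_folding=False to keep the old analyzer.
results = table.search('"puppy runs"').limit(10).to_list()
```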
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.22.2-beta.0"
+version = "0.23.0-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -102,7 +102,7 @@ class FTS:

     Attributes
     ----------
-    with_position : bool, default True
+    with_position : bool, default False
         Whether to store the position of the token in the document. Setting this
         to False can reduce the size of the index and improve indexing speed,
         but it will disable support for phrase queries.
@@ -118,25 +118,25 @@ class FTS:
         ignored.
     lower_case : bool, default True
         Whether to convert the token to lower case. This makes queries case-insensitive.
-    stem : bool, default False
+    stem : bool, default True
         Whether to stem the token. Stemming reduces words to their root form.
         For example, in English "running" and "runs" would both be reduced to "run".
-    remove_stop_words : bool, default False
+    remove_stop_words : bool, default True
         Whether to remove stop words. Stop words are common words that are often
         removed from text before indexing. For example, in English "the" and "and".
-    ascii_folding : bool, default False
+    ascii_folding : bool, default True
         Whether to fold ASCII characters. This converts accented characters to
         their ASCII equivalent. For example, "café" would be converted to "cafe".
     """

-    with_position: bool = True
+    with_position: bool = False
     base_tokenizer: Literal["simple", "raw", "whitespace"] = "simple"
     language: str = "English"
     max_token_length: Optional[int] = 40
     lower_case: bool = True
-    stem: bool = False
-    remove_stop_words: bool = False
-    ascii_folding: bool = False
+    stem: bool = True
+    remove_stop_words: bool = True
+    ascii_folding: bool = True


 @dataclass
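As a quick illustration of what the new dataclass defaults mean for callers, here is a small sketch; the `lancedb.index.FTS` import path is assumed from how the tests below use the class:

```python
from lancedb.index import FTS  # import path assumed

cfg = FTS()
assert cfg.with_position is False      # positions off: smaller, faster index, no phrase queries
assert cfg.stem is True                # "running" and "runs" both reduce to "run"
assert cfg.remove_stop_words is True   # drops common words such as "the" and "and"
assert cfg.ascii_folding is True       # "café" is indexed as "cafe"

# Phrase queries need positions, so opt back in explicitly:
phrase_cfg = FTS(with_position=True)
```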
@@ -149,15 +149,15 @@ class RemoteTable(Table):
         *,
         replace: bool = False,
         wait_timeout: timedelta = None,
-        with_position: bool = True,
+        with_position: bool = False,
         # tokenizer configs:
         base_tokenizer: str = "simple",
         language: str = "English",
         max_token_length: Optional[int] = 40,
         lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
     ):
         config = FTS(
             with_position=with_position,
@@ -829,15 +829,15 @@ class Table(ABC):
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
         use_tantivy: bool = True,
         tokenizer_name: Optional[str] = None,
-        with_position: bool = True,
+        with_position: bool = False,
         # tokenizer configs:
         base_tokenizer: BaseTokenizerType = "simple",
         language: str = "English",
         max_token_length: Optional[int] = 40,
         lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
         wait_timeout: Optional[timedelta] = None,
     ):
         """Create a full-text search index on the table.
@@ -867,7 +867,7 @@ class Table(ABC):
         use_tantivy: bool, default True
             If True, use the legacy full-text search implementation based on tantivy.
             If False, use the new full-text search implementation based on lance-index.
-        with_position: bool, default True
+        with_position: bool, default False
             Only available with use_tantivy=False
             If False, do not store the positions of the terms in the text.
             This can reduce the size of the index and improve indexing speed.
@@ -885,13 +885,13 @@ class Table(ABC):
         lower_case : bool, default True
             Whether to convert the token to lower case. This makes queries
             case-insensitive.
-        stem : bool, default False
+        stem : bool, default True
             Whether to stem the token. Stemming reduces words to their root form.
             For example, in English "running" and "runs" would both be reduced to "run".
-        remove_stop_words : bool, default False
+        remove_stop_words : bool, default True
             Whether to remove stop words. Stop words are common words that are often
             removed from text before indexing. For example, in English "the" and "and".
-        ascii_folding : bool, default False
+        ascii_folding : bool, default True
             Whether to fold ASCII characters. This converts accented characters to
             their ASCII equivalent. For example, "café" would be converted to "cafe".
         wait_timeout: timedelta, optional
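Callers who want to keep the pre-upgrade indexing behavior on the synchronous API can spell out the old defaults instead of relying on the new ones; a sketch against the `create_fts_index` signature documented above (`table` is a placeholder handle):

```python
# Pin the pre-0.28.0 analyzer behavior explicitly rather than relying on defaults.
table.create_fts_index(
    "text",
    use_tantivy=False,       # new lance-index based implementation
    with_position=True,      # keep phrase-query support
    stem=False,
    remove_stop_words=False,
    ascii_folding=False,
    replace=True,
)
```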
@@ -1972,15 +1972,15 @@ class LanceTable(Table):
         writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
         use_tantivy: bool = True,
         tokenizer_name: Optional[str] = None,
-        with_position: bool = True,
+        with_position: bool = False,
         # tokenizer configs:
         base_tokenizer: BaseTokenizerType = "simple",
         language: str = "English",
         max_token_length: Optional[int] = 40,
         lower_case: bool = True,
-        stem: bool = False,
-        remove_stop_words: bool = False,
-        ascii_folding: bool = False,
+        stem: bool = True,
+        remove_stop_words: bool = True,
+        ascii_folding: bool = True,
     ):
         if not use_tantivy:
             if not isinstance(field_names, str):
@@ -1990,6 +1990,7 @@ class LanceTable(Table):
             tokenizer_configs = {
                 "base_tokenizer": base_tokenizer,
                 "language": language,
+                "with_position": with_position,
                 "max_token_length": max_token_length,
                 "lower_case": lower_case,
                 "stem": stem,
@@ -2000,7 +2001,6 @@ class LanceTable(Table):
             tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)

         config = FTS(
-            with_position=with_position,
             **tokenizer_configs,
         )

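The net effect of the two hunks above is that `with_position` now travels inside `tokenizer_configs` and is unpacked straight into `FTS` rather than being passed separately; roughly equivalent to this sketch (values shown are the new defaults):

```python
tokenizer_configs = {
    "base_tokenizer": "simple",
    "language": "English",
    "with_position": False,   # now carried in the tokenizer config dict
    "max_token_length": 40,
    "lower_case": True,
    "stem": True,
    "remove_stop_words": True,
    "ascii_folding": True,
}
config = FTS(**tokenizer_configs)
```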
@@ -156,6 +156,9 @@ async def test_vector_search_async():
     # --8<-- [end:search_result_async_as_list]


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_fuzzy_query():
     uri = "data/fuzzy-example"
     db = lancedb.connect(uri)
@@ -189,6 +192,9 @@ def test_fts_fuzzy_query():
     }


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_boost_query():
     uri = "data/boost-example"
     db = lancedb.connect(uri)
@@ -234,6 +240,9 @@ def test_fts_boost_query():
     )


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 def test_fts_native():
     # --8<-- [start:basic_fts]
     uri = "data/sample-lancedb"
@@ -282,6 +291,9 @@ def test_fts_native():
     # --8<-- [end:fts_incremental_index]


+@pytest.mark.skipif(
+    os.name == "nt", reason="Need to fix https://github.com/lancedb/lance/issues/3905"
+)
 @pytest.mark.asyncio
 async def test_fts_native_async():
     # --8<-- [start:basic_fts_async]
@@ -287,7 +287,7 @@ def test_search_fts_phrase_query(table):
         assert False
     except Exception:
         pass
-    table.create_fts_index("text", use_tantivy=False, replace=True)
+    table.create_fts_index("text", use_tantivy=False, with_position=True, replace=True)
     results = table.search("puppy").limit(100).to_list()
     phrase_results = table.search('"puppy runs"').limit(100).to_list()
     assert len(results) > len(phrase_results)
@@ -312,7 +312,7 @@ async def test_search_fts_phrase_query_async(async_table):
         assert False
     except Exception:
         pass
-    await async_table.create_index("text", config=FTS())
+    await async_table.create_index("text", config=FTS(with_position=True))
     results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
     phrase_results = (
         await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
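The same opt-in applies on the async path exercised by this test; a self-contained sketch, assuming `lancedb.connect_async` and an existing table named `docs` with a `text` column (both placeholders):

```python
import lancedb
from lancedb.index import FTS  # import path assumed


async def phrase_search(uri: str):
    db = await lancedb.connect_async(uri)
    table = await db.open_table("docs")  # placeholder table name
    # Positions must now be enabled explicitly for phrase queries to work.
    await table.create_index("text", config=FTS(with_position=True))
    return await table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
```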
@@ -649,7 +649,7 @@ def test_fts_on_list(mem_db: DBConnection):
         }
     )
     table = mem_db.create_table("test", data=data)
-    table.create_fts_index("text", use_tantivy=False)
+    table.create_fts_index("text", use_tantivy=False, with_position=True)

     res = table.search("lance").limit(5).to_list()
     assert len(res) == 3
@@ -3,7 +3,7 @@

 use lancedb::index::vector::IvfFlatIndexBuilder;
 use lancedb::index::{
-    scalar::{BTreeIndexBuilder, FtsIndexBuilder, TokenizerConfig},
+    scalar::{BTreeIndexBuilder, FtsIndexBuilder},
     vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
     Index as LanceDbIndex,
 };
@@ -38,19 +38,17 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
         "LabelList" => Ok(LanceDbIndex::LabelList(Default::default())),
         "FTS" => {
             let params = source.extract::<FtsParams>()?;
-            let inner_opts = TokenizerConfig::default()
+            let inner_opts = FtsIndexBuilder::default()
                 .base_tokenizer(params.base_tokenizer)
                 .language(&params.language)
                 .map_err(|_| PyValueError::new_err(format!("LanceDB does not support the requested language: '{}'", params.language)))?
+                .with_position(params.with_position)
                 .lower_case(params.lower_case)
                 .max_token_length(params.max_token_length)
                 .remove_stop_words(params.remove_stop_words)
                 .stem(params.stem)
                 .ascii_folding(params.ascii_folding);
-            let mut opts = FtsIndexBuilder::default()
-                .with_position(params.with_position);
-            opts.tokenizer_configs = inner_opts;
-            Ok(LanceDbIndex::FTS(opts))
+            Ok(LanceDbIndex::FTS(inner_opts))
         },
         "IvfFlat" => {
             let params = source.extract::<IvfFlatParams>()?;