feat!: upgrade lance to 0.19.1 (#1762)

BREAKING CHANGE: the default tokenizer no longer performs stemming or
stop-word removal. Users who want those behaviors must now enable the
corresponding options explicitly.

- upgrade lance to 0.19.1
- update the FTS docs
- update the FTS API

Upstream change notes:
https://github.com/lancedb/lance/releases/tag/v0.19.1

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
BubbleCal
2024-10-30 00:03:52 +08:00
committed by GitHub
parent b9802a0d23
commit 32fdcf97db
16 changed files with 459 additions and 166 deletions

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.18.3-beta.2",
"pylance==0.19.1",
"requests>=2.31.0",
"tqdm>=4.27.0",
"pydantic>=1.10",

View File

@@ -7,6 +7,27 @@ from ._lancedb import (
IndexConfig,
)
# Maps 2-letter language codes (as used in legacy "xx_stem" tokenizer names)
# to the language names accepted by the underlying tokenizer.
# NOTE(review): most codes follow ISO 639-1, but "du" (Dutch, ISO "nl") and
# "gr" (Greek, ISO "el") do not — presumably chosen to match upstream
# lance/tantivy naming; confirm before changing.
lang_mapping = {
"ar": "Arabic",
"da": "Danish",
"du": "Dutch",
"en": "English",
"fi": "Finnish",
"fr": "French",
"de": "German",
"gr": "Greek",
"hu": "Hungarian",
"it": "Italian",
"no": "Norwegian",
"pt": "Portuguese",
"ro": "Romanian",
"ru": "Russian",
"es": "Spanish",
"sv": "Swedish",
"ta": "Tamil",
"tr": "Turkish",
}
class BTree:
"""Describes a btree index configuration
@@ -78,7 +99,17 @@ class FTS:
For example, it works with `title`, `description`, `content`, etc.
"""
def __init__(self, with_position: bool = True):
def __init__(
self,
with_position: bool = True,
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
self._inner = LanceDbIndex.fts(with_position=with_position)

View File

@@ -55,6 +55,7 @@ from .util import (
safe_import_polars,
value_to_sql,
)
from .index import lang_mapping
if TYPE_CHECKING:
import PIL
@@ -497,10 +498,18 @@ class Table(ABC):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
"""Create a full-text search index on the table.
@@ -526,7 +535,6 @@ class Table(ABC):
The tokenizer to use for the index. Can be "raw", "default", "whitespace",
or the 2-letter language code followed by "_stem" — so for English it would
be "en_stem".
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
only available with use_tantivy=True for now
use_tantivy: bool, default True
If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index.
@@ -1341,14 +1349,33 @@ class LanceTable(Table):
ordering_field_names: Union[str, List[str]] = None,
*,
replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default",
use_tantivy: bool = True,
tokenizer_name: Optional[str] = None,
with_position: bool = True,
# tokenizer configs:
base_tokenizer: str = "simple",
language: str = "English",
max_token_length: Optional[int] = 40,
lower_case: bool = True,
stem: bool = False,
remove_stop_words: bool = False,
ascii_folding: bool = False,
):
if not use_tantivy:
if not isinstance(field_names, str):
raise ValueError("field_names must be a string when use_tantivy=False")
tokenizer_configs = {
"base_tokenizer": base_tokenizer,
"language": language,
"max_token_length": max_token_length,
"lower_case": lower_case,
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
}
if tokenizer_name is not None:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
# delete the existing legacy index if it exists
if replace:
path, fs, exist = self._get_fts_index_path()
@@ -1359,6 +1386,7 @@ class LanceTable(Table):
index_type="INVERTED",
replace=replace,
with_position=with_position,
**tokenizer_configs,
)
return
@@ -1381,6 +1409,8 @@ class LanceTable(Table):
"Full-text search is only supported on the local filesystem"
)
if tokenizer_name is None:
tokenizer_name = "default"
index = create_index(
path,
field_names,
@@ -1395,6 +1425,56 @@ class LanceTable(Table):
writer_heap_size=writer_heap_size,
)
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
    """Translate a legacy tokenizer name into explicit tokenizer settings.

    Parameters
    ----------
    tokenizer_name : str
        One of the presets "default", "raw", "whitespace", or a stemming
        tokenizer of the form ``"<2-letter language code>_stem"``
        (e.g. ``"en_stem"``).

    Returns
    -------
    dict
        Keyword arguments for the FTS index builder: ``base_tokenizer``,
        ``language``, ``max_token_length``, ``lower_case``, ``stem``,
        ``remove_stop_words`` and ``ascii_folding``.

    Raises
    ------
    ValueError
        If ``tokenizer_name`` is neither a known preset nor a valid
        ``"xx_stem"`` name, or if the language code is not recognized.
    """

    # Single source of truth for the config shape; each preset overrides
    # only the keys on which it differs (removes three near-duplicate
    # dict literals from the original implementation).
    def _configs(
        base_tokenizer="simple",
        language="English",
        max_token_length=40,
        lower_case=True,
        stem=False,
    ) -> dict:
        return {
            "base_tokenizer": base_tokenizer,
            "language": language,
            "max_token_length": max_token_length,
            "lower_case": lower_case,
            "stem": stem,
            "remove_stop_words": False,
            "ascii_folding": False,
        }

    if tokenizer_name == "default":
        return _configs()
    if tokenizer_name in ("raw", "whitespace"):
        # These presets tokenize verbatim: no length limit, no lowercasing.
        return _configs(
            base_tokenizer=tokenizer_name, max_token_length=None, lower_case=False
        )
    # Otherwise the name must follow the "xx_stem" stemming pattern.
    if not tokenizer_name.endswith("_stem") or len(tokenizer_name) != 7:
        raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
    lang = tokenizer_name[:2]
    if lang not in lang_mapping:
        raise ValueError(f"Invalid language code {lang}")
    return _configs(language=lang_mapping[lang], stem=True)
def add(
self,
data: DATA,

View File

@@ -106,12 +106,41 @@ impl Index {
})
}
#[allow(clippy::too_many_arguments)]
#[staticmethod]
pub fn fts(with_position: Option<bool>) -> Self {
pub fn fts(
with_position: Option<bool>,
base_tokenizer: Option<String>,
language: Option<String>,
max_token_length: Option<usize>,
lower_case: Option<bool>,
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
opts = opts.with_position(with_position);
}
if let Some(base_tokenizer) = base_tokenizer {
opts.tokenizer_configs = opts.tokenizer_configs.base_tokenizer(base_tokenizer);
}
if let Some(language) = language {
opts.tokenizer_configs = opts.tokenizer_configs.language(&language).unwrap();
}
opts.tokenizer_configs = opts.tokenizer_configs.max_token_length(max_token_length);
if let Some(lower_case) = lower_case {
opts.tokenizer_configs = opts.tokenizer_configs.lower_case(lower_case);
}
if let Some(stem) = stem {
opts.tokenizer_configs = opts.tokenizer_configs.stem(stem);
}
if let Some(remove_stop_words) = remove_stop_words {
opts.tokenizer_configs = opts.tokenizer_configs.remove_stop_words(remove_stop_words);
}
if let Some(ascii_folding) = ascii_folding {
opts.tokenizer_configs = opts.tokenizer_configs.ascii_folding(ascii_folding);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
}