mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 14:49:57 +00:00
feat!: upgrade lance to 0.19.1 (#1762)
BREAKING CHANGE: the default tokenizer no longer performs stemming or stop-word removal. Users who want those behaviors must now enable them explicitly. - upgrade lance to 0.19.1 - update the FTS docs - update the FTS API Upstream change notes: https://github.com/lancedb/lance/releases/tag/v0.19.1 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com> Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
@@ -3,7 +3,7 @@ name = "lancedb"
|
||||
# version in Cargo.toml
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.18.3-beta.2",
|
||||
"pylance==0.19.1",
|
||||
"requests>=2.31.0",
|
||||
"tqdm>=4.27.0",
|
||||
"pydantic>=1.10",
|
||||
|
||||
@@ -7,6 +7,27 @@ from ._lancedb import (
|
||||
IndexConfig,
|
||||
)
|
||||
|
||||
# Two-letter language codes accepted in legacy "<code>_stem" tokenizer names,
# mapped to the language names understood by the underlying tokenizer.
# NOTE(review): "du" (rather than the ISO 639-1 code "nl") is the accepted
# code for Dutch here — presumably kept for backward compatibility; confirm
# before changing.
lang_mapping = dict(
    ar="Arabic",
    da="Danish",
    du="Dutch",
    en="English",
    fi="Finnish",
    fr="French",
    de="German",
    gr="Greek",
    hu="Hungarian",
    it="Italian",
    no="Norwegian",
    pt="Portuguese",
    ro="Romanian",
    ru="Russian",
    es="Spanish",
    sv="Swedish",
    ta="Tamil",
    tr="Turkish",
)
|
||||
|
||||
|
||||
class BTree:
|
||||
"""Describes a btree index configuration
|
||||
@@ -78,7 +99,17 @@ class FTS:
|
||||
For example, it works with `title`, `description`, `content`, etc.
|
||||
"""
|
||||
|
||||
def __init__(self, with_position: bool = True):
|
||||
def __init__(
|
||||
self,
|
||||
with_position: bool = True,
|
||||
base_tokenizer: str = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
stem: bool = False,
|
||||
remove_stop_words: bool = False,
|
||||
ascii_folding: bool = False,
|
||||
):
|
||||
self._inner = LanceDbIndex.fts(with_position=with_position)
|
||||
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@ from .util import (
|
||||
safe_import_polars,
|
||||
value_to_sql,
|
||||
)
|
||||
from .index import lang_mapping
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import PIL
|
||||
@@ -497,10 +498,18 @@ class Table(ABC):
|
||||
ordering_field_names: Union[str, List[str]] = None,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
tokenizer_name: str = "default",
|
||||
use_tantivy: bool = True,
|
||||
tokenizer_name: Optional[str] = None,
|
||||
with_position: bool = True,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
stem: bool = False,
|
||||
remove_stop_words: bool = False,
|
||||
ascii_folding: bool = False,
|
||||
):
|
||||
"""Create a full-text search index on the table.
|
||||
|
||||
@@ -526,7 +535,6 @@ class Table(ABC):
|
||||
The tokenizer to use for the index. Can be "raw", "default" or the 2 letter
|
||||
language code followed by "_stem". So for english it would be "en_stem".
|
||||
For available languages see: https://docs.rs/tantivy/latest/tantivy/tokenizer/enum.Language.html
|
||||
only available with use_tantivy=True for now
|
||||
use_tantivy: bool, default True
|
||||
If True, use the legacy full-text search implementation based on tantivy.
|
||||
If False, use the new full-text search implementation based on lance-index.
|
||||
@@ -1341,14 +1349,33 @@ class LanceTable(Table):
|
||||
ordering_field_names: Union[str, List[str]] = None,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
tokenizer_name: str = "default",
|
||||
use_tantivy: bool = True,
|
||||
tokenizer_name: Optional[str] = None,
|
||||
with_position: bool = True,
|
||||
# tokenizer configs:
|
||||
base_tokenizer: str = "simple",
|
||||
language: str = "English",
|
||||
max_token_length: Optional[int] = 40,
|
||||
lower_case: bool = True,
|
||||
stem: bool = False,
|
||||
remove_stop_words: bool = False,
|
||||
ascii_folding: bool = False,
|
||||
):
|
||||
if not use_tantivy:
|
||||
if not isinstance(field_names, str):
|
||||
raise ValueError("field_names must be a string when use_tantivy=False")
|
||||
tokenizer_configs = {
|
||||
"base_tokenizer": base_tokenizer,
|
||||
"language": language,
|
||||
"max_token_length": max_token_length,
|
||||
"lower_case": lower_case,
|
||||
"stem": stem,
|
||||
"remove_stop_words": remove_stop_words,
|
||||
"ascii_folding": ascii_folding,
|
||||
}
|
||||
if tokenizer_name is not None:
|
||||
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
|
||||
# delete the existing legacy index if it exists
|
||||
if replace:
|
||||
path, fs, exist = self._get_fts_index_path()
|
||||
@@ -1359,6 +1386,7 @@ class LanceTable(Table):
|
||||
index_type="INVERTED",
|
||||
replace=replace,
|
||||
with_position=with_position,
|
||||
**tokenizer_configs,
|
||||
)
|
||||
return
|
||||
|
||||
@@ -1381,6 +1409,8 @@ class LanceTable(Table):
|
||||
"Full-text search is only supported on the local filesystem"
|
||||
)
|
||||
|
||||
if tokenizer_name is None:
|
||||
tokenizer_name = "default"
|
||||
index = create_index(
|
||||
path,
|
||||
field_names,
|
||||
@@ -1395,6 +1425,56 @@ class LanceTable(Table):
|
||||
writer_heap_size=writer_heap_size,
|
||||
)
|
||||
|
||||
def infer_tokenizer_configs(tokenizer_name: str) -> dict:
    """Translate a legacy tantivy-style tokenizer name into tokenizer configs.

    Parameters
    ----------
    tokenizer_name : str
        One of ``"default"``, ``"raw"``, ``"whitespace"``, or a two-letter
        language code followed by ``"_stem"`` (e.g. ``"en_stem"``).

    Returns
    -------
    dict
        Keyword arguments (``base_tokenizer``, ``language``,
        ``max_token_length``, ``lower_case``, ``stem``,
        ``remove_stop_words``, ``ascii_folding``) suitable for the
        FTS index creation API.

    Raises
    ------
    ValueError
        If ``tokenizer_name`` is not one of the recognized forms, or its
        language code is not in ``lang_mapping``.
    """
    # Shared defaults; each named tokenizer only overrides what differs.
    base = {
        "language": "English",
        "max_token_length": 40,
        "lower_case": True,
        "stem": False,
        "remove_stop_words": False,
        "ascii_folding": False,
    }
    if tokenizer_name == "default":
        return {"base_tokenizer": "simple", **base}
    if tokenizer_name in ("raw", "whitespace"):
        # Raw/whitespace tokenizers pass tokens through unmodified:
        # no length cap, no lowercasing.
        return {
            "base_tokenizer": tokenizer_name,
            **base,
            "max_token_length": None,
            "lower_case": False,
        }

    # Otherwise it must be a language-stemming name like "en_stem":
    # exactly 7 chars, 2-letter code + "_stem" suffix.
    if len(tokenizer_name) != 7 or tokenizer_name[-5:] != "_stem":
        raise ValueError(f"Invalid tokenizer name {tokenizer_name}")
    lang = tokenizer_name[:2]
    if lang not in lang_mapping:
        raise ValueError(f"Invalid language code {lang}")
    return {
        "base_tokenizer": "simple",
        **base,
        "language": lang_mapping[lang],
        "stem": True,
    }
|
||||
|
||||
def add(
|
||||
self,
|
||||
data: DATA,
|
||||
|
||||
@@ -106,12 +106,41 @@ impl Index {
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[staticmethod]
|
||||
pub fn fts(with_position: Option<bool>) -> Self {
|
||||
pub fn fts(
|
||||
with_position: Option<bool>,
|
||||
base_tokenizer: Option<String>,
|
||||
language: Option<String>,
|
||||
max_token_length: Option<usize>,
|
||||
lower_case: Option<bool>,
|
||||
stem: Option<bool>,
|
||||
remove_stop_words: Option<bool>,
|
||||
ascii_folding: Option<bool>,
|
||||
) -> Self {
|
||||
let mut opts = FtsIndexBuilder::default();
|
||||
if let Some(with_position) = with_position {
|
||||
opts = opts.with_position(with_position);
|
||||
}
|
||||
if let Some(base_tokenizer) = base_tokenizer {
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.base_tokenizer(base_tokenizer);
|
||||
}
|
||||
if let Some(language) = language {
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.language(&language).unwrap();
|
||||
}
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.max_token_length(max_token_length);
|
||||
if let Some(lower_case) = lower_case {
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.lower_case(lower_case);
|
||||
}
|
||||
if let Some(stem) = stem {
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.stem(stem);
|
||||
}
|
||||
if let Some(remove_stop_words) = remove_stop_words {
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.remove_stop_words(remove_stop_words);
|
||||
}
|
||||
if let Some(ascii_folding) = ascii_folding {
|
||||
opts.tokenizer_configs = opts.tokenizer_configs.ascii_folding(ascii_folding);
|
||||
}
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user