From 8f84328f1da3960e0cf4e2d194a0c738505e1442 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 8 May 2026 20:18:16 +0800 Subject: [PATCH] docs(python): clarify native FTS tokenizer model home --- python/python/lancedb/index.py | 8 +++++--- python/python/lancedb/table.py | 22 ++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index 73daf57b4..4fbffc50d 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -113,8 +113,8 @@ class FTS: - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. - "ngram": N-gram tokenizer for substring-style matching. - - "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. - - "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. + - "jieba/*": Jieba tokenizer loaded from Lance's language model home. + - "lindera/*": Lindera tokenizer loaded from Lance's language model home. language : str, default "English" The language to use for stemming and stop-word removal. This is not the primary way to enable CJK tokenization. @@ -136,7 +136,9 @@ class FTS: Notes ----- Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic`` - require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``. + require tokenizer models in Lance's language model home. Set + ``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data + directory under ``lance/language_models``. """ with_position: bool = False diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 3b8f4307b..82768197c 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -132,9 +132,11 @@ def _maybe_add_fts_error_note( _add_unique_note( exception, "Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' " - "require tokenizer models under LANCE_LANGUAGE_MODEL_HOME. 
Expected " - "layouts include '$LANCE_LANGUAGE_MODEL_HOME/jieba/default/...' and " - "'$LANCE_LANGUAGE_MODEL_HOME/lindera/ipadic/...'.", + "require tokenizer models in Lance's language model home. Set " + "LANCE_LANGUAGE_MODEL_HOME to override the default platform data " + "directory under 'lance/language_models'. Expected layouts include " + "'<language_model_home>/jieba/default/...' and " + "'<language_model_home>/lindera/ipadic/...'.", ) @@ -1009,8 +1011,10 @@ class Table(ABC): tokenizer_name: str, default "default" A compatibility alias for native tokenizer configs. Can be "raw", "default" or the 2 letter language code followed by "_stem". So - for english it would be "en_stem". Prefer ``base_tokenizer`` for - new code. + for english it would be "en_stem". For new native FTS indexes, use + ``base_tokenizer`` directly; ``tokenizer_name`` is a legacy + compatibility alias and does not expose model-backed tokenizer names + such as ``jieba/default`` or ``lindera/ipadic``. use_tantivy: bool, default False Deprecated legacy Tantivy parameter. Setting this to True raises an error. @@ -1024,8 +1028,8 @@ class Table(ABC): - "whitespace": Split text by whitespace, but not punctuation. - "raw": No tokenization. The entire text is treated as a single token. - "ngram": N-Gram tokenizer. - - "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. - - "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``. + - "jieba/*": Jieba tokenizer loaded from Lance's language model home. + - "lindera/*": Lindera tokenizer loaded from Lance's language model home. language : str, default "English" The language to use for stemming and stop-word removal. This is not the primary way to enable CJK tokenization. @@ -1058,7 +1062,9 @@ class Table(ABC): Notes ----- Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic`` - require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``. + require tokenizer models in Lance's language model home. 
Set + ``LANCE_LANGUAGE_MODEL_HOME`` to override the default platform data + directory under ``lance/language_models``. """ raise NotImplementedError