From 8f84328f1da3960e0cf4e2d194a0c738505e1442 Mon Sep 17 00:00:00 2001
From: Xuanwo <github@xuanwo.io>
Date: Fri, 8 May 2026 20:18:16 +0800
Subject: [PATCH] docs(python): clarify native FTS tokenizer model home

---
 python/python/lancedb/index.py |  8 +++++---
 python/python/lancedb/table.py | 22 ++++++++++++++--------
 2 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py
index 73daf57b4..4fbffc50d 100644
--- a/python/python/lancedb/index.py
+++ b/python/python/lancedb/index.py
@@ -113,8 +113,8 @@ class FTS:
         - "whitespace": Split text by whitespace, but not punctuation.
         - "raw": No tokenization. The entire text is treated as a single token.
         - "ngram": N-gram tokenizer for substring-style matching.
-        - "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
-        - "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
+        - "jieba/*": Jieba tokenizer loaded from Lance's language model home.
+        - "lindera/*": Lindera tokenizer loaded from Lance's language model home.
     language : str, default "English"
         The language to use for stemming and stop-word removal. This is not the
         primary way to enable CJK tokenization.
@@ -136,7 +136,9 @@ class FTS:
     Notes
     -----
     Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
-    require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``.
+    require tokenizer models in Lance's language model home. Set
+    ``LANCE_LANGUAGE_MODEL_HOME`` to override the default location,
+    ``lance/language_models`` under the platform data directory.
     """
 
     with_position: bool = False
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 3b8f4307b..82768197c 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -132,9 +132,11 @@ def _maybe_add_fts_error_note(
     _add_unique_note(
         exception,
         "Model-backed tokenizers such as 'jieba/default' and 'lindera/ipadic' "
-        "require tokenizer models under LANCE_LANGUAGE_MODEL_HOME. Expected "
-        "layouts include '$LANCE_LANGUAGE_MODEL_HOME/jieba/default/...' and "
-        "'$LANCE_LANGUAGE_MODEL_HOME/lindera/ipadic/...'.",
+        "require tokenizer models in Lance's language model home. Set "
+        "LANCE_LANGUAGE_MODEL_HOME to override the default platform data "
+        "directory under 'lance/language_models'. Expected layouts include "
+        "'<model-home>/jieba/default/...' and "
+        "'<model-home>/lindera/ipadic/...'.",
     )
 
 
@@ -1009,8 +1011,10 @@ class Table(ABC):
         tokenizer_name: str, default "default"
             A compatibility alias for native tokenizer configs. Can be "raw",
             "default" or the 2 letter language code followed by "_stem". So
-            for english it would be "en_stem". Prefer ``base_tokenizer`` for
-            new code.
+            for english it would be "en_stem". For new native FTS indexes, use
+            ``base_tokenizer`` directly; ``tokenizer_name`` is a legacy
+            compatibility alias and does not expose model-backed tokenizer names
+            such as ``jieba/default`` or ``lindera/ipadic``.
         use_tantivy: bool, default False
             Deprecated legacy Tantivy parameter. Setting this to True raises an
             error.
@@ -1024,8 +1028,8 @@ class Table(ABC):
             - "whitespace": Split text by whitespace, but not punctuation.
             - "raw": No tokenization. The entire text is treated as a single token.
             - "ngram": N-Gram tokenizer.
-            - "jieba/*": Jieba tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
-            - "lindera/*": Lindera tokenizer loaded from ``LANCE_LANGUAGE_MODEL_HOME``.
+            - "jieba/*": Jieba tokenizer loaded from Lance's language model home.
+            - "lindera/*": Lindera tokenizer loaded from Lance's language model home.
         language : str, default "English"
             The language to use for stemming and stop-word removal. This is not
             the primary way to enable CJK tokenization.
@@ -1058,7 +1062,9 @@ class Table(ABC):
         Notes
         -----
         Model-backed tokenizers such as ``jieba/default`` and ``lindera/ipadic``
-        require tokenizer models under ``LANCE_LANGUAGE_MODEL_HOME``.
+        require tokenizer models in Lance's language model home. Set
+        ``LANCE_LANGUAGE_MODEL_HOME`` to override the default location,
+        ``lance/language_models`` under the platform data directory.
         """
         raise NotImplementedError