Bump version: 0.17.1-beta.0 → 0.17.1-beta.1

fix: index params are ignored by RemoteTable (#1928)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-12-23 05:19:58 +00:00 · 2024-12-09 08:40:35 +00:00 · 2024-12-09 16:37:01 +08:00 · 2024-12-09 04:01:51 +00:00 · 2024-12-09 04:01:35 +00:00 · 2024-12-09 04:01:19 +00:00
32 changed files with 730 additions and 73 deletions
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.14.0"
+current_version = "0.14.1-beta.0"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -8,7 +8,7 @@
    <parent>
        <groupId>com.lancedb</groupId>
        <artifactId>lancedb-parent</artifactId>
-        <version>0.14.0-final.0</version>
+        <version>0.14.1-beta.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

--- a/java/pom.xml
+++ b/java/pom.xml
@@ -6,7 +6,7 @@

    <groupId>com.lancedb</groupId>
    <artifactId>lancedb-parent</artifactId>
-    <version>0.14.0-final.0</version>
+    <version>0.14.1-beta.0</version>
    <packaging>pom</packaging>

    <name>LanceDB Parent</name>
--- a/node/package-lock.json
+++ b/node/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "vectordb",
-  "version": "0.14.0-beta.2",
+  "version": "0.14.1-beta.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "vectordb",
-      "version": "0.14.0-beta.2",
+      "version": "0.14.1-beta.0",
      "cpu": [
        "x64",
        "arm64"
@@ -52,14 +52,14 @@
        "uuid": "^9.0.0"
      },
      "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.14.0-beta.2",
-        "@lancedb/vectordb-darwin-x64": "0.14.0-beta.2",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.14.0-beta.2",
-        "@lancedb/vectordb-linux-arm64-musl": "0.14.0-beta.2",
-        "@lancedb/vectordb-linux-x64-gnu": "0.14.0-beta.2",
-        "@lancedb/vectordb-linux-x64-musl": "0.14.0-beta.2",
-        "@lancedb/vectordb-win32-arm64-msvc": "0.14.0-beta.2",
-        "@lancedb/vectordb-win32-x64-msvc": "0.14.0-beta.2"
+        "@lancedb/vectordb-darwin-arm64": "0.14.1-beta.0",
+        "@lancedb/vectordb-darwin-x64": "0.14.1-beta.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.14.1-beta.0",
+        "@lancedb/vectordb-linux-arm64-musl": "0.14.1-beta.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.14.1-beta.0",
+        "@lancedb/vectordb-linux-x64-musl": "0.14.1-beta.0",
+        "@lancedb/vectordb-win32-arm64-msvc": "0.14.1-beta.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.14.1-beta.0"
      },
      "peerDependencies": {
        "@apache-arrow/ts": "^14.0.2",
--- a/node/package.json
+++ b/node/package.json
@@ -1,7 +1,8 @@
 {
  "name": "vectordb",
-  "version": "0.14.0",
+  "version": "0.14.1-beta.0",
  "description": " Serverless, low-latency vector database for AI applications",
+  "private": false,
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
@@ -91,13 +92,13 @@
    }
  },
  "optionalDependencies": {
-    "@lancedb/vectordb-darwin-x64": "0.14.0",
-    "@lancedb/vectordb-darwin-arm64": "0.14.0",
-    "@lancedb/vectordb-linux-x64-gnu": "0.14.0",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.14.0",
-    "@lancedb/vectordb-linux-x64-musl": "0.14.0",
-    "@lancedb/vectordb-linux-arm64-musl": "0.14.0",
-    "@lancedb/vectordb-win32-x64-msvc": "0.14.0",
-    "@lancedb/vectordb-win32-arm64-msvc": "0.14.0"
+    "@lancedb/vectordb-darwin-x64": "0.14.1-beta.0",
+    "@lancedb/vectordb-darwin-arm64": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-x64-musl": "0.14.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-musl": "0.14.1-beta.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.14.1-beta.0",
+    "@lancedb/vectordb-win32-arm64-msvc": "0.14.1-beta.0"
  }
 }
--- a/nodejs/Cargo.toml
+++ b/nodejs/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.14.0"
+version = "0.14.1-beta.0"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
--- a/nodejs/npm/darwin-arm64/package.json
+++ b/nodejs/npm/darwin-arm64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-arm64",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["darwin"],
 	"cpu": ["arm64"],
 	"main": "lancedb.darwin-arm64.node",
--- a/nodejs/npm/darwin-x64/package.json
+++ b/nodejs/npm/darwin-x64/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-darwin-x64",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["darwin"],
 	"cpu": ["x64"],
 	"main": "lancedb.darwin-x64.node",
--- a/nodejs/npm/linux-arm64-gnu/package.json
+++ b/nodejs/npm/linux-arm64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-arm64-gnu",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["linux"],
 	"cpu": ["arm64"],
 	"main": "lancedb.linux-arm64-gnu.node",
--- a/nodejs/npm/linux-arm64-musl/package.json
+++ b/nodejs/npm/linux-arm64-musl/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-arm64-musl",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["linux"],
 	"cpu": ["arm64"],
 	"main": "lancedb.linux-arm64-musl.node",
--- a/nodejs/npm/linux-x64-gnu/package.json
+++ b/nodejs/npm/linux-x64-gnu/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-x64-gnu",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["linux"],
 	"cpu": ["x64"],
 	"main": "lancedb.linux-x64-gnu.node",
--- a/nodejs/npm/linux-x64-musl/package.json
+++ b/nodejs/npm/linux-x64-musl/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-linux-x64-musl",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["linux"],
 	"cpu": ["x64"],
 	"main": "lancedb.linux-x64-musl.node",
--- a/nodejs/npm/win32-arm64-msvc/package.json
+++ b/nodejs/npm/win32-arm64-msvc/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.14.0",
+  "version": "0.14.1-beta.0",
  "os": [
    "win32"
  ],
--- a/nodejs/npm/win32-x64-msvc/package.json
+++ b/nodejs/npm/win32-x64-msvc/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@lancedb/lancedb-win32-x64-msvc",
-	"version": "0.14.0",
+	"version": "0.14.1-beta.0",
 	"os": ["win32"],
 	"cpu": ["x64"],
 	"main": "lancedb.win32-x64-msvc.node",
--- a/nodejs/package-lock.json
+++ b/nodejs/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "@lancedb/lancedb",
-  "version": "0.13.0",
+  "version": "0.14.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "@lancedb/lancedb",
-      "version": "0.13.0",
+      "version": "0.14.0",
      "cpu": [
        "x64",
        "arm64"
--- a/nodejs/package.json
+++ b/nodejs/package.json
@@ -10,7 +10,8 @@
    "vector database",
    "ann"
  ],
-  "version": "0.14.0",
+  "private": false,
+  "version": "0.14.1-beta.0",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
@@ -30,7 +31,8 @@
        "aarch64-unknown-linux-gnu",
        "x86_64-unknown-linux-musl",
        "aarch64-unknown-linux-musl",
-        "x86_64-pc-windows-msvc"
+        "x86_64-pc-windows-msvc",
+        "aarch64-pc-windows-msvc"
      ]
    }
  },
--- a/python/.bumpversion.toml
+++ b/python/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.17.0"
+current_version = "0.17.1-beta.1"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.17.0"
+version = "0.17.1-beta.1"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
--- a/python/python/lancedb/__init__.py
+++ b/python/python/lancedb/__init__.py
@@ -110,6 +110,7 @@ def connect(
            # TODO: remove this (deprecation warning downstream)
            request_thread_pool=request_thread_pool,
            client_config=client_config,
+            storage_options=storage_options,
            **kwargs,
        )

--- a/python/python/lancedb/_lancedb.pyi
+++ b/python/python/lancedb/_lancedb.pyi
@@ -79,9 +79,21 @@ class Query:
    def limit(self, limit: int): ...
    def offset(self, offset: int): ...
    def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
-    def nearest_to_text(self, query: dict) -> Query: ...
+    def nearest_to_text(self, query: dict) -> FTSQuery: ...
    async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...

+class FTSQuery:
+    def where(self, filter: str): ...
+    def select(self, columns: List[str]): ...
+    def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
+    def fast_search(self): ...
+    def with_row_id(self): ...
+    def postfilter(self): ...
+    def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
+    async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
+    async def explain_plan(self) -> str: ...
+
 class VectorQuery:
    async def execute(self) -> RecordBatchStream: ...
    def where(self, filter: str): ...
@@ -95,6 +107,24 @@ class VectorQuery:
    def refine_factor(self, refine_factor: int): ...
    def nprobes(self, nprobes: int): ...
    def bypass_vector_index(self): ...
+    def nearest_to_text(self, query: dict) -> HybridQuery: ...
+
+class HybridQuery:
+    def where(self, filter: str): ...
+    def select(self, columns: List[str]): ...
+    def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
+    def fast_search(self): ...
+    def with_row_id(self): ...
+    def postfilter(self): ...
+    def distance_type(self, distance_type: str): ...
+    def refine_factor(self, refine_factor: int): ...
+    def nprobes(self, nprobes: int): ...
+    def bypass_vector_index(self): ...
+    def to_vector_query(self) -> VectorQuery: ...
+    def to_fts_query(self) -> FTSQuery: ...
+    def get_limit(self) -> int: ...
+    def get_with_row_id(self) -> bool: ...

 class CompactionStats:
    fragments_removed: int
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -26,6 +26,7 @@ from typing import (
    Union,
 )

+import asyncio
 import deprecation
 import numpy as np
 import pyarrow as pa
@@ -44,6 +45,8 @@ if TYPE_CHECKING:
    import polars as pl

    from ._lancedb import Query as LanceQuery
+    from ._lancedb import FTSQuery as LanceFTSQuery
+    from ._lancedb import HybridQuery as LanceHybridQuery
    from ._lancedb import VectorQuery as LanceVectorQuery
    from .common import VEC
    from .pydantic import LanceModel
@@ -1124,35 +1127,55 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
            fts_results = fts_future.result()
            vector_results = vector_future.result()

-        # convert to ranks first if needed
-        if self._norm == "rank":
-            vector_results = self._rank(vector_results, "_distance")
-            fts_results = self._rank(fts_results, "_score")
+        return self._combine_hybrid_results(
+            fts_results=fts_results,
+            vector_results=vector_results,
+            norm=self._norm,
+            fts_query=self._fts_query._query,
+            reranker=self._reranker,
+            limit=self._limit,
+            with_row_ids=self._with_row_id,
+        )
+
+    @staticmethod
+    def _combine_hybrid_results(
+        fts_results: pa.Table,
+        vector_results: pa.Table,
+        norm: str,
+        fts_query: str,
+        reranker,
+        limit: int,
+        with_row_ids: bool,
+    ) -> pa.Table:
+        if norm == "rank":
+            vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
+            fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")

        # normalize the scores to be between 0 and 1, 0 being most relevant
-        vector_results = self._normalize_scores(vector_results, "_distance")
+        vector_results = LanceHybridQueryBuilder._normalize_scores(
+            vector_results, "_distance"
+        )

        # In fts higher scores represent relevance. Not inverting them here as
        # rerankers might need to preserve this score to support `return_score="all"`
-        fts_results = self._normalize_scores(fts_results, "_score")
+        fts_results = LanceHybridQueryBuilder._normalize_scores(fts_results, "_score")

-        results = self._reranker.rerank_hybrid(
-            self._fts_query._query, vector_results, fts_results
-        )
+        results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)

        check_reranker_result(results)

-        # apply limit after reranking
-        results = results.slice(length=self._limit)
+        results = results.slice(length=limit)

-        if not self._with_row_id:
+        if not with_row_ids:
            results = results.drop(["_rowid"])
+
        return results

    def to_batches(self):
        raise NotImplementedError("to_batches not yet supported on a hybrid query")

-    def _rank(self, results: pa.Table, column: str, ascending: bool = True):
+    @staticmethod
+    def _rank(results: pa.Table, column: str, ascending: bool = True):
        if len(results) == 0:
            return results
        # Get the _score column from results
@@ -1169,7 +1192,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
        )
        return results

-    def _normalize_scores(self, results: pa.Table, column: str, invert=False):
+    @staticmethod
+    def _normalize_scores(results: pa.Table, column: str, invert=False):
        if len(results) == 0:
            return results
        # Get the _score column from results
@@ -1635,7 +1659,7 @@ class AsyncQuery(AsyncQueryBase):

    def nearest_to_text(
        self, query: str, columns: Union[str, List[str]] = []
-    ) -> AsyncQuery:
+    ) -> AsyncFTSQuery:
        """
        Find the documents that are most relevant to the given text query.

@@ -1658,8 +1682,90 @@ class AsyncQuery(AsyncQueryBase):
        """
        if isinstance(columns, str):
            columns = [columns]
-        self._inner.nearest_to_text({"query": query, "columns": columns})
-        return self
+        return AsyncFTSQuery(
+            self._inner.nearest_to_text({"query": query, "columns": columns})
+        )
+
+
+class AsyncFTSQuery(AsyncQueryBase):
+    """A query for full text search for LanceDB."""
+
+    def __init__(self, inner: LanceFTSQuery):
+        super().__init__(inner)
+        self._inner = inner
+
+    def get_query(self):
+        return self._inner.get_query()  # text is handed to the reranker
+
+    def nearest_to(
+        self,
+        query_vector: Union[VEC, Tuple, List[VEC]],
+    ) -> AsyncHybridQuery:
+        """
+        In addition to doing text search on the LanceDB Table, also
+        find the nearest vectors to the given query vector.
+
+        This converts the query from a FTS Query to a Hybrid query. Results
+        from the vector search will be combined with results from the FTS query.
+
+        This method will attempt to convert the input to the query vector
+        expected by the embedding model.  If the input cannot be converted
+        then an error will be thrown.
+
+        By default, there is no embedding model, and the input should be
+        something that can be converted to a pyarrow array of floats.  This
+        includes lists, numpy arrays, and tuples.
+
+        If there is only one vector column (a column whose data type is a
+        fixed size list of floats) then the column does not need to be specified.
+        If there is more than one vector column you must use
+        [AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
+        which column you would like to compare with.
+
+        If no index has been created on the vector column then a vector query
+        will perform a distance comparison between the query vector and every
+        vector in the database and then sort the results.  This is sometimes
+        called a "flat search"
+
+        For small databases, with tens of thousands of vectors or less, this can
+        be reasonably fast.  In larger databases you should create a vector index
+        on the column.  If there is a vector index then an "approximate" nearest
+        neighbor search (frequently called an ANN search) will be performed.  This
+        search is much faster, but the results will be approximate.
+
+        The query can be further parameterized using the returned builder.  There
+        are various ANN search parameters that will let you fine tune your recall
+        accuracy vs search latency.
+
+        Hybrid searches always have a [limit][].  If `limit` has not been called then
+        a default `limit` of 10 will be used.
+
+        Typically, a single vector is passed in as the query. However, you can also
+        pass in multiple vectors.  This can be useful if you want to find the nearest
+        vectors to multiple query vectors. This is not expected to be faster than
+        making multiple queries concurrently; it is just a convenience method.
+        If multiple vectors are passed in then an additional column `query_index`
+        will be added to the results.  This column will contain the index of the
+        query vector that the result is nearest to.
+        """
+        if query_vector is None:
+            raise ValueError("query_vector can not be None")
+
+        if (
+            isinstance(query_vector, list)
+            and len(query_vector) > 0
+            and not isinstance(query_vector[0], (float, int))
+        ):
+            # multiple have been passed
+            query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
+            new_self = self._inner.nearest_to(query_vectors[0])
+            for v in query_vectors[1:]:
+                new_self.add_query_vector(v)
+            return AsyncHybridQuery(new_self)
+        else:
+            return AsyncHybridQuery(
+                self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
+            )


 class AsyncVectorQuery(AsyncQueryBase):
@@ -1796,3 +1902,160 @@ class AsyncVectorQuery(AsyncQueryBase):
        """
        self._inner.bypass_vector_index()
        return self
+
+    def nearest_to_text(
+        self, query: str, columns: Union[str, List[str]] = []
+    ) -> AsyncHybridQuery:
+        """
+        Find the documents that are most relevant to the given text query,
+        in addition to vector search.
+
+        This converts the vector query into a hybrid query.
+
+        This search will perform a full text search on the table and return
+        the most relevant documents, combined with the vector query results.
+        The text relevance is determined by BM25.
+
+        The columns to search must have a native FTS index
+        (Tantivy-based indices do not work with this method).
+
+        By default, all indexed columns are searched.
+        For now, only one column can be searched at a time.
+
+        Parameters
+        ----------
+        query: str
+            The text query to search for.
+        columns: str or list of str, default []
+            The columns to search in. If empty, all indexed columns are searched.
+            For now only one column can be searched at a time.
+        """
+        if isinstance(columns, str):
+            columns = [columns]
+        return AsyncHybridQuery(
+            self._inner.nearest_to_text({"query": query, "columns": columns})
+        )
+
+
+class AsyncHybridQuery(AsyncQueryBase):
+    """
+    A query builder that performs hybrid vector and full text search.
+    Results are combined and reranked based on the specified reranker.
+    By default, the results are reranked using the RRFReranker, which
+    uses reciprocal rank fusion score for reranking.
+
+    To make the vector and fts results comparable, the scores are normalized.
+    Instead of normalizing scores, the `normalize` parameter can be set to "rank"
+    in the `rerank` method to convert the scores to ranks and then normalize them.
+    """
+
+    def __init__(self, inner: LanceHybridQuery):
+        super().__init__(inner)
+        self._inner = inner
+        self._norm = "score"
+        self._reranker = RRFReranker()
+
+    def rerank(
+        self, reranker: Reranker = RRFReranker(), normalize: str = "score"
+    ) -> AsyncHybridQuery:
+        """
+        Rerank the hybrid search results using the specified reranker. The reranker
+        must be an instance of Reranker class.
+
+        Parameters
+        ----------
+        reranker: Reranker, default RRFReranker()
+            The reranker to use. Must be an instance of Reranker class.
+        normalize: str, default "score"
+            The method to normalize the scores. Can be "rank" or "score". If "rank",
+            the scores are converted to ranks and then normalized. If "score", the
+            scores are normalized directly.
+        Returns
+        -------
+        AsyncHybridQuery
+            The AsyncHybridQuery object.
+        """
+        if normalize not in ["rank", "score"]:
+            raise ValueError("normalize must be 'rank' or 'score'.")
+        if reranker and not isinstance(reranker, Reranker):
+            raise ValueError("reranker must be an instance of Reranker class.")
+
+        self._norm = normalize
+        self._reranker = reranker
+
+        return self
+
+    async def to_batches(self):
+        raise NotImplementedError("to_batches not yet supported on a hybrid query")
+
+    async def to_arrow(self) -> pa.Table:
+        fts_query = AsyncFTSQuery(self._inner.to_fts_query())
+        vec_query = AsyncVectorQuery(self._inner.to_vector_query())
+
+        # save the row ID choice that was made on the query builder and force it
+        # to actually fetch the row ids because we need this for reranking
+        with_row_ids = self._inner.get_with_row_id()
+        fts_query.with_row_id()
+        vec_query.with_row_id()
+
+        fts_results, vector_results = await asyncio.gather(
+            fts_query.to_arrow(),
+            vec_query.to_arrow(),
+        )
+
+        return LanceHybridQueryBuilder._combine_hybrid_results(
+            fts_results=fts_results,
+            vector_results=vector_results,
+            norm=self._norm,
+            fts_query=fts_query.get_query(),
+            reranker=self._reranker,
+            limit=self._inner.get_limit(),
+            with_row_ids=with_row_ids,
+        )
+
+    async def explain_plan(self, verbose: Optional[bool] = False):
+        """Return the execution plan for this query.
+
+        The output includes both the vector and FTS search plans.
+
+        Examples
+        --------
+        >>> import asyncio
+        >>> from lancedb import connect_async
+        >>> from lancedb.index import FTS
+        >>> async def doctest_example():
+        ...     conn = await connect_async("./.lancedb")
+        ...     table = await conn.create_table("my_table", [{"vector": [99, 99], "text": "hello world"}])
+        ...     await table.create_index("text", config=FTS(with_position=False))
+        ...     query = [100, 100]
+        ...     plan = await table.query().nearest_to([1, 2]).nearest_to_text("hello").explain_plan(True)
+        ...     print(plan)
+        >>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+        Vector Search Plan:
+        ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
+            Take: columns="vector, _rowid, _distance, (text)"
+                CoalesceBatchesExec: target_batch_size=1024
+                GlobalLimitExec: skip=0, fetch=10
+                    FilterExec: _distance@2 IS NOT NULL
+                    SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
+                        KNNVectorDistance: metric=l2
+                        LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
+        FTS Search Plan:
+        LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true
+
+        Parameters
+        ----------
+        verbose : bool, default False
+            Use a verbose output format.
+
+        Returns
+        -------
+        plan
+        """  # noqa: E501
+
+        results = ["Vector Search Plan:"]
+        results.append(await self._inner.to_vector_query().explain_plan(verbose))
+        results.append("FTS Search Plan:")
+        results.append(await self._inner.to_fts_query().explain_plan(verbose))
+
+        return "\n".join(results)
--- a/python/python/lancedb/remote/db.py
+++ b/python/python/lancedb/remote/db.py
@@ -44,9 +44,9 @@ class RemoteDBConnection(DBConnection):
        client_config: Union[ClientConfig, Dict[str, Any], None] = None,
        connection_timeout: Optional[float] = None,
        read_timeout: Optional[float] = None,
+        storage_options: Optional[Dict[str, str]] = None,
    ):
        """Connect to a remote LanceDB database."""
-
        if isinstance(client_config, dict):
            client_config = ClientConfig(**client_config)
        elif client_config is None:
@@ -94,6 +94,7 @@ class RemoteDBConnection(DBConnection):
                region=region,
                host_override=host_override,
                client_config=client_config,
+                storage_options=storage_options,
            )
        )

--- a/python/python/lancedb/remote/table.py
+++ b/python/python/lancedb/remote/table.py
@@ -78,7 +78,7 @@ class RemoteTable(Table):

    def list_versions(self):
        """List all versions of the table"""
-        return self._loop.run_until_complete(self._table.list_versions())
+        return LOOP.run(self._table.list_versions())

    def to_arrow(self) -> pa.Table:
        """to_arrow() is not yet supported on LanceDB cloud."""
@@ -89,10 +89,10 @@ class RemoteTable(Table):
        return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")

    def checkout(self, version):
-        return self._loop.run_until_complete(self._table.checkout(version))
+        return LOOP.run(self._table.checkout(version))

    def checkout_latest(self):
-        return self._loop.run_until_complete(self._table.checkout_latest())
+        return LOOP.run(self._table.checkout_latest())

    def list_indices(self):
        """List all the indices on the table"""
@@ -157,9 +157,7 @@ class RemoteTable(Table):
            remove_stop_words=remove_stop_words,
            ascii_folding=ascii_folding,
        )
-        self._loop.run_until_complete(
-            self._table.create_index(column, config=config, replace=replace)
-        )
+        LOOP.run(self._table.create_index(column, config=config, replace=replace))

    def create_index(
        self,
--- a/python/python/tests/test_hybrid_query.py
+++ b/python/python/tests/test_hybrid_query.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+import lancedb
+
+import pyarrow as pa
+import pytest
+import pytest_asyncio
+
+from lancedb.index import FTS
+from lancedb.table import AsyncTable
+
+
+@pytest_asyncio.fixture
+async def table(tmpdir_factory) -> AsyncTable:
+    tmp_path = str(tmpdir_factory.mktemp("data"))
+    db = await lancedb.connect_async(tmp_path)
+    data = pa.table(
+        {
+            "text": pa.array(["a", "b", "cat", "dog"]),
+            "vector": pa.array(
+                [[0.1, 0.1], [2, 2], [-0.1, -0.1], [0.5, -0.5]],
+                type=pa.list_(pa.float32(), list_size=2),
+            ),
+        }
+    )
+    table = await db.create_table("test", data)
+    await table.create_index("text", config=FTS(with_position=False))
+    return table
+
+
+@pytest.mark.asyncio
+async def test_async_hybrid_query(table: AsyncTable):
+    result = await (
+        table.query().nearest_to([0.0, 0.4]).nearest_to_text("dog").limit(2).to_arrow()
+    )
+    assert len(result) == 2
+    # ensure we get results that would match well for text and vector
+    assert result["text"].to_pylist() == ["a", "dog"]
+
+    # ensure there is no rowid by default
+    assert "_rowid" not in result
+
+
+@pytest.mark.asyncio
+async def test_async_hybrid_query_with_row_ids(table: AsyncTable):
+    result = await (
+        table.query()
+        .nearest_to([0.0, 0.4])
+        .nearest_to_text("dog")
+        .limit(2)
+        .with_row_id()
+        .to_arrow()
+    )
+    assert len(result) == 2
+    # ensure we get results that would match well for text and vector
+    assert result["text"].to_pylist() == ["a", "dog"]
+    assert result["_rowid"].to_pylist() == [0, 3]
+
+
+@pytest.mark.asyncio
+async def test_async_hybrid_query_filters(table: AsyncTable):
+    # test that query params are passed down from the regular builder to
+    # child vector/fts builders
+    result = await (
+        table.query()
+        .where("text not in ('a', 'dog')")
+        .nearest_to([0.3, 0.3])
+        .nearest_to_text("*a*")
+        .limit(2)
+        .to_arrow()
+    )
+    assert len(result) == 2
+    # ensure we get results that would match well for text and vector
+    assert result["text"].to_pylist() == ["cat", "b"]
+
+
+@pytest.mark.asyncio
+async def test_async_hybrid_query_default_limit(table: AsyncTable):
+    # add 100 new rows
+    new_rows = []
+    for i in range(100):
+        if i < 2:
+            new_rows.append({"text": "close_vec", "vector": [0.1, 0.1]})
+        else:
+            new_rows.append({"text": "far_vec", "vector": [5 * i, 5 * i]})
+    await table.add(new_rows)
+    result = await (
+        table.query().nearest_to_text("dog").nearest_to([0.1, 0.1]).to_arrow()
+    )
+
+    # assert we got the default limit of 10
+    assert len(result) == 10
+
+    # assert we got the closest vectors and the text searched for
+    texts = result["text"].to_pylist()
+    assert texts.count("close_vec") == 2
+    assert texts.count("dog") == 1
+    assert texts.count("a") == 1
+
+
+@pytest.mark.asyncio
+async def test_explain_plan(table: AsyncTable):
+    plan = await (
+        table.query().nearest_to_text("dog").nearest_to([0.1, 0.1]).explain_plan(True)
+    )
+
+    assert "Vector Search Plan" in plan
+    assert "KNNVectorDistance" in plan
+    assert "FTS Search Plan" in plan
+    assert "LanceScan" in plan
--- a/python/python/tests/test_remote_db.py
+++ b/python/python/tests/test_remote_db.py
@@ -229,6 +229,44 @@ def test_table_add_in_threadpool():
                future.result()


+def test_table_create_indices():
+    def handler(request):
+        if request.path == "/v1/table/test/create_index/":
+            request.send_response(200)
+            request.end_headers()
+        elif request.path == "/v1/table/test/create/?mode=create":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            request.wfile.write(b"{}")
+        elif request.path == "/v1/table/test/describe/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            payload = json.dumps(
+                dict(
+                    version=1,
+                    schema=dict(
+                        fields=[
+                            dict(name="id", type={"type": "int64"}, nullable=False),
+                        ]
+                    ),
+                )
+            )
+            request.wfile.write(payload.encode())
+        else:
+            request.send_response(404)
+            request.end_headers()
+
+    with mock_lancedb_connection(handler) as db:
+        # Parameters are well-tested through local and async tests.
+        # This is a smoke-test.
+        table = db.create_table("test", [{"id": 1}])
+        table.create_scalar_index("id")
+        table.create_fts_index("text")
+        table.create_scalar_index("vector")
+
+
@contextlib.contextmanager
 def query_test_table(query_handler):
    def handler(request):
--- a/python/src/query.rs
+++ b/python/src/query.rs
@@ -18,7 +18,8 @@ use arrow::pyarrow::FromPyArrow;
 use lancedb::index::scalar::FullTextSearchQuery;
 use lancedb::query::QueryExecutionOptions;
 use lancedb::query::{
-    ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
+    ExecutableQuery, HasQuery, Query as LanceDbQuery, QueryBase, Select,
+    VectorQuery as LanceDbVectorQuery,
 };
 use pyo3::exceptions::PyRuntimeError;
 use pyo3::prelude::{PyAnyMethods, PyDictMethods};
@@ -87,7 +88,7 @@ impl Query {
        Ok(VectorQuery { inner })
    }

-    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<()> {
+    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<FTSQuery> {
        let query_text = query
            .get_item("query")?
            .ok_or(PyErr::new::<PyRuntimeError, _>(
@@ -100,9 +101,11 @@ impl Query {
            .transpose()?;

        let fts_query = FullTextSearchQuery::new(query_text).columns(columns);
-        self.inner = self.inner.clone().full_text_search(fts_query);

-        Ok(())
+        Ok(FTSQuery {
+            fts_query,
+            inner: self.inner.clone(),
+        })
    }

    #[pyo3(signature = (max_batch_length=None))]
@@ -133,6 +136,87 @@ impl Query {
 }

 #[pyclass]
+#[derive(Clone)]
+/// Python-exposed builder for a full-text search (FTS) query.
+///
+/// Filter, projection and paging options accumulate on `inner`; the text
+/// query is stored separately in `fts_query` and is attached to `inner` only
+/// when the query is executed or explained.
+pub struct FTSQuery {
+    /// Base query holding every option except the text search itself.
+    inner: LanceDbQuery,
+    /// The full-text search to attach before running or explaining.
+    fts_query: FullTextSearchQuery,
+}
+
+#[pymethods]
+impl FTSQuery {
+    /// Apply `predicate` as the filter of the underlying query (`only_if`).
+    pub fn r#where(&mut self, predicate: String) {
+        self.inner = self.inner.clone().only_if(predicate);
+    }
+
+    /// Set a dynamic projection built from `columns` (`Select::dynamic`).
+    pub fn select(&mut self, columns: Vec<(String, String)>) {
+        self.inner = self.inner.clone().select(Select::dynamic(&columns));
+    }
+
+    /// Limit the number of rows to return.
+    pub fn limit(&mut self, limit: u32) {
+        self.inner = self.inner.clone().limit(limit as usize);
+    }
+
+    /// Set the offset of the query.
+    pub fn offset(&mut self, offset: u32) {
+        self.inner = self.inner.clone().offset(offset as usize);
+    }
+
+    /// Enable `fast_search` on the underlying query.
+    pub fn fast_search(&mut self) {
+        self.inner = self.inner.clone().fast_search();
+    }
+
+    /// Enable `with_row_id` on the underlying query.
+    pub fn with_row_id(&mut self) {
+        self.inner = self.inner.clone().with_row_id();
+    }
+
+    /// Enable `postfilter` on the underlying query.
+    pub fn postfilter(&mut self) {
+        self.inner = self.inner.clone().postfilter();
+    }
+
+    /// Run the full-text search and return an awaitable resolving to a
+    /// `RecordBatchStream`.
+    ///
+    /// `max_batch_length`, when given, overrides the default
+    /// `QueryExecutionOptions::max_batch_length`.
+    #[pyo3(signature = (max_batch_length=None))]
+    pub fn execute(
+        self_: PyRef<'_, Self>,
+        max_batch_length: Option<u32>,
+    ) -> PyResult<Bound<'_, PyAny>> {
+        // The text query is attached only now, on a clone, so the stored
+        // builder state is left untouched.
+        let inner = self_
+            .inner
+            .clone()
+            .full_text_search(self_.fts_query.clone());
+
+        future_into_py(self_.py(), async move {
+            let mut opts = QueryExecutionOptions::default();
+            if let Some(max_batch_length) = max_batch_length {
+                opts.max_batch_length = max_batch_length;
+            }
+            let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
+            Ok(RecordBatchStream::new(inner_stream))
+        })
+    }
+
+    /// Combine this text query with a vector search on `vector`.
+    ///
+    /// The vector side is built from a clone of the same base query, so
+    /// options already set here carry over to both halves of the returned
+    /// `HybridQuery`.
+    pub fn nearest_to(&self, vector: Bound<'_, PyAny>) -> PyResult<HybridQuery> {
+        let vector_query = Query::new(self.inner.clone()).nearest_to(vector)?;
+        Ok(HybridQuery {
+            inner_fts: self.clone(),
+            inner_vec: vector_query,
+        })
+    }
+
+    /// Return an awaitable resolving to the textual query plan.
+    ///
+    /// Failures are surfaced to Python as `RuntimeError`.
+    pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
+        // Attach the text query first, exactly as `execute` does; without it
+        // the plan would describe a query that has no full-text search step.
+        let inner = self_
+            .inner
+            .clone()
+            .full_text_search(self_.fts_query.clone());
+        future_into_py(self_.py(), async move {
+            inner
+                .explain_plan(verbose)
+                .await
+                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
+        })
+    }
+
+    /// Return the text of the full-text search query.
+    pub fn get_query(&self) -> String {
+        self.fts_query.query.clone()
+    }
+}
+
+
+#[pyclass]
+#[derive(Clone)]
+/// Python-exposed wrapper around a LanceDB vector query (`LanceDbVectorQuery`).
 pub struct VectorQuery {
     inner: LanceDbVectorQuery,
 }
@@ -229,4 +313,105 @@ impl VectorQuery {
                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
        })
    }
+
+    /// Combine this vector query with a full-text search described by `query`.
+    ///
+    /// The text side is built from a clone of this query's underlying base
+    /// query; the vector side is a clone of `self`.
+    pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<HybridQuery> {
+        let base_query = self.inner.mut_query().clone();
+        let inner_fts = Query::new(base_query).nearest_to_text(query)?;
+        let inner_vec = self.clone();
+        Ok(HybridQuery {
+            inner_vec,
+            inner_fts,
+        })
+    }
+}
+
+#[pyclass]
+/// Python-exposed pairing of a vector query and a full-text query.
+///
+/// Options that both kinds of query support are forwarded to both halves;
+/// vector-only options are forwarded to `inner_vec` alone.
+pub struct HybridQuery {
+    /// Vector-search half of the hybrid query.
+    inner_vec: VectorQuery,
+    /// Full-text-search half of the hybrid query.
+    inner_fts: FTSQuery,
+}
+
+#[pymethods]
+impl HybridQuery {
+    /// Apply the filter `predicate` to both halves.
+    pub fn r#where(&mut self, predicate: String) {
+        self.inner_vec.r#where(predicate.clone());
+        self.inner_fts.r#where(predicate);
+    }
+
+    /// Apply the projection `columns` to both halves.
+    pub fn select(&mut self, columns: Vec<(String, String)>) {
+        self.inner_vec.select(columns.clone());
+        self.inner_fts.select(columns);
+    }
+
+    /// Apply the row limit to both halves.
+    pub fn limit(&mut self, limit: u32) {
+        self.inner_vec.limit(limit);
+        self.inner_fts.limit(limit);
+    }
+
+    /// Apply the offset to both halves.
+    pub fn offset(&mut self, offset: u32) {
+        self.inner_vec.offset(offset);
+        self.inner_fts.offset(offset);
+    }
+
+    /// Enable `fast_search` on both halves.
+    pub fn fast_search(&mut self) {
+        self.inner_vec.fast_search();
+        self.inner_fts.fast_search();
+    }
+
+    /// Enable `with_row_id` on both halves.
+    pub fn with_row_id(&mut self) {
+        self.inner_fts.with_row_id();
+        self.inner_vec.with_row_id();
+    }
+
+    /// Enable `postfilter` on both halves.
+    pub fn postfilter(&mut self) {
+        self.inner_vec.postfilter();
+        self.inner_fts.postfilter();
+    }
+
+    // The options below exist only on the vector half.
+
+    /// Forward an additional query vector to the vector half.
+    pub fn add_query_vector(&mut self, vector: Bound<'_, PyAny>) -> PyResult<()> {
+        self.inner_vec.add_query_vector(vector)
+    }
+
+    /// Forward `column` to the vector half.
+    pub fn column(&mut self, column: String) {
+        self.inner_vec.column(column);
+    }
+
+    /// Forward `distance_type` to the vector half; its error, if any, is returned.
+    pub fn distance_type(&mut self, distance_type: String) -> PyResult<()> {
+        self.inner_vec.distance_type(distance_type)
+    }
+
+    /// Forward `refine_factor` to the vector half.
+    pub fn refine_factor(&mut self, refine_factor: u32) {
+        self.inner_vec.refine_factor(refine_factor);
+    }
+
+    /// Forward `nprobe` to the vector half.
+    pub fn nprobes(&mut self, nprobe: u32) {
+        self.inner_vec.nprobes(nprobe);
+    }
+
+    /// Forward `ef` to the vector half.
+    pub fn ef(&mut self, ef: u32) {
+        self.inner_vec.ef(ef);
+    }
+
+    /// Forward `bypass_vector_index` to the vector half.
+    pub fn bypass_vector_index(&mut self) {
+        self.inner_vec.bypass_vector_index();
+    }
+
+    /// Return an independent copy of the vector half.
+    ///
+    /// Takes `&self`: this only reads, so it does not need an exclusive
+    /// borrow of the Python object.
+    pub fn to_vector_query(&self) -> PyResult<VectorQuery> {
+        Ok(self.inner_vec.clone())
+    }
+
+    /// Return an independent copy of the full-text half.
+    pub fn to_fts_query(&self) -> PyResult<FTSQuery> {
+        Ok(self.inner_fts.clone())
+    }
+
+    /// Return the row limit recorded on the full-text half, if any.
+    pub fn get_limit(&self) -> Option<u32> {
+        // NOTE(review): `as u32` truncates values above `u32::MAX`; limits set
+        // through this wrapper arrive as `u32`, other sources should be confirmed.
+        self.inner_fts.inner.limit.map(|i| i as u32)
+    }
+
+    /// Return whether `with_row_id` is set on the full-text half.
+    pub fn get_with_row_id(&self) -> bool {
+        self.inner_fts.inner.with_row_id
+    }
 }
--- a/rust/ffi/node/Cargo.toml
+++ b/rust/ffi/node/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.14.0"
+version = "0.14.1-beta.0"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
--- a/rust/lancedb/Cargo.toml
+++ b/rust/lancedb/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.14.0"
+version = "0.14.1-beta.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
--- a/rust/lancedb/src/index/scalar.rs
+++ b/rust/lancedb/src/index/scalar.rs
@@ -53,7 +53,10 @@ pub struct LabelListIndexBuilder {}
 /// A full text search index is an index on a string column that allows for full text search
 #[derive(Debug, Clone)]
 pub struct FtsIndexBuilder {
-    pub(crate) with_position: bool,
+    /// Whether to store the position of the tokens
+    /// This is used for phrase queries
+    pub with_position: bool,
+
    pub tokenizer_configs: TokenizerConfig,
 }

--- a/rust/lancedb/src/query.rs
+++ b/rust/lancedb/src/query.rs
@@ -573,7 +573,7 @@ pub struct Query {
    parent: Arc<dyn TableInternal>,

    /// limit the number of rows to return.
-    pub(crate) limit: Option<usize>,
+    pub limit: Option<usize>,

    /// Offset of the query.
    pub(crate) offset: Option<usize>,
@@ -596,7 +596,7 @@ pub struct Query {
    /// If set to true, the query will return the `_rowid` meta column.
    ///
    /// By default, this is false.
-    pub(crate) with_row_id: bool,
+    pub with_row_id: bool,

    /// If set to false, the filter will be applied after the vector search.
    pub(crate) prefilter: bool,
--- a/rust/lancedb/src/remote/db.rs
+++ b/rust/lancedb/src/remote/db.rs
@@ -271,7 +271,7 @@ impl From<StorageOptions> for RemoteOptions {
                filtered.insert(opt.to_string(), v.to_string());
            }
        }
-        RemoteOptions::new(filtered)
+        Self::new(filtered)
    }
 }

--- a/rust/lancedb/src/remote/table.rs
+++ b/rust/lancedb/src/remote/table.rs
@@ -570,7 +570,19 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
            Index::BTree(_) => ("BTREE", None),
            Index::Bitmap(_) => ("BITMAP", None),
            Index::LabelList(_) => ("LABEL_LIST", None),
-            Index::FTS(_) => ("FTS", None),
+            Index::FTS(fts) => {
+                let with_position = fts.with_position;
+                let configs = serde_json::to_value(fts.tokenizer_configs).map_err(|e| {
+                    Error::InvalidInput {
+                        message: format!("failed to serialize FTS index params {:?}", e),
+                    }
+                })?;
+                for (key, value) in configs.as_object().unwrap() {
+                    body[key] = value.clone();
+                }
+                body["with_position"] = serde_json::Value::Bool(with_position);
+                ("FTS", None)
+            }
            Index::Auto => {
                let schema = self.schema().await?;
                let field = schema
@@ -1496,6 +1508,7 @@ mod tests {
        ];

        for (index_type, distance_type, index) in cases {
+            let params = index.clone();
            let table = Table::new_with_handler("my_table", move |request| {
                assert_eq!(request.method(), "POST");
                assert_eq!(request.url().path(), "/v1/table/my_table/create_index/");
@@ -1512,6 +1525,17 @@ mod tests {
                if let Some(distance_type) = distance_type {
                    expected_body["metric_type"] = distance_type.to_lowercase().into();
                }
+                if let Index::FTS(fts) = &params {
+                    expected_body["with_position"] = fts.with_position.into();
+                    expected_body["base_tokenizer"] = "simple".into();
+                    expected_body["language"] = "English".into();
+                    expected_body["max_token_length"] = 40.into();
+                    expected_body["lower_case"] = true.into();
+                    expected_body["stem"] = false.into();
+                    expected_body["remove_stop_words"] = false.into();
+                    expected_body["ascii_folding"] = false.into();
+                }
+
                assert_eq!(body, expected_body);

                http::Response::builder().status(200).body("{}").unwrap()
Author	SHA1	Message	Date
Lance Release	6e5927ce6d	Bump version: 0.17.1-beta.0 → 0.17.1-beta.1	2024-12-09 08:40:35 +00:00
BubbleCal	6c1f32ac11	fix: index params are ignored by RemoteTable (#1928 ) Signed-off-by: BubbleCal <bubble-cal@outlook.com>	2024-12-09 16:37:01 +08:00
Lance Release	4fdf084777	Updating package-lock.json	2024-12-09 04:01:51 +00:00
Lance Release	1fad24fcd8	Bump version: 0.14.0 → 0.14.1-beta.0	2024-12-09 04:01:35 +00:00
Lance Release	6ef20b85ca	Bump version: 0.17.0 → 0.17.1-beta.0	2024-12-09 04:01:19 +00:00
LuQQiu	35bacdd57e	feat: support azure account name storage options in sync db.connect (#1926 ) db.connect with azure storage account name is supported in async connect but not sync connect. Add this functionality --------- Co-authored-by: Will Jones <willjones127@gmail.com>	2024-12-08 20:00:23 -08:00
Will Jones	a5ebe5a6c4	fix: create_scalar_index in cloud (#1922 ) Fixes #1920	2024-12-07 19:48:40 -08:00
Will Jones	bf03ad1b4a	ci: fix release (#1919 ) * Set `private: false` so we can publish new binary packages * Add missing windows binary reference	2024-12-06 12:51:48 -08:00
Bert	2a9e3e2084	feat(python): support hybrid search in async sdk (#1915 ) fixes: https://github.com/lancedb/lancedb/issues/1765 --------- Co-authored-by: Will Jones <willjones127@gmail.com>	2024-12-06 13:53:15 -05:00
Lance Release	f298f15360	Updating package-lock.json	2024-12-06 17:13:37 +00:00