mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
10 Commits
v0.14.0
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6e5927ce6d | ||
|
|
6c1f32ac11 | ||
|
|
4fdf084777 | ||
|
|
1fad24fcd8 | ||
|
|
6ef20b85ca | ||
|
|
35bacdd57e | ||
|
|
a5ebe5a6c4 | ||
|
|
bf03ad1b4a | ||
|
|
2a9e3e2084 | ||
|
|
f298f15360 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.14.0"
|
||||
current_version = "0.14.1-beta.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.14.0-final.0</version>
|
||||
<version>0.14.1-beta.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.14.0-final.0</version>
|
||||
<version>0.14.1-beta.0</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
|
||||
20
node/package-lock.json
generated
20
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.14.0-beta.2",
|
||||
"version": "0.14.1-beta.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.14.0-beta.2",
|
||||
"version": "0.14.1-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,14 +52,14 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-darwin-x64": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.14.0-beta.2",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.14.0-beta.2"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.14.1-beta.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"private": false,
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
"scripts": {
|
||||
@@ -91,13 +92,13 @@
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-x64": "0.14.0",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.14.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.14.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.14.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.14.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.14.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.14.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.14.0"
|
||||
"@lancedb/vectordb-darwin-x64": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.14.1-beta.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.14.1-beta.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.14.0"
|
||||
version = "0.14.1-beta.0"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.14.0",
|
||||
"version": "0.14.1-beta.0",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.13.0",
|
||||
"version": "0.14.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.13.0",
|
||||
"version": "0.14.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -10,7 +10,8 @@
|
||||
"vector database",
|
||||
"ann"
|
||||
],
|
||||
"version": "0.14.0",
|
||||
"private": false,
|
||||
"version": "0.14.1-beta.0",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
@@ -30,7 +31,8 @@
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-unknown-linux-musl",
|
||||
"aarch64-unknown-linux-musl",
|
||||
"x86_64-pc-windows-msvc"
|
||||
"x86_64-pc-windows-msvc",
|
||||
"aarch64-pc-windows-msvc"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.17.0"
|
||||
current_version = "0.17.1-beta.1"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.17.0"
|
||||
version = "0.17.1-beta.1"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -110,6 +110,7 @@ def connect(
|
||||
# TODO: remove this (deprecation warning downstream)
|
||||
request_thread_pool=request_thread_pool,
|
||||
client_config=client_config,
|
||||
storage_options=storage_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -79,9 +79,21 @@ class Query:
|
||||
def limit(self, limit: int): ...
|
||||
def offset(self, offset: int): ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
|
||||
def nearest_to_text(self, query: dict) -> Query: ...
|
||||
def nearest_to_text(self, query: dict) -> FTSQuery: ...
|
||||
async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
|
||||
|
||||
class FTSQuery:
|
||||
def where(self, filter: str): ...
|
||||
def select(self, columns: List[str]): ...
|
||||
def limit(self, limit: int): ...
|
||||
def offset(self, offset: int): ...
|
||||
def fast_search(self): ...
|
||||
def with_row_id(self): ...
|
||||
def postfilter(self): ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
|
||||
async def execute(self, max_batch_length: Optional[int]) -> RecordBatchStream: ...
|
||||
async def explain_plan(self) -> str: ...
|
||||
|
||||
class VectorQuery:
|
||||
async def execute(self) -> RecordBatchStream: ...
|
||||
def where(self, filter: str): ...
|
||||
@@ -95,6 +107,24 @@ class VectorQuery:
|
||||
def refine_factor(self, refine_factor: int): ...
|
||||
def nprobes(self, nprobes: int): ...
|
||||
def bypass_vector_index(self): ...
|
||||
def nearest_to_text(self, query: dict) -> HybridQuery: ...
|
||||
|
||||
class HybridQuery:
|
||||
def where(self, filter: str): ...
|
||||
def select(self, columns: List[str]): ...
|
||||
def limit(self, limit: int): ...
|
||||
def offset(self, offset: int): ...
|
||||
def fast_search(self): ...
|
||||
def with_row_id(self): ...
|
||||
def postfilter(self): ...
|
||||
def distance_type(self, distance_type: str): ...
|
||||
def refine_factor(self, refine_factor: int): ...
|
||||
def nprobes(self, nprobes: int): ...
|
||||
def bypass_vector_index(self): ...
|
||||
def to_vector_query(self) -> VectorQuery: ...
|
||||
def to_fts_query(self) -> FTSQuery: ...
|
||||
def get_limit(self) -> int: ...
|
||||
def get_with_row_id(self) -> bool: ...
|
||||
|
||||
class CompactionStats:
|
||||
fragments_removed: int
|
||||
|
||||
@@ -26,6 +26,7 @@ from typing import (
|
||||
Union,
|
||||
)
|
||||
|
||||
import asyncio
|
||||
import deprecation
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
@@ -44,6 +45,8 @@ if TYPE_CHECKING:
|
||||
import polars as pl
|
||||
|
||||
from ._lancedb import Query as LanceQuery
|
||||
from ._lancedb import FTSQuery as LanceFTSQuery
|
||||
from ._lancedb import HybridQuery as LanceHybridQuery
|
||||
from ._lancedb import VectorQuery as LanceVectorQuery
|
||||
from .common import VEC
|
||||
from .pydantic import LanceModel
|
||||
@@ -1124,35 +1127,55 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
fts_results = fts_future.result()
|
||||
vector_results = vector_future.result()
|
||||
|
||||
# convert to ranks first if needed
|
||||
if self._norm == "rank":
|
||||
vector_results = self._rank(vector_results, "_distance")
|
||||
fts_results = self._rank(fts_results, "_score")
|
||||
return self._combine_hybrid_results(
|
||||
fts_results=fts_results,
|
||||
vector_results=vector_results,
|
||||
norm=self._norm,
|
||||
fts_query=self._fts_query._query,
|
||||
reranker=self._reranker,
|
||||
limit=self._limit,
|
||||
with_row_ids=self._with_row_id,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _combine_hybrid_results(
|
||||
fts_results: pa.Table,
|
||||
vector_results: pa.Table,
|
||||
norm: str,
|
||||
fts_query: str,
|
||||
reranker,
|
||||
limit: int,
|
||||
with_row_ids: bool,
|
||||
) -> pa.Table:
|
||||
if norm == "rank":
|
||||
vector_results = LanceHybridQueryBuilder._rank(vector_results, "_distance")
|
||||
fts_results = LanceHybridQueryBuilder._rank(fts_results, "_score")
|
||||
|
||||
# normalize the scores to be between 0 and 1, 0 being most relevant
|
||||
vector_results = self._normalize_scores(vector_results, "_distance")
|
||||
vector_results = LanceHybridQueryBuilder._normalize_scores(
|
||||
vector_results, "_distance"
|
||||
)
|
||||
|
||||
# In fts higher scores represent relevance. Not inverting them here as
|
||||
# rerankers might need to preserve this score to support `return_score="all"`
|
||||
fts_results = self._normalize_scores(fts_results, "_score")
|
||||
fts_results = LanceHybridQueryBuilder._normalize_scores(fts_results, "_score")
|
||||
|
||||
results = self._reranker.rerank_hybrid(
|
||||
self._fts_query._query, vector_results, fts_results
|
||||
)
|
||||
results = reranker.rerank_hybrid(fts_query, vector_results, fts_results)
|
||||
|
||||
check_reranker_result(results)
|
||||
|
||||
# apply limit after reranking
|
||||
results = results.slice(length=self._limit)
|
||||
results = results.slice(length=limit)
|
||||
|
||||
if not self._with_row_id:
|
||||
if not with_row_ids:
|
||||
results = results.drop(["_rowid"])
|
||||
|
||||
return results
|
||||
|
||||
def to_batches(self):
|
||||
raise NotImplementedError("to_batches not yet supported on a hybrid query")
|
||||
|
||||
def _rank(self, results: pa.Table, column: str, ascending: bool = True):
|
||||
@staticmethod
|
||||
def _rank(results: pa.Table, column: str, ascending: bool = True):
|
||||
if len(results) == 0:
|
||||
return results
|
||||
# Get the _score column from results
|
||||
@@ -1169,7 +1192,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
)
|
||||
return results
|
||||
|
||||
def _normalize_scores(self, results: pa.Table, column: str, invert=False):
|
||||
@staticmethod
|
||||
def _normalize_scores(results: pa.Table, column: str, invert=False):
|
||||
if len(results) == 0:
|
||||
return results
|
||||
# Get the _score column from results
|
||||
@@ -1635,7 +1659,7 @@ class AsyncQuery(AsyncQueryBase):
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str]] = []
|
||||
) -> AsyncQuery:
|
||||
) -> AsyncFTSQuery:
|
||||
"""
|
||||
Find the documents that are most relevant to the given text query.
|
||||
|
||||
@@ -1658,8 +1682,90 @@ class AsyncQuery(AsyncQueryBase):
|
||||
"""
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
return self
|
||||
return AsyncFTSQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
|
||||
|
||||
class AsyncFTSQuery(AsyncQueryBase):
|
||||
"""A query for full text search for LanceDB."""
|
||||
|
||||
def __init__(self, inner: LanceFTSQuery):
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
|
||||
def get_query(self):
|
||||
self._inner.get_query()
|
||||
|
||||
def nearest_to(
|
||||
self,
|
||||
query_vector: Union[VEC, Tuple, List[VEC]],
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
In addition doing text search on the LanceDB Table, also
|
||||
find the nearest vectors to the given query vector.
|
||||
|
||||
This converts the query from a FTS Query to a Hybrid query. Results
|
||||
from the vector search will be combined with results from the FTS query.
|
||||
|
||||
This method will attempt to convert the input to the query vector
|
||||
expected by the embedding model. If the input cannot be converted
|
||||
then an error will be thrown.
|
||||
|
||||
By default, there is no embedding model, and the input should be
|
||||
something that can be converted to a pyarrow array of floats. This
|
||||
includes lists, numpy arrays, and tuples.
|
||||
|
||||
If there is only one vector column (a column whose data type is a
|
||||
fixed size list of floats) then the column does not need to be specified.
|
||||
If there is more than one vector column you must use
|
||||
[AsyncVectorQuery.column][lancedb.query.AsyncVectorQuery.column] to specify
|
||||
which column you would like to compare with.
|
||||
|
||||
If no index has been created on the vector column then a vector query
|
||||
will perform a distance comparison between the query vector and every
|
||||
vector in the database and then sort the results. This is sometimes
|
||||
called a "flat search"
|
||||
|
||||
For small databases, with tens of thousands of vectors or less, this can
|
||||
be reasonably fast. In larger databases you should create a vector index
|
||||
on the column. If there is a vector index then an "approximate" nearest
|
||||
neighbor search (frequently called an ANN search) will be performed. This
|
||||
search is much faster, but the results will be approximate.
|
||||
|
||||
The query can be further parameterized using the returned builder. There
|
||||
are various ANN search parameters that will let you fine tune your recall
|
||||
accuracy vs search latency.
|
||||
|
||||
Hybrid searches always have a [limit][]. If `limit` has not been called then
|
||||
a default `limit` of 10 will be used.
|
||||
|
||||
Typically, a single vector is passed in as the query. However, you can also
|
||||
pass in multiple vectors. This can be useful if you want to find the nearest
|
||||
vectors to multiple query vectors. This is not expected to be faster than
|
||||
making multiple queries concurrently; it is just a convenience method.
|
||||
If multiple vectors are passed in then an additional column `query_index`
|
||||
will be added to the results. This column will contain the index of the
|
||||
query vector that the result is nearest to.
|
||||
"""
|
||||
if query_vector is None:
|
||||
raise ValueError("query_vector can not be None")
|
||||
|
||||
if (
|
||||
isinstance(query_vector, list)
|
||||
and len(query_vector) > 0
|
||||
and not isinstance(query_vector[0], (float, int))
|
||||
):
|
||||
# multiple have been passed
|
||||
query_vectors = [AsyncQuery._query_vec_to_array(v) for v in query_vector]
|
||||
new_self = self._inner.nearest_to(query_vectors[0])
|
||||
for v in query_vectors[1:]:
|
||||
new_self.add_query_vector(v)
|
||||
return AsyncHybridQuery(new_self)
|
||||
else:
|
||||
return AsyncHybridQuery(
|
||||
self._inner.nearest_to(AsyncQuery._query_vec_to_array(query_vector))
|
||||
)
|
||||
|
||||
|
||||
class AsyncVectorQuery(AsyncQueryBase):
|
||||
@@ -1796,3 +1902,160 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
"""
|
||||
self._inner.bypass_vector_index()
|
||||
return self
|
||||
|
||||
def nearest_to_text(
|
||||
self, query: str, columns: Union[str, List[str]] = []
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
Find the documents that are most relevant to the given text query,
|
||||
in addition to vector search.
|
||||
|
||||
This converts the vector query into a hybrid query.
|
||||
|
||||
This search will perform a full text search on the table and return
|
||||
the most relevant documents, combined with the vector query results.
|
||||
The text relevance is determined by BM25.
|
||||
|
||||
The columns to search must be with native FTS index
|
||||
(Tantivy-based can't work with this method).
|
||||
|
||||
By default, all indexed columns are searched,
|
||||
now only one column can be searched at a time.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query: str
|
||||
The text query to search for.
|
||||
columns: str or list of str, default None
|
||||
The columns to search in. If None, all indexed columns are searched.
|
||||
For now only one column can be searched at a time.
|
||||
"""
|
||||
if isinstance(columns, str):
|
||||
columns = [columns]
|
||||
return AsyncHybridQuery(
|
||||
self._inner.nearest_to_text({"query": query, "columns": columns})
|
||||
)
|
||||
|
||||
|
||||
class AsyncHybridQuery(AsyncQueryBase):
|
||||
"""
|
||||
A query builder that performs hybrid vector and full text search.
|
||||
Results are combined and reranked based on the specified reranker.
|
||||
By default, the results are reranked using the RRFReranker, which
|
||||
uses reciprocal rank fusion score for reranking.
|
||||
|
||||
To make the vector and fts results comparable, the scores are normalized.
|
||||
Instead of normalizing scores, the `normalize` parameter can be set to "rank"
|
||||
in the `rerank` method to convert the scores to ranks and then normalize them.
|
||||
"""
|
||||
|
||||
def __init__(self, inner: LanceHybridQuery):
|
||||
super().__init__(inner)
|
||||
self._inner = inner
|
||||
self._norm = "score"
|
||||
self._reranker = RRFReranker()
|
||||
|
||||
def rerank(
|
||||
self, reranker: Reranker = RRFReranker(), normalize: str = "score"
|
||||
) -> AsyncHybridQuery:
|
||||
"""
|
||||
Rerank the hybrid search results using the specified reranker. The reranker
|
||||
must be an instance of Reranker class.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
reranker: Reranker, default RRFReranker()
|
||||
The reranker to use. Must be an instance of Reranker class.
|
||||
normalize: str, default "score"
|
||||
The method to normalize the scores. Can be "rank" or "score". If "rank",
|
||||
the scores are converted to ranks and then normalized. If "score", the
|
||||
scores are normalized directly.
|
||||
Returns
|
||||
-------
|
||||
AsyncHybridQuery
|
||||
The AsyncHybridQuery object.
|
||||
"""
|
||||
if normalize not in ["rank", "score"]:
|
||||
raise ValueError("normalize must be 'rank' or 'score'.")
|
||||
if reranker and not isinstance(reranker, Reranker):
|
||||
raise ValueError("reranker must be an instance of Reranker class.")
|
||||
|
||||
self._norm = normalize
|
||||
self._reranker = reranker
|
||||
|
||||
return self
|
||||
|
||||
async def to_batches(self):
|
||||
raise NotImplementedError("to_batches not yet supported on a hybrid query")
|
||||
|
||||
async def to_arrow(self) -> pa.Table:
|
||||
fts_query = AsyncFTSQuery(self._inner.to_fts_query())
|
||||
vec_query = AsyncVectorQuery(self._inner.to_vector_query())
|
||||
|
||||
# save the row ID choice that was made on the query builder and force it
|
||||
# to actually fetch the row ids because we need this for reranking
|
||||
with_row_ids = self._inner.get_with_row_id()
|
||||
fts_query.with_row_id()
|
||||
vec_query.with_row_id()
|
||||
|
||||
fts_results, vector_results = await asyncio.gather(
|
||||
fts_query.to_arrow(),
|
||||
vec_query.to_arrow(),
|
||||
)
|
||||
|
||||
return LanceHybridQueryBuilder._combine_hybrid_results(
|
||||
fts_results=fts_results,
|
||||
vector_results=vector_results,
|
||||
norm=self._norm,
|
||||
fts_query=fts_query.get_query(),
|
||||
reranker=self._reranker,
|
||||
limit=self._inner.get_limit(),
|
||||
with_row_ids=with_row_ids,
|
||||
)
|
||||
|
||||
async def explain_plan(self, verbose: Optional[bool] = False):
|
||||
"""Return the execution plan for this query.
|
||||
|
||||
The output includes both the vector and FTS search plans.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import asyncio
|
||||
>>> from lancedb import connect_async
|
||||
>>> from lancedb.index import FTS
|
||||
>>> async def doctest_example():
|
||||
... conn = await connect_async("./.lancedb")
|
||||
... table = await conn.create_table("my_table", [{"vector": [99, 99], "text": "hello world"}])
|
||||
... await table.create_index("text", config=FTS(with_position=False))
|
||||
... query = [100, 100]
|
||||
... plan = await table.query().nearest_to([1, 2]).nearest_to_text("hello").explain_plan(True)
|
||||
... print(plan)
|
||||
>>> asyncio.run(doctest_example()) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
|
||||
Vector Search Plan:
|
||||
ProjectionExec: expr=[vector@0 as vector, text@3 as text, _distance@2 as _distance]
|
||||
Take: columns="vector, _rowid, _distance, (text)"
|
||||
CoalesceBatchesExec: target_batch_size=1024
|
||||
GlobalLimitExec: skip=0, fetch=10
|
||||
FilterExec: _distance@2 IS NOT NULL
|
||||
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
|
||||
KNNVectorDistance: metric=l2
|
||||
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
|
||||
FTS Search Plan:
|
||||
LanceScan: uri=..., projection=[vector, text], row_id=false, row_addr=false, ordered=true
|
||||
|
||||
Parameters
|
||||
----------
|
||||
verbose : bool, default False
|
||||
Use a verbose output format.
|
||||
|
||||
Returns
|
||||
-------
|
||||
plan
|
||||
""" # noqa: E501
|
||||
|
||||
results = ["Vector Search Plan:"]
|
||||
results.append(await self._inner.to_vector_query().explain_plan(verbose))
|
||||
results.append("FTS Search Plan:")
|
||||
results.append(await self._inner.to_fts_query().explain_plan(verbose))
|
||||
|
||||
return "\n".join(results)
|
||||
|
||||
@@ -44,9 +44,9 @@ class RemoteDBConnection(DBConnection):
|
||||
client_config: Union[ClientConfig, Dict[str, Any], None] = None,
|
||||
connection_timeout: Optional[float] = None,
|
||||
read_timeout: Optional[float] = None,
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""Connect to a remote LanceDB database."""
|
||||
|
||||
if isinstance(client_config, dict):
|
||||
client_config = ClientConfig(**client_config)
|
||||
elif client_config is None:
|
||||
@@ -94,6 +94,7 @@ class RemoteDBConnection(DBConnection):
|
||||
region=region,
|
||||
host_override=host_override,
|
||||
client_config=client_config,
|
||||
storage_options=storage_options,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ class RemoteTable(Table):
|
||||
|
||||
def list_versions(self):
|
||||
"""List all versions of the table"""
|
||||
return self._loop.run_until_complete(self._table.list_versions())
|
||||
return LOOP.run(self._table.list_versions())
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
"""to_arrow() is not yet supported on LanceDB cloud."""
|
||||
@@ -89,10 +89,10 @@ class RemoteTable(Table):
|
||||
return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
|
||||
|
||||
def checkout(self, version):
|
||||
return self._loop.run_until_complete(self._table.checkout(version))
|
||||
return LOOP.run(self._table.checkout(version))
|
||||
|
||||
def checkout_latest(self):
|
||||
return self._loop.run_until_complete(self._table.checkout_latest())
|
||||
return LOOP.run(self._table.checkout_latest())
|
||||
|
||||
def list_indices(self):
|
||||
"""List all the indices on the table"""
|
||||
@@ -157,9 +157,7 @@ class RemoteTable(Table):
|
||||
remove_stop_words=remove_stop_words,
|
||||
ascii_folding=ascii_folding,
|
||||
)
|
||||
self._loop.run_until_complete(
|
||||
self._table.create_index(column, config=config, replace=replace)
|
||||
)
|
||||
LOOP.run(self._table.create_index(column, config=config, replace=replace))
|
||||
|
||||
def create_index(
|
||||
self,
|
||||
|
||||
111
python/python/tests/test_hybrid_query.py
Normal file
111
python/python/tests/test_hybrid_query.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import lancedb
|
||||
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from lancedb.index import FTS
|
||||
from lancedb.table import AsyncTable
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def table(tmpdir_factory) -> AsyncTable:
|
||||
tmp_path = str(tmpdir_factory.mktemp("data"))
|
||||
db = await lancedb.connect_async(tmp_path)
|
||||
data = pa.table(
|
||||
{
|
||||
"text": pa.array(["a", "b", "cat", "dog"]),
|
||||
"vector": pa.array(
|
||||
[[0.1, 0.1], [2, 2], [-0.1, -0.1], [0.5, -0.5]],
|
||||
type=pa.list_(pa.float32(), list_size=2),
|
||||
),
|
||||
}
|
||||
)
|
||||
table = await db.create_table("test", data)
|
||||
await table.create_index("text", config=FTS(with_position=False))
|
||||
return table
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_hybrid_query(table: AsyncTable):
|
||||
result = await (
|
||||
table.query().nearest_to([0.0, 0.4]).nearest_to_text("dog").limit(2).to_arrow()
|
||||
)
|
||||
assert len(result) == 2
|
||||
# ensure we get results that would match well for text and vector
|
||||
assert result["text"].to_pylist() == ["a", "dog"]
|
||||
|
||||
# ensure there is no rowid by default
|
||||
assert "_rowid" not in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_hybrid_query_with_row_ids(table: AsyncTable):
|
||||
result = await (
|
||||
table.query()
|
||||
.nearest_to([0.0, 0.4])
|
||||
.nearest_to_text("dog")
|
||||
.limit(2)
|
||||
.with_row_id()
|
||||
.to_arrow()
|
||||
)
|
||||
assert len(result) == 2
|
||||
# ensure we get results that would match well for text and vector
|
||||
assert result["text"].to_pylist() == ["a", "dog"]
|
||||
assert result["_rowid"].to_pylist() == [0, 3]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_hybrid_query_filters(table: AsyncTable):
|
||||
# test that query params are passed down from the regular builder to
|
||||
# child vector/fts builders
|
||||
result = await (
|
||||
table.query()
|
||||
.where("text not in ('a', 'dog')")
|
||||
.nearest_to([0.3, 0.3])
|
||||
.nearest_to_text("*a*")
|
||||
.limit(2)
|
||||
.to_arrow()
|
||||
)
|
||||
assert len(result) == 2
|
||||
# ensure we get results that would match well for text and vector
|
||||
assert result["text"].to_pylist() == ["cat", "b"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_hybrid_query_default_limit(table: AsyncTable):
|
||||
# add 10 new rows
|
||||
new_rows = []
|
||||
for i in range(100):
|
||||
if i < 2:
|
||||
new_rows.append({"text": "close_vec", "vector": [0.1, 0.1]})
|
||||
else:
|
||||
new_rows.append({"text": "far_vec", "vector": [5 * i, 5 * i]})
|
||||
await table.add(new_rows)
|
||||
result = await (
|
||||
table.query().nearest_to_text("dog").nearest_to([0.1, 0.1]).to_arrow()
|
||||
)
|
||||
|
||||
# assert we got the default limit of 10
|
||||
assert len(result) == 10
|
||||
|
||||
# assert we got the closest vectors and the text searched for
|
||||
texts = result["text"].to_pylist()
|
||||
assert texts.count("close_vec") == 2
|
||||
assert texts.count("dog") == 1
|
||||
assert texts.count("a") == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_explain_plan(table: AsyncTable):
|
||||
plan = await (
|
||||
table.query().nearest_to_text("dog").nearest_to([0.1, 0.1]).explain_plan(True)
|
||||
)
|
||||
|
||||
assert "Vector Search Plan" in plan
|
||||
assert "KNNVectorDistance" in plan
|
||||
assert "FTS Search Plan" in plan
|
||||
assert "LanceScan" in plan
|
||||
@@ -229,6 +229,44 @@ def test_table_add_in_threadpool():
|
||||
future.result()
|
||||
|
||||
|
||||
def test_table_create_indices():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/create_index/":
|
||||
request.send_response(200)
|
||||
request.end_headers()
|
||||
elif request.path == "/v1/table/test/create/?mode=create":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(b"{}")
|
||||
elif request.path == "/v1/table/test/describe/":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
payload = json.dumps(
|
||||
dict(
|
||||
version=1,
|
||||
schema=dict(
|
||||
fields=[
|
||||
dict(name="id", type={"type": "int64"}, nullable=False),
|
||||
]
|
||||
),
|
||||
)
|
||||
)
|
||||
request.wfile.write(payload.encode())
|
||||
else:
|
||||
request.send_response(404)
|
||||
request.end_headers()
|
||||
|
||||
with mock_lancedb_connection(handler) as db:
|
||||
# Parameters are well-tested through local and async tests.
|
||||
# This is a smoke-test.
|
||||
table = db.create_table("test", [{"id": 1}])
|
||||
table.create_scalar_index("id")
|
||||
table.create_fts_index("text")
|
||||
table.create_scalar_index("vector")
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def query_test_table(query_handler):
|
||||
def handler(request):
|
||||
|
||||
@@ -18,7 +18,8 @@ use arrow::pyarrow::FromPyArrow;
|
||||
use lancedb::index::scalar::FullTextSearchQuery;
|
||||
use lancedb::query::QueryExecutionOptions;
|
||||
use lancedb::query::{
|
||||
ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
|
||||
ExecutableQuery, HasQuery, Query as LanceDbQuery, QueryBase, Select,
|
||||
VectorQuery as LanceDbVectorQuery,
|
||||
};
|
||||
use pyo3::exceptions::PyRuntimeError;
|
||||
use pyo3::prelude::{PyAnyMethods, PyDictMethods};
|
||||
@@ -87,7 +88,7 @@ impl Query {
|
||||
Ok(VectorQuery { inner })
|
||||
}
|
||||
|
||||
pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<()> {
|
||||
pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<FTSQuery> {
|
||||
let query_text = query
|
||||
.get_item("query")?
|
||||
.ok_or(PyErr::new::<PyRuntimeError, _>(
|
||||
@@ -100,9 +101,11 @@ impl Query {
|
||||
.transpose()?;
|
||||
|
||||
let fts_query = FullTextSearchQuery::new(query_text).columns(columns);
|
||||
self.inner = self.inner.clone().full_text_search(fts_query);
|
||||
|
||||
Ok(())
|
||||
Ok(FTSQuery {
|
||||
fts_query,
|
||||
inner: self.inner.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None))]
|
||||
@@ -133,6 +136,87 @@ impl Query {
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
#[derive(Clone)]
|
||||
pub struct FTSQuery {
|
||||
inner: LanceDbQuery,
|
||||
fts_query: FullTextSearchQuery,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl FTSQuery {
|
||||
pub fn r#where(&mut self, predicate: String) {
|
||||
self.inner = self.inner.clone().only_if(predicate);
|
||||
}
|
||||
|
||||
pub fn select(&mut self, columns: Vec<(String, String)>) {
|
||||
self.inner = self.inner.clone().select(Select::dynamic(&columns));
|
||||
}
|
||||
|
||||
pub fn limit(&mut self, limit: u32) {
|
||||
self.inner = self.inner.clone().limit(limit as usize);
|
||||
}
|
||||
|
||||
pub fn offset(&mut self, offset: u32) {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner = self.inner.clone().fast_search();
|
||||
}
|
||||
|
||||
pub fn with_row_id(&mut self) {
|
||||
self.inner = self.inner.clone().with_row_id();
|
||||
}
|
||||
|
||||
pub fn postfilter(&mut self) {
|
||||
self.inner = self.inner.clone().postfilter();
|
||||
}
|
||||
|
||||
#[pyo3(signature = (max_batch_length=None))]
|
||||
pub fn execute(
|
||||
self_: PyRef<'_, Self>,
|
||||
max_batch_length: Option<u32>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_
|
||||
.inner
|
||||
.clone()
|
||||
.full_text_search(self_.fts_query.clone());
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
let mut opts = QueryExecutionOptions::default();
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
|
||||
Ok(RecordBatchStream::new(inner_stream))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<HybridQuery> {
|
||||
let vector_query = Query::new(self.inner.clone()).nearest_to(vector)?;
|
||||
Ok(HybridQuery {
|
||||
inner_fts: self.clone(),
|
||||
inner_vec: vector_query,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
inner
|
||||
.explain_plan(verbose)
|
||||
.await
|
||||
.map_err(|e| PyRuntimeError::new_err(e.to_string()))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_query(&self) -> String {
|
||||
self.fts_query.query.clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
#[derive(Clone)]
|
||||
pub struct VectorQuery {
|
||||
inner: LanceDbVectorQuery,
|
||||
}
|
||||
@@ -229,4 +313,105 @@ impl VectorQuery {
|
||||
.map_err(|e| PyRuntimeError::new_err(e.to_string()))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn nearest_to_text(&mut self, query: Bound<'_, PyDict>) -> PyResult<HybridQuery> {
|
||||
let fts_query = Query::new(self.inner.mut_query().clone()).nearest_to_text(query)?;
|
||||
Ok(HybridQuery {
|
||||
inner_vec: self.clone(),
|
||||
inner_fts: fts_query,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct HybridQuery {
|
||||
inner_vec: VectorQuery,
|
||||
inner_fts: FTSQuery,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl HybridQuery {
|
||||
pub fn r#where(&mut self, predicate: String) {
|
||||
self.inner_vec.r#where(predicate.clone());
|
||||
self.inner_fts.r#where(predicate);
|
||||
}
|
||||
|
||||
pub fn select(&mut self, columns: Vec<(String, String)>) {
|
||||
self.inner_vec.select(columns.clone());
|
||||
self.inner_fts.select(columns);
|
||||
}
|
||||
|
||||
pub fn limit(&mut self, limit: u32) {
|
||||
self.inner_vec.limit(limit);
|
||||
self.inner_fts.limit(limit);
|
||||
}
|
||||
|
||||
pub fn offset(&mut self, offset: u32) {
|
||||
self.inner_vec.offset(offset);
|
||||
self.inner_fts.offset(offset);
|
||||
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner_vec.fast_search();
|
||||
self.inner_fts.fast_search();
|
||||
}
|
||||
|
||||
pub fn with_row_id(&mut self) {
|
||||
self.inner_fts.with_row_id();
|
||||
self.inner_vec.with_row_id();
|
||||
}
|
||||
|
||||
pub fn postfilter(&mut self) {
|
||||
self.inner_vec.postfilter();
|
||||
self.inner_fts.postfilter();
|
||||
}
|
||||
|
||||
pub fn add_query_vector(&mut self, vector: Bound<'_, PyAny>) -> PyResult<()> {
|
||||
self.inner_vec.add_query_vector(vector)
|
||||
}
|
||||
|
||||
pub fn column(&mut self, column: String) {
|
||||
self.inner_vec.column(column);
|
||||
}
|
||||
|
||||
pub fn distance_type(&mut self, distance_type: String) -> PyResult<()> {
|
||||
self.inner_vec.distance_type(distance_type)
|
||||
}
|
||||
|
||||
pub fn refine_factor(&mut self, refine_factor: u32) {
|
||||
self.inner_vec.refine_factor(refine_factor);
|
||||
}
|
||||
|
||||
pub fn nprobes(&mut self, nprobe: u32) {
|
||||
self.inner_vec.nprobes(nprobe);
|
||||
}
|
||||
|
||||
pub fn ef(&mut self, ef: u32) {
|
||||
self.inner_vec.ef(ef);
|
||||
}
|
||||
|
||||
pub fn bypass_vector_index(&mut self) {
|
||||
self.inner_vec.bypass_vector_index();
|
||||
}
|
||||
|
||||
pub fn to_vector_query(&mut self) -> PyResult<VectorQuery> {
|
||||
Ok(VectorQuery {
|
||||
inner: self.inner_vec.inner.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn to_fts_query(&mut self) -> PyResult<FTSQuery> {
|
||||
Ok(FTSQuery {
|
||||
inner: self.inner_fts.inner.clone(),
|
||||
fts_query: self.inner_fts.fts_query.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_limit(&mut self) -> Option<u32> {
|
||||
self.inner_fts.inner.limit.map(|i| i as u32)
|
||||
}
|
||||
|
||||
pub fn get_with_row_id(&mut self) -> bool {
|
||||
self.inner_fts.inner.with_row_id
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.14.0"
|
||||
version = "0.14.1-beta.0"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.14.0"
|
||||
version = "0.14.1-beta.0"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
@@ -53,7 +53,10 @@ pub struct LabelListIndexBuilder {}
|
||||
/// A full text search index is an index on a string column that allows for full text search
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FtsIndexBuilder {
|
||||
pub(crate) with_position: bool,
|
||||
/// Whether to store the position of the tokens
|
||||
/// This is used for phrase queries
|
||||
pub with_position: bool,
|
||||
|
||||
pub tokenizer_configs: TokenizerConfig,
|
||||
}
|
||||
|
||||
|
||||
@@ -573,7 +573,7 @@ pub struct Query {
|
||||
parent: Arc<dyn TableInternal>,
|
||||
|
||||
/// limit the number of rows to return.
|
||||
pub(crate) limit: Option<usize>,
|
||||
pub limit: Option<usize>,
|
||||
|
||||
/// Offset of the query.
|
||||
pub(crate) offset: Option<usize>,
|
||||
@@ -596,7 +596,7 @@ pub struct Query {
|
||||
/// If set to true, the query will return the `_rowid` meta column.
|
||||
///
|
||||
/// By default, this is false.
|
||||
pub(crate) with_row_id: bool,
|
||||
pub with_row_id: bool,
|
||||
|
||||
/// If set to false, the filter will be applied after the vector search.
|
||||
pub(crate) prefilter: bool,
|
||||
|
||||
@@ -271,7 +271,7 @@ impl From<StorageOptions> for RemoteOptions {
|
||||
filtered.insert(opt.to_string(), v.to_string());
|
||||
}
|
||||
}
|
||||
RemoteOptions::new(filtered)
|
||||
Self::new(filtered)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -570,7 +570,19 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
Index::BTree(_) => ("BTREE", None),
|
||||
Index::Bitmap(_) => ("BITMAP", None),
|
||||
Index::LabelList(_) => ("LABEL_LIST", None),
|
||||
Index::FTS(_) => ("FTS", None),
|
||||
Index::FTS(fts) => {
|
||||
let with_position = fts.with_position;
|
||||
let configs = serde_json::to_value(fts.tokenizer_configs).map_err(|e| {
|
||||
Error::InvalidInput {
|
||||
message: format!("failed to serialize FTS index params {:?}", e),
|
||||
}
|
||||
})?;
|
||||
for (key, value) in configs.as_object().unwrap() {
|
||||
body[key] = value.clone();
|
||||
}
|
||||
body["with_position"] = serde_json::Value::Bool(with_position);
|
||||
("FTS", None)
|
||||
}
|
||||
Index::Auto => {
|
||||
let schema = self.schema().await?;
|
||||
let field = schema
|
||||
@@ -1496,6 +1508,7 @@ mod tests {
|
||||
];
|
||||
|
||||
for (index_type, distance_type, index) in cases {
|
||||
let params = index.clone();
|
||||
let table = Table::new_with_handler("my_table", move |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(request.url().path(), "/v1/table/my_table/create_index/");
|
||||
@@ -1512,6 +1525,17 @@ mod tests {
|
||||
if let Some(distance_type) = distance_type {
|
||||
expected_body["metric_type"] = distance_type.to_lowercase().into();
|
||||
}
|
||||
if let Index::FTS(fts) = ¶ms {
|
||||
expected_body["with_position"] = fts.with_position.into();
|
||||
expected_body["base_tokenizer"] = "simple".into();
|
||||
expected_body["language"] = "English".into();
|
||||
expected_body["max_token_length"] = 40.into();
|
||||
expected_body["lower_case"] = true.into();
|
||||
expected_body["stem"] = false.into();
|
||||
expected_body["remove_stop_words"] = false.into();
|
||||
expected_body["ascii_folding"] = false.into();
|
||||
}
|
||||
|
||||
assert_eq!(body, expected_body);
|
||||
|
||||
http::Response::builder().status(200).body("{}").unwrap()
|
||||
|
||||
Reference in New Issue
Block a user