Compare commits

..

1 Commits

Author SHA1 Message Date
Lance Release
204a075be9 Bump version: 0.9.0-beta.2 → 0.9.0-beta.3 2024-12-18 16:25:09 +00:00
18 changed files with 59 additions and 263 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.5.2-final.1"
current_version = "0.5.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -28,7 +28,7 @@ runs:
args: ${{ inputs.args }}
docker-options: "-e PIP_EXTRA_INDEX_URL=https://pypi.fury.io/lancedb/"
working-directory: python
- uses: actions/upload-artifact@v4
- uses: actions/upload-artifact@v3
with:
name: windows-wheels
path: python\target\wheels

View File

@@ -35,7 +35,7 @@ arrow-schema = "51.0"
arrow-arith = "51.0"
arrow-cast = "51.0"
async-trait = "0"
chrono = "=0.4.39"
chrono = "0.4.35"
datafusion-physical-plan = "37.1"
half = { "version" = "=2.4.1", default-features = false, features = [
"num-traits",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.5.2-final.1",
"version": "0.5.2",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.5.2-final.1",
"version": "0.5.2",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.5.2-final.1",
"version": "0.5.2",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.5.2-final.1",
"version": "0.5.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.5.2-final.1",
"version": "0.5.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.5.2-final.1",
"version": "0.5.2",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.5.2-final.1",
"version": "0.5.2",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.9.0-beta.8"
current_version = "0.9.0-beta.3"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.9.0-beta.8"
version = "0.9.0-beta.3"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true
@@ -19,8 +19,6 @@ lancedb = { path = "../rust/lancedb" }
env_logger = "0.10"
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
base64ct = "=1.6.0" # workaround for https://github.com/RustCrypto/formats/issues/1684
chrono = "=0.4.39"
# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }

View File

@@ -117,8 +117,6 @@ class Query(pydantic.BaseModel):
with_row_id: bool = False
fast_search: bool = False
class LanceQueryBuilder(ABC):
"""An abstract query builder. Subclasses are defined for vector search,
@@ -127,14 +125,12 @@ class LanceQueryBuilder(ABC):
@classmethod
def create(
cls,
table: "Table",
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
query_type: str,
vector_column_name: str,
ordering_field_name: Optional[str] = None,
fts_columns: Union[str, List[str]] = [],
fast_search: bool = False,
cls,
table: "Table",
query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]],
query_type: str,
vector_column_name: str,
ordering_field_name: str = None,
) -> LanceQueryBuilder:
"""
Create a query builder based on the given query and query type.
@@ -151,19 +147,14 @@ class LanceQueryBuilder(ABC):
If "auto", the query type is inferred based on the query.
vector_column_name: str
The name of the vector column to use for vector search.
fast_search: bool
Skip flat search of unindexed data.
"""
# Check hybrid search first as it supports empty query pattern
if query_type == "hybrid":
# hybrid fts and vector query
return LanceHybridQueryBuilder(
table, query, vector_column_name, fts_columns=fts_columns
)
if query is None:
return LanceEmptyQueryBuilder(table)
if query_type == "hybrid":
# hybrid fts and vector query
return LanceHybridQueryBuilder(table, query, vector_column_name)
# remember the string query for reranking purpose
str_query = query if isinstance(query, str) else None
@@ -174,17 +165,12 @@ class LanceQueryBuilder(ABC):
)
if query_type == "hybrid":
return LanceHybridQueryBuilder(
table, query, vector_column_name, fts_columns=fts_columns
)
return LanceHybridQueryBuilder(table, query, vector_column_name)
if isinstance(query, str):
# fts
return LanceFtsQueryBuilder(
table,
query,
ordering_field_name=ordering_field_name,
fts_columns=fts_columns,
table, query, ordering_field_name=ordering_field_name
)
if isinstance(query, list):
@@ -194,9 +180,7 @@ class LanceQueryBuilder(ABC):
else:
raise TypeError(f"Unsupported query type: {type(query)}")
return LanceVectorQueryBuilder(
table, query, vector_column_name, str_query, fast_search
)
return LanceVectorQueryBuilder(table, query, vector_column_name, str_query)
@classmethod
def _resolve_query(cls, table, query, query_type, vector_column_name):
@@ -212,6 +196,8 @@ class LanceQueryBuilder(ABC):
elif query_type == "auto":
if isinstance(query, (list, np.ndarray)):
return query, "vector"
if isinstance(query, tuple):
return query, "hybrid"
else:
conf = table.embedding_functions.get(vector_column_name)
if conf is not None:
@@ -238,14 +224,9 @@ class LanceQueryBuilder(ABC):
def __init__(self, table: "Table"):
self._table = table
self._limit = 10
self._offset = 0
self._columns = None
self._where = None
self._prefilter = False
self._with_row_id = False
self._vector = None
self._text = None
self._ef = None
@deprecation.deprecated(
deprecated_in="0.3.1",
@@ -356,13 +337,11 @@ class LanceQueryBuilder(ABC):
----------
limit: int
The maximum number of results to return.
The default query limit is 10 results.
For ANN/KNN queries, you must specify a limit.
Entering 0, a negative number, or None will reset
the limit to the default value of 10.
*WARNING* if you have a large dataset, setting
the limit to a large number, e.g. the table size,
can potentially result in reading a
By default the query is limited to the first 10.
Call this method and pass 0, a negative value,
or None to remove the limit.
*WARNING* if you have a large dataset, removing
the limit can potentially result in reading a
large amount of data into memory and cause
out of memory issues.
@@ -372,33 +351,11 @@ class LanceQueryBuilder(ABC):
The LanceQueryBuilder object.
"""
if limit is None or limit <= 0:
if isinstance(self, LanceVectorQueryBuilder):
raise ValueError("Limit is required for ANN/KNN queries")
else:
self._limit = None
self._limit = None
else:
self._limit = limit
return self
def offset(self, offset: int) -> LanceQueryBuilder:
"""Set the offset for the results.
Parameters
----------
offset: int
The offset to start fetching results from.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
if offset is None or offset <= 0:
self._offset = 0
else:
self._offset = offset
return self
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
"""Set the columns to return.
@@ -460,80 +417,6 @@ class LanceQueryBuilder(ABC):
self._with_row_id = with_row_id
return self
def explain_plan(self, verbose: Optional[bool] = False) -> str:
"""Return the execution plan for this query.
Examples
--------
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> table = db.create_table("my_table", [{"vector": [99, 99]}])
>>> query = [100, 100]
>>> plan = table.search(query).explain_plan(True)
>>> print(plan) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
ProjectionExec: expr=[vector@0 as vector, _distance@2 as _distance]
GlobalLimitExec: skip=0, fetch=10
FilterExec: _distance@2 IS NOT NULL
SortExec: TopK(fetch=10), expr=[_distance@2 ASC NULLS LAST], preserve_partitioning=[false]
KNNVectorDistance: metric=l2
LanceScan: uri=..., projection=[vector], row_id=true, row_addr=false, ordered=false
Parameters
----------
verbose : bool, default False
Use a verbose output format.
Returns
-------
plan : str
""" # noqa: E501
ds = self._table.to_lance()
return ds.scanner(
nearest={
"column": self._vector_column,
"q": self._query,
"k": self._limit,
"metric": self._metric,
"nprobes": self._nprobes,
"refine_factor": self._refine_factor,
},
prefilter=self._prefilter,
filter=self._str_query,
limit=self._limit,
with_row_id=self._with_row_id,
offset=self._offset,
).explain_plan(verbose)
def vector(self, vector: Union[np.ndarray, list]) -> LanceQueryBuilder:
"""Set the vector to search for.
Parameters
----------
vector: np.ndarray or list
The vector to search for.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError
def text(self, text: str) -> LanceQueryBuilder:
"""Set the text to search for.
Parameters
----------
text: str
The text to search for.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError
class LanceVectorQueryBuilder(LanceQueryBuilder):
"""
@@ -557,12 +440,11 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
"""
def __init__(
self,
table: "Table",
query: Union[np.ndarray, list, "PIL.Image.Image"],
vector_column: str,
str_query: Optional[str] = None,
fast_search: bool = False,
self,
table: "Table",
query: Union[np.ndarray, list, "PIL.Image.Image"],
vector_column: str,
str_query: Optional[str] = None,
):
super().__init__(table)
self._query = query
@@ -573,14 +455,13 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._prefilter = False
self._reranker = None
self._str_query = str_query
self._fast_search = fast_search
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
"""Set the distance metric to use.
Parameters
----------
metric: "L2" or "cosine" or "dot"
metric: "L2" or "cosine"
The distance metric to use. By default "L2" is used.
Returns
@@ -588,7 +469,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._metric = metric.lower()
self._metric = metric
return self
def nprobes(self, nprobes: int) -> LanceVectorQueryBuilder:
@@ -613,28 +494,6 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._nprobes = nprobes
return self
def ef(self, ef: int) -> LanceVectorQueryBuilder:
"""Set the number of candidates to consider during search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
This only applies to the HNSW-related index.
The default value is 1.5 * limit.
Parameters
----------
ef: int
The number of candidates to consider during search.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._ef = ef
return self
def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
"""Set the refine factor to use, increasing the number of vectors sampled.
@@ -695,11 +554,15 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
refine_factor=self._refine_factor,
vector_column=self._vector_column,
with_row_id=self._with_row_id,
offset=self._offset,
fast_search=self._fast_search,
ef=self._ef,
)
result_set = self._table._execute_query(query, batch_size)
if self._reranker is not None:
rs_table = result_set.read_all()
result_set = self._reranker.rerank_vector(self._str_query, rs_table)
# convert result_set back to RecordBatchReader
result_set = pa.RecordBatchReader.from_batches(
result_set.schema, result_set.to_batches()
)
return result_set
@@ -728,7 +591,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
return self
def rerank(
self, reranker: Reranker, query_string: Optional[str] = None
self, reranker: Reranker, query_string: Optional[str] = None
) -> LanceVectorQueryBuilder:
"""Rerank the results using the specified reranker.
@@ -893,34 +756,12 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
class LanceEmptyQueryBuilder(LanceQueryBuilder):
def to_arrow(self) -> pa.Table:
return self.to_batches().read_all()
def to_batches(self, /, batch_size: Optional[int] = None) -> pa.RecordBatchReader:
query = Query(
ds = self._table.to_lance()
return ds.to_table(
columns=self._columns,
filter=self._where,
k=self._limit or 10,
with_row_id=self._with_row_id,
vector=[],
# not actually respected in remote query
offset=self._offset or 0,
limit=self._limit,
)
return self._table._execute_query(query)
def rerank(self, reranker: Reranker) -> LanceEmptyQueryBuilder:
"""Rerank the results using the specified reranker.
Parameters
----------
reranker: Reranker
The reranker to use.
Returns
-------
LanceEmptyQueryBuilder
The LanceQueryBuilder object.
"""
raise NotImplementedError("Reranking is not yet supported.")
class LanceHybridQueryBuilder(LanceQueryBuilder):

View File

@@ -172,7 +172,6 @@ class RestfulLanceDBClient:
headers["content-type"] = content_type
if request_id is not None:
headers["x-request-id"] = request_id
with self.session.post(
urljoin(self.url, uri),
headers=headers,

View File

@@ -15,14 +15,13 @@ import logging
import uuid
from concurrent.futures import Future
from functools import cached_property
from typing import Dict, Iterable, Optional, Union, Literal
from typing import Dict, Iterable, Optional, Union
import pyarrow as pa
from lance import json_to_schema
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
from lancedb.merge import LanceMergeInsertBuilder
from lancedb.query import LanceQueryBuilder
from ..query import LanceVectorQueryBuilder
from ..table import Query, Table, _sanitize_data
@@ -82,7 +81,6 @@ class RemoteTable(Table):
def create_scalar_index(
self,
column: str,
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST", "scalar"] = "scalar",
):
"""Creates a scalar index
Parameters
@@ -91,6 +89,8 @@ class RemoteTable(Table):
The column to be indexed. Must be a boolean, integer, float,
or string column.
"""
index_type = "scalar"
data = {
"column": column,
"index_type": index_type,
@@ -228,21 +228,10 @@ class RemoteTable(Table):
content_type=ARROW_STREAM_CONTENT_TYPE,
)
def query(
self,
query: Union[VEC, str] = None,
query_type: str = "vector",
vector_column_name: Optional[str] = None,
fast_search: bool = False,
) -> LanceVectorQueryBuilder:
return self.search(query, query_type, vector_column_name, fast_search)
def search(
self,
query: Union[VEC, str] = None,
query_type: str = "vector",
query: Union[VEC, str],
vector_column_name: Optional[str] = None,
fast_search: bool = False,
) -> LanceVectorQueryBuilder:
"""Create a search query to find the nearest neighbors
of the given query vector. We currently support [vector search][search]
@@ -289,11 +278,6 @@ class RemoteTable(Table):
- If the table has multiple vector columns then the *vector_column_name*
needs to be specified. Otherwise, an error is raised.
fast_search: bool, optional
Skip a flat search of unindexed data. This may improve
search performance but search results will not include unindexed data.
- *default False*.
Returns
-------
LanceQueryBuilder
@@ -309,14 +293,7 @@ class RemoteTable(Table):
"""
if vector_column_name is None:
vector_column_name = inf_vector_column_query(self.schema)
return LanceQueryBuilder.create(
self,
query,
query_type,
vector_column_name=vector_column_name,
fast_search=fast_search,
)
return LanceVectorQueryBuilder(self, query, vector_column_name)
def _execute_query(
self, query: Query, batch_size: Optional[int] = None

View File

@@ -21,7 +21,6 @@ class FakeLanceDBClient:
pass
def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
print(f"{query=}")
assert table_name == "test"
t = pa.schema([]).empty_table()
return VectorQueryResult(t)
@@ -40,21 +39,3 @@ def test_remote_db():
table = conn["test"]
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
table.search([1.0, 2.0]).to_pandas()
def test_empty_query_with_filter():
conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
setattr(conn, "_client", FakeLanceDBClient())
table = conn["test"]
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
print(table.query().select(["vector"]).where("foo == bar").to_arrow())
def test_fast_search_query_with_filter():
conn = lancedb.connect("db://client-will-be-injected", api_key="fake")
setattr(conn, "_client", FakeLanceDBClient())
table = conn["test"]
table.schema = pa.schema([pa.field("vector", pa.list_(pa.float32(), 2))])
print(table.query([0, 0], fast_search=True).select(["vector"]).where("foo == bar").to_arrow())

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.5.2-final.1"
version = "0.5.2"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.5.2-final.1"
version = "0.5.2"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true