Compare commits

..

8 Commits

Author SHA1 Message Date
Lance Release
81f2cdf736 [python] Bump version: 0.6.3 → 0.6.4 2024-03-16 18:59:14 +00:00
Lance Release
d404a3590c Updating package-lock.json 2024-03-16 05:21:58 +00:00
Lance Release
e688484bd3 Bump version: 0.4.12 → 0.4.13 2024-03-16 05:21:44 +00:00
Weston Pace
3bcd61c8de feat: bump lance to 0.10.4 (#1123) 2024-03-15 22:21:04 -07:00
vincent d warmerdam
c76ec48603 Explain vonoroi seed initalisation (#1114)
This PR fixes https://github.com/lancedb/lancedb/issues/1112. It turned
out that K-means is currently used internally, so I figured adding that
context to the docs would be nice.
2024-03-15 14:16:05 -07:00
Christian Di Lorenzo
d974413745 fix(python): Add python azure blob read support (#1102)
I know there's a larger effort to have the python client based on the
core rust implementation, but in the meantime there have been several
issues (#1072 and #485) with some of the azure blob storage calls due to
pyarrow not natively supporting an azure backend. To this end, I've
added an optional import of the fsspec implementation of azure blob
storage [`adlfs`](https://pypi.org/project/adlfs/) and passed it to
`pyarrow.fs`. I've modified the existing test and manually verified it
with some real credentials to make sure it behaves as expected.

It should be now as simple as:

```python
import lancedb

db = lancedb.connect("az://blob_name/path")
table = db.open_table("test")
table.search(...)
```

Thank you for this cool project and we're excited to start using this
for real shortly! 🎉 And thanks to @dwhitena for bringing it to my
attention with his prediction guard posts.

Co-authored-by: christiandilorenzo <christian.dilorenzo@infiniaml.com>
2024-03-15 14:15:41 -07:00
Weston Pace
ec4f2fbd30 feat: update lance to v0.10.3 (#1094) 2024-03-15 08:50:28 -07:00
Ayush Chaurasia
6375ea419a chore(python): Increase event interval for telemetry (#1108)
Increasing event reporting interval from 5mins to 60mins
2024-03-15 17:04:43 +05:30
15 changed files with 76 additions and 95 deletions

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.12
current_version = 0.4.13
commit = True
message = Bump version: {current_version} → {new_version}
tag = True

View File

@@ -14,10 +14,10 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
lance = { "version" = "=0.10.2", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.10.2" }
lance-linalg = { "version" = "=0.10.2" }
lance-testing = { "version" = "=0.10.2" }
lance = { "version" = "=0.10.4", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.10.4" }
lance-linalg = { "version" = "=0.10.4" }
lance-testing = { "version" = "=0.10.4" }
# Note that this one does not include pyarrow
arrow = { version = "50.0", optional = false }
arrow-array = "50.0"

View File

@@ -31,7 +31,7 @@ As an example, consider starting with 128-dimensional vector consisting of 32-bi
While PQ helps with reducing the size of the index, IVF primarily addresses search performance. The primary purpose of an inverted file index is to facilitate rapid and effective nearest neighbor search by narrowing down the search space.
In IVF, the PQ vector space is divided into *Voronoi cells*, which are essentially partitions that consist of all the points in the space that are within a threshold distance of the given region's seed point. These seed points are used to create an inverted index that correlates each centroid with a list of vectors in the space, allowing a search to be restricted to just a subset of vectors in the index.
In IVF, the PQ vector space is divided into *Voronoi cells*, which are essentially partitions that consist of all the points in the space that are within a threshold distance of the given region's seed point. These seed points are initialized by running K-means over the stored vectors. The centroids of K-means become the seed points, each of which defines a region. These regions are then used to create an inverted index that correlates each centroid with a list of vectors in the space, allowing a search to be restricted to just a subset of vectors in the index.
![](../assets/ivfpq_ivf_desc.webp)

74
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.4.12",
"version": "0.4.13",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.4.12",
"version": "0.4.13",
"cpu": [
"x64",
"arm64"
@@ -52,11 +52,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.12",
"@lancedb/vectordb-darwin-x64": "0.4.12",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.12",
"@lancedb/vectordb-linux-x64-gnu": "0.4.12",
"@lancedb/vectordb-win32-x64-msvc": "0.4.12"
"@lancedb/vectordb-darwin-arm64": "0.4.13",
"@lancedb/vectordb-darwin-x64": "0.4.13",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.13",
"@lancedb/vectordb-linux-x64-gnu": "0.4.13",
"@lancedb/vectordb-win32-x64-msvc": "0.4.13"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -333,66 +333,6 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.4.12",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.12.tgz",
"integrity": "sha512-38/rkJRlWXkPWXuj9onzvbrhnIWcIUQjgEp5G9v5ixPosBowm7A4j8e2Q8CJMsVSNcVX2JLqwWVldiWegZFuYw==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.4.12",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.12.tgz",
"integrity": "sha512-psE48dztyO450hXWdv9Rl9aayM2HQ1uF9wErfC0gKmDUh1N0NdVq2viDuFpZxnmCis/nvGwKlYiYT9OnYNCJ9g==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.4.12",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.12.tgz",
"integrity": "sha512-xwkgF6MiF5aAdG9JG8v4ke652YxUJrhs9z4OrsEfrENnvsIQd2C5UyKMepVLdvij4BI/XPFRFWXdjPvP7S9rTA==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.4.12",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.12.tgz",
"integrity": "sha512-gJqYR0aymrS+C60xc4EQPzmQ5/69XfeFv2ofBvAj7qW+c6BcnoAcfVl+7s1IrcWeGz251sm5cD5Lx4AzJd89dA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.4.12",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.12.tgz",
"integrity": "sha512-LhCzpyEeBUyO6L2fuVqeP3mW8kYDryyU9PNqcM01m88sZB1Do6AlwiM+GjPRQ0SpzD0LK9oxQqSmJrdcNGqjbw==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.4.12",
"version": "0.4.13",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -88,10 +88,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.12",
"@lancedb/vectordb-darwin-x64": "0.4.12",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.12",
"@lancedb/vectordb-linux-x64-gnu": "0.4.12",
"@lancedb/vectordb-win32-x64-msvc": "0.4.12"
"@lancedb/vectordb-darwin-arm64": "0.4.13",
"@lancedb/vectordb-darwin-x64": "0.4.13",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.13",
"@lancedb/vectordb-linux-x64-gnu": "0.4.13",
"@lancedb/vectordb-win32-x64-msvc": "0.4.13"
}
}

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.3
current_version = 0.6.4
commit = True
message = [python] Bump version: {current_version} → {new_version}
tag = True

View File

@@ -1,9 +1,9 @@
[project]
name = "lancedb"
version = "0.6.3"
version = "0.6.4"
dependencies = [
"deprecation",
"pylance==0.10.2",
"pylance==0.10.4",
"ratelimiter~=1.0",
"retry>=0.9.2",
"tqdm>=4.27.0",
@@ -81,6 +81,7 @@ embeddings = [
"awscli>=1.29.57",
"botocore>=1.31.57",
]
azure = ["adlfs>=2024.2.0"]
[tool.maturin]
python-source = "python"

View File

@@ -271,8 +271,7 @@ class LanceQueryBuilder(ABC):
and also the "_distance" column which is the distance between the query
vector and the returned vectors.
"""
# raise NotImplementedError
self.to_arrow()
raise NotImplementedError
def to_list(self) -> List[dict]:
"""
@@ -435,12 +434,12 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._vector_column = vector_column
self._prefilter = False
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
"""Set the distance metric to use.
Parameters
----------
metric: "L2" or "cosine" or "dot"
metric: "L2" or "cosine"
The distance metric to use. By default "L2" is used.
Returns

View File

@@ -296,7 +296,6 @@ class RemoteTable(Table):
return LanceVectorQueryBuilder(self, query, vector_column_name)
def _execute_query(self, query: Query) -> pa.Table:
print("query metric", query.metric)
if (
query.vector is not None
and len(query.vector) > 0

View File

@@ -1522,7 +1522,7 @@ class LanceTable(Table):
def _execute_query(self, query: Query) -> pa.Table:
ds = self.to_lance()
print("metric:", query.metric)
return ds.to_table(
columns=query.columns,
filter=query.filter,

View File

@@ -26,6 +26,18 @@ import pyarrow as pa
import pyarrow.fs as pa_fs
def safe_import_adlfs():
    """Return the ``adlfs`` module when it is importable, otherwise ``None``.

    ``adlfs`` is an optional dependency, so a missing install is reported
    to the caller as ``None`` rather than raised as an ``ImportError``.
    """
    try:
        import adlfs
    except ImportError:
        # Optional dependency is not installed; signal that with None.
        return None
    else:
        return adlfs
adlfs = safe_import_adlfs()
def get_uri_scheme(uri: str) -> str:
"""
Get the scheme of a URI. If the URI does not have a scheme, assume it is a file URI.
@@ -92,6 +104,17 @@ def fs_from_uri(uri: str) -> Tuple[pa_fs.FileSystem, str]:
path = get_uri_location(uri)
return fs, path
elif get_uri_scheme(uri) == "az" and adlfs is not None:
az_blob_fs = adlfs.AzureBlobFileSystem(
account_name=os.environ.get("AZURE_STORAGE_ACCOUNT_NAME"),
account_key=os.environ.get("AZURE_STORAGE_ACCOUNT_KEY"),
)
fs = pa_fs.PyFileSystem(pa_fs.FSSpecHandler(az_blob_fs))
path = get_uri_location(uri)
return fs, path
return pa_fs.FileSystem.from_uri(uri)

View File

@@ -69,7 +69,7 @@ class _Events:
self.throttled_event_names = ["search_table"]
self.throttled_events = set()
self.max_events = 5 # max events to store in memory
self.rate_limit = 60.0 * 5 # rate limit (seconds)
self.rate_limit = 60.0 * 60.0 # rate limit (seconds)
self.time = 0.0
if is_git_dir():

View File

@@ -16,16 +16,35 @@ import os
import lancedb
import pytest
# AWS:
# You need to set up AWS credentials and a base path to run this test. Example:
# AWS_PROFILE=default REMOTE_BASE_URL=s3://my_bucket/dataset pytest tests/test_io.py
#
# Azure:
# You need to set up Azure credentials and a base path to run this test. Example:
# export AZURE_STORAGE_ACCOUNT_NAME="<account>"
# export AZURE_STORAGE_ACCOUNT_KEY="<key>"
# export REMOTE_BASE_URL=az://my_blob/dataset
# pytest tests/test_io.py
@pytest.fixture(autouse=True, scope="module")
def setup():
    """Module-scoped autouse fixture that cleans up remote tables afterwards.

    Nothing is prepared before the tests run. Once they finish, and only if
    the ``REMOTE_BASE_URL`` environment variable is set to a non-empty value,
    every table listed by the database at that URL is dropped.
    """
    yield

    # Teardown: runs after the tests in this module have completed.
    remote_url = os.environ.get("REMOTE_BASE_URL")
    if not remote_url:
        return

    db = lancedb.connect(remote_url)
    for table_name in db.table_names():
        db.drop_table(table_name)
@pytest.mark.skipif(
(os.environ.get("TEST_S3_BASE_URL") is None),
reason="please setup s3 base url",
(os.environ.get("REMOTE_BASE_URL") is None),
reason="please setup remote base url",
)
def test_s3_io():
db = lancedb.connect(os.environ.get("TEST_S3_BASE_URL"))
def test_remote_io():
db = lancedb.connect(os.environ.get("REMOTE_BASE_URL"))
assert db.table_names() == []
table = db.create_table(

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.4.12"
version = "0.4.13"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.4.12"
version = "0.4.13"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true