mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 13:29:57 +00:00
Compare commits
9 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a022368426 | ||
|
|
8b815ef5a8 | ||
|
|
e4c3a9346c | ||
|
|
1d1f8964d2 | ||
|
|
d326146a40 | ||
|
|
693bca1eba | ||
|
|
343e274ea5 | ||
|
|
a695fb8030 | ||
|
|
bc8670d7af |
@@ -5,8 +5,8 @@ exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.7.5", "features" = ["dynamodb"] }
|
||||
lance-linalg = { "version" = "=0.7.5" }
|
||||
lance = { "version" = "=0.8.1", "features" = ["dynamodb"] }
|
||||
lance-linalg = { "version" = "=0.8.1" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "43.0.0", optional = false }
|
||||
arrow-array = "43.0"
|
||||
|
||||
@@ -154,28 +154,28 @@ You can select the columns returned by the query using a select clause.
|
||||
|
||||
## FAQ
|
||||
|
||||
### When is it necessary to create an ANN vector index.
|
||||
### When is it necessary to create an ANN vector index?
|
||||
|
||||
`LanceDB` has manually tuned SIMD code for computing vector distances.
|
||||
In our benchmarks, computing 100K pairs of 1K dimension vectors only take less than 20ms.
|
||||
For small dataset (<100K rows) or the applications which can accept 100ms latency, vector indices are usually not necessary.
|
||||
`LanceDB` has manually-tuned SIMD code for computing vector distances.
|
||||
In our benchmarks, computing 100K pairs of 1K dimension vectors takes **less than 20ms**.
|
||||
For small datasets (< 100K rows) or applications that can accept 100ms latency, vector indices are usually not necessary.
|
||||
|
||||
For large-scale or higher-dimensional vectors, it is beneficial to create a vector index.
|
||||
|
||||
### How big is my index, and how many memory will it take.
|
||||
### How big is my index, and how much memory will it take?
|
||||
|
||||
In LanceDB, all vector indices are disk-based, meaning that when responding to a vector query, only the relevant pages from the index file are loaded from disk and cached in memory. Additionally, each sub-vector is usually encoded into 1 byte PQ code.
|
||||
In LanceDB, all vector indices are **disk-based**, meaning that when responding to a vector query, only the relevant pages from the index file are loaded from disk and cached in memory. Additionally, each sub-vector is usually encoded into a 1-byte PQ code.
|
||||
|
||||
For example, with a 1024-dimension dataset, if we choose `num_sub_vectors=64`, each sub-vector has `1024 / 64 = 16` float32 numbers.
|
||||
Product quantization can lead to an approximately `16 * sizeof(float32) / 1 = 64`-fold reduction in space.
|
||||
|
||||
### How to choose `num_partitions` and `num_sub_vectors` for `IVF_PQ` index.
|
||||
### How do I choose `num_partitions` and `num_sub_vectors` for an `IVF_PQ` index?
|
||||
|
||||
`num_partitions` is used to decide how many partitions the first level `IVF` index uses.
|
||||
A higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
||||
On the `SIFT-1M` dataset, our benchmark shows that keeping each partition at 1K-4K rows leads to a good latency/recall trade-off.
|
||||
|
||||
`num_sub_vectors` decides how many Product Quantization code to generate on each vector. Because
|
||||
Product Quantization is a lossy compression of the original vector, the more `num_sub_vectors` usually results to
|
||||
less space distortion, and thus yield better accuracy. However, similarly, more `num_sub_vectors` causes heavier I/O and
|
||||
more PQ computation, thus, higher latency. `dimension / num_sub_vectors` should be aligned with 8 for better SIMD efficiency.
|
||||
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
|
||||
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
|
||||
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||
@@ -123,9 +123,15 @@ After a table has been created, you can always add more data to it using
|
||||
|
||||
=== "Python"
|
||||
```python
|
||||
df = pd.DataFrame([{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
||||
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}])
|
||||
tbl.add(df)
|
||||
|
||||
# Option 1: Add a list of dicts to a table
|
||||
data = [{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
||||
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}]
|
||||
tbl.add(data)
|
||||
|
||||
# Option 2: Add a pandas DataFrame to a table
|
||||
df = pd.DataFrame(data)
|
||||
tbl.add(df)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
|
||||
@@ -6,17 +6,19 @@ to make this available for JS as well.
|
||||
|
||||
## Installation
|
||||
|
||||
To use full text search, you must install optional dependency tantivy-py:
|
||||
To use full text search, you must install the dependency `tantivy-py`:
|
||||
|
||||
# tantivy 0.19.2
|
||||
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||
# tantivy 0.20.1
|
||||
```sh
|
||||
pip install tantivy==0.20.1
|
||||
```
|
||||
|
||||
|
||||
## Quickstart
|
||||
|
||||
Assume:
|
||||
1. `table` is a LanceDB Table
|
||||
2. `text` is the name of the Table column that we want to index
|
||||
2. `text` is the name of the `Table` column that we want to index
|
||||
|
||||
For example,
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
import pandas as pd
|
||||
|
||||
data = pd.DataFrame({
|
||||
"vector": [[1.1, 1.2], [0.2, 1.8]],
|
||||
"vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],
|
||||
"lat": [45.5, 40.1],
|
||||
"long": [-122.7, -74.1]
|
||||
})
|
||||
@@ -56,7 +56,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
```python
|
||||
custom_schema = pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("vector", pa.list_(pa.float32(), 4)),
|
||||
pa.field("lat", pa.float32()),
|
||||
pa.field("long", pa.float32())
|
||||
])
|
||||
@@ -70,8 +70,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
```python
|
||||
table = pa.Table.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]],
|
||||
pa.list_(pa.float32(), 2)),
|
||||
pa.array([[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
|
||||
pa.list_(pa.float32(), 4)),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
@@ -131,8 +131,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
for i in range(5):
|
||||
yield pa.RecordBatch.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]],
|
||||
pa.list_(pa.float32(), 2)),
|
||||
pa.array([[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
|
||||
pa.list_(pa.float32(), 4)),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
@@ -140,7 +140,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
)
|
||||
|
||||
schema = pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("vector", pa.list_(pa.float32(), 4)),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float32()),
|
||||
])
|
||||
|
||||
@@ -25,8 +25,8 @@ Currently, we support the following metrics:
|
||||
|
||||
### Flat Search
|
||||
|
||||
If LanceDB does not create a vector index, LanceDB would need to scan (`Flat Search`) the entire vector column
|
||||
and compute the distance for each vector in order to find the closest matches.
|
||||
If you do not create a vector index, LanceDB would need to exhaustively scan the entire vector column (via `Flat Search`)
|
||||
and compute the distance for *every* vector in order to find the closest matches. This is effectively a KNN search.
|
||||
|
||||
|
||||
<!-- Setup Code
|
||||
@@ -110,7 +110,7 @@ as well.
|
||||
|
||||
To accelerate vector retrievals, it is common to build vector indices.
|
||||
A vector index is a data structure specifically designed to efficiently organize and
|
||||
search vector data based on their similarity or distance metrics.
|
||||
search vector data based on their similarity via the chosen distance metric.
|
||||
By constructing a vector index, you can reduce the search space and avoid the need
|
||||
for brute-force scanning of the entire vector column.
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.2.5
|
||||
current_version = 0.3.0
|
||||
commit = True
|
||||
message = [python] Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -26,6 +26,7 @@ import numpy as np
|
||||
import pyarrow as pa
|
||||
from cachetools import cached
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
class EmbeddingFunctionRegistry:
|
||||
@@ -514,7 +515,7 @@ class OpenClipEmbeddings(EmbeddingFunction):
|
||||
executor.submit(self.generate_image_embedding, image)
|
||||
for image in images
|
||||
]
|
||||
return [future.result() for future in futures]
|
||||
return [future.result() for future in tqdm(futures)]
|
||||
|
||||
def generate_image_embedding(
|
||||
self, image: Union[str, bytes, "PIL.Image.Image"]
|
||||
@@ -557,7 +558,7 @@ class OpenClipEmbeddings(EmbeddingFunction):
|
||||
"""
|
||||
encode a single image tensor and optionally normalize the output
|
||||
"""
|
||||
image_features = self._model.encode_image(image_tensor)
|
||||
image_features = self._model.encode_image(image_tensor.to(self.device))
|
||||
if self.normalize:
|
||||
image_features /= image_features.norm(dim=-1, keepdim=True)
|
||||
return image_features.cpu().numpy().squeeze()
|
||||
|
||||
@@ -38,6 +38,9 @@ class Query(pydantic.BaseModel):
|
||||
# sql filter to refine the query with
|
||||
filter: Optional[str] = None
|
||||
|
||||
# if True then apply the filter before vector search
|
||||
prefilter: bool = False
|
||||
|
||||
# top k results to return
|
||||
k: int
|
||||
|
||||
@@ -162,7 +165,7 @@ class LanceQueryBuilder(ABC):
|
||||
for row in self.to_arrow().to_pylist()
|
||||
]
|
||||
|
||||
def limit(self, limit: int) -> LanceVectorQueryBuilder:
|
||||
def limit(self, limit: int) -> LanceQueryBuilder:
|
||||
"""Set the maximum number of results to return.
|
||||
|
||||
Parameters
|
||||
@@ -172,13 +175,13 @@ class LanceQueryBuilder(ABC):
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceVectorQueryBuilder
|
||||
LanceQueryBuilder
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._limit = limit
|
||||
return self
|
||||
|
||||
def select(self, columns: list) -> LanceVectorQueryBuilder:
|
||||
def select(self, columns: list) -> LanceQueryBuilder:
|
||||
"""Set the columns to return.
|
||||
|
||||
Parameters
|
||||
@@ -188,13 +191,13 @@ class LanceQueryBuilder(ABC):
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceVectorQueryBuilder
|
||||
LanceQueryBuilder
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._columns = columns
|
||||
return self
|
||||
|
||||
def where(self, where: str) -> LanceVectorQueryBuilder:
|
||||
def where(self, where) -> LanceQueryBuilder:
|
||||
"""Set the where clause.
|
||||
|
||||
Parameters
|
||||
@@ -204,7 +207,7 @@ class LanceQueryBuilder(ABC):
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceVectorQueryBuilder
|
||||
LanceQueryBuilder
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._where = where
|
||||
@@ -246,6 +249,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
self._nprobes = 20
|
||||
self._refine_factor = None
|
||||
self._vector_column = vector_column
|
||||
self._prefilter = False
|
||||
|
||||
def metric(self, metric: Literal["L2", "cosine"]) -> LanceVectorQueryBuilder:
|
||||
"""Set the distance metric to use.
|
||||
@@ -320,6 +324,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
query = Query(
|
||||
vector=vector,
|
||||
filter=self._where,
|
||||
prefilter=self._prefilter,
|
||||
k=self._limit,
|
||||
metric=self._metric,
|
||||
columns=self._columns,
|
||||
@@ -329,6 +334,30 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
)
|
||||
return self._table._execute_query(query)
|
||||
|
||||
def where(self, where: str, prefilter: bool = False) -> LanceVectorQueryBuilder:
|
||||
"""Set the where clause.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
where: str
|
||||
The where clause.
|
||||
prefilter: bool, default False
|
||||
If True, apply the filter before vector search, otherwise the
|
||||
filter is applied on the result of vector search.
|
||||
This feature is **EXPERIMENTAL** and may be removed or modified
|
||||
without warning in the future. Currently this is only supported
|
||||
in OSS and can only be used with a table that does not have an ANN
|
||||
index.
|
||||
|
||||
Returns
|
||||
-------
|
||||
LanceQueryBuilder
|
||||
The LanceQueryBuilder object.
|
||||
"""
|
||||
self._where = where
|
||||
self._prefilter = prefilter
|
||||
return self
|
||||
|
||||
|
||||
class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
def __init__(self, table: "lancedb.table.Table", query: str):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
import abc
|
||||
from typing import List, Optional
|
||||
|
||||
import attr
|
||||
import attrs
|
||||
import pyarrow as pa
|
||||
from pydantic import BaseModel
|
||||
|
||||
@@ -44,7 +44,7 @@ class VectorQuery(BaseModel):
|
||||
refine_factor: Optional[int] = None
|
||||
|
||||
|
||||
@attr.define
|
||||
@attrs.define
|
||||
class VectorQueryResult:
|
||||
# for now the response is directly serialized into a pandas dataframe
|
||||
tbl: pa.Table
|
||||
|
||||
@@ -16,7 +16,7 @@ import functools
|
||||
from typing import Any, Callable, Dict, Optional, Union
|
||||
|
||||
import aiohttp
|
||||
import attr
|
||||
import attrs
|
||||
import pyarrow as pa
|
||||
from pydantic import BaseModel
|
||||
|
||||
@@ -43,14 +43,14 @@ async def _read_ipc(resp: aiohttp.ClientResponse) -> pa.Table:
|
||||
return reader.read_all()
|
||||
|
||||
|
||||
@attr.define(slots=False)
|
||||
@attrs.define(slots=False)
|
||||
class RestfulLanceDBClient:
|
||||
db_name: str
|
||||
region: str
|
||||
api_key: Credential
|
||||
host_override: Optional[str] = attr.field(default=None)
|
||||
host_override: Optional[str] = attrs.field(default=None)
|
||||
|
||||
closed: bool = attr.field(default=False, init=False)
|
||||
closed: bool = attrs.field(default=False, init=False)
|
||||
|
||||
@functools.cached_property
|
||||
def session(self) -> aiohttp.ClientSession:
|
||||
|
||||
@@ -98,6 +98,8 @@ class RemoteTable(Table):
|
||||
return LanceVectorQueryBuilder(self, query, vector_column_name)
|
||||
|
||||
def _execute_query(self, query: Query) -> pa.Table:
|
||||
if query.prefilter:
|
||||
raise NotImplementedError("Cloud support for prefiltering is coming soon")
|
||||
result = self._conn._client.query(self._name, query)
|
||||
return self._conn._loop.run_until_complete(result).to_arrow()
|
||||
|
||||
|
||||
@@ -844,9 +844,16 @@ class LanceTable(Table):
|
||||
|
||||
def _execute_query(self, query: Query) -> pa.Table:
|
||||
ds = self.to_lance()
|
||||
if query.prefilter:
|
||||
for idx in ds.list_indices():
|
||||
if query.vector_column in idx["fields"]:
|
||||
raise NotImplementedError(
|
||||
"Prefiltering for indexed vector column is coming soon."
|
||||
)
|
||||
return ds.to_table(
|
||||
columns=query.columns,
|
||||
filter=query.filter,
|
||||
prefilter=query.prefilter,
|
||||
nearest={
|
||||
"column": query.vector_column,
|
||||
"q": query.vector,
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.2.5"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"pylance==0.7.4",
|
||||
"ratelimiter",
|
||||
"retry",
|
||||
"tqdm",
|
||||
"pylance==0.8.1",
|
||||
"ratelimiter~=1.0",
|
||||
"retry>=0.9.2",
|
||||
"tqdm>=4.1.0",
|
||||
"aiohttp",
|
||||
"pydantic",
|
||||
"attr",
|
||||
"pydantic>=1.10",
|
||||
"attrs>=21.3.0",
|
||||
"semver>=3.0",
|
||||
"cachetools"
|
||||
]
|
||||
|
||||
@@ -38,6 +38,7 @@ class MockTable:
|
||||
return ds.to_table(
|
||||
columns=query.columns,
|
||||
filter=query.filter,
|
||||
prefilter=query.prefilter,
|
||||
nearest={
|
||||
"column": query.vector_column,
|
||||
"q": query.vector,
|
||||
@@ -97,6 +98,25 @@ def test_query_builder_with_filter(table):
|
||||
assert all(df["vector"].values[0] == [3, 4])
|
||||
|
||||
|
||||
def test_query_builder_with_prefilter(table):
|
||||
df = (
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
.where("id = 2")
|
||||
.limit(1)
|
||||
.to_df()
|
||||
)
|
||||
assert len(df) == 0
|
||||
|
||||
df = (
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
.where("id = 2", prefilter=True)
|
||||
.limit(1)
|
||||
.to_df()
|
||||
)
|
||||
assert df["id"].values[0] == 2
|
||||
assert all(df["vector"].values[0] == [3, 4])
|
||||
|
||||
|
||||
def test_query_builder_with_metric(table):
|
||||
query = [4, 8]
|
||||
vector_column_name = "vector"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import attr
|
||||
import attrs
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
@@ -21,10 +21,10 @@ from aiohttp import web
|
||||
from lancedb.remote.client import RestfulLanceDBClient, VectorQuery
|
||||
|
||||
|
||||
@attr.define
|
||||
@attrs.define
|
||||
class MockLanceDBServer:
|
||||
runner: web.AppRunner = attr.field(init=False)
|
||||
site: web.TCPSite = attr.field(init=False)
|
||||
runner: web.AppRunner = attrs.field(init=False)
|
||||
site: web.TCPSite = attrs.field(init=False)
|
||||
|
||||
async def query_handler(self, request: web.Request) -> web.Response:
|
||||
table_name = request.match_info["table_name"]
|
||||
|
||||
@@ -12,8 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::{ivf::IvfBuildParams, pq::PQBuildParams};
|
||||
use lance_linalg::distance::MetricType;
|
||||
use neon::context::FunctionContext;
|
||||
use neon::prelude::*;
|
||||
@@ -79,11 +78,9 @@ fn get_index_params_builder(
|
||||
|
||||
num_partitions.map(|np| {
|
||||
let max_iters = max_iters.unwrap_or(50);
|
||||
let ivf_params = IvfBuildParams {
|
||||
num_partitions: np,
|
||||
max_iters,
|
||||
centroids: None,
|
||||
};
|
||||
let mut ivf_params = IvfBuildParams::default();
|
||||
ivf_params.num_partitions = np;
|
||||
ivf_params.max_iters = max_iters;
|
||||
index_builder.ivf_params(ivf_params)
|
||||
});
|
||||
|
||||
|
||||
@@ -190,9 +190,8 @@ impl Table {
|
||||
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
|
||||
use lance::index::DatasetIndexExt;
|
||||
|
||||
let dataset = self
|
||||
.dataset
|
||||
.create_index(
|
||||
let mut dataset = self.dataset.as_ref().clone();
|
||||
dataset.create_index(
|
||||
&[index_builder
|
||||
.get_column()
|
||||
.unwrap_or(VECTOR_COLUMN_NAME.to_string())
|
||||
|
||||
Reference in New Issue
Block a user