mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 05:49:57 +00:00
Compare commits
48 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
342b726ed7 | ||
|
|
159b175316 | ||
|
|
7876156d54 | ||
|
|
d64e85e9d7 | ||
|
|
3e79b4d9cb | ||
|
|
3eac75e61a | ||
|
|
b19ce10184 | ||
|
|
ce34d055af | ||
|
|
8bf4d169e2 | ||
|
|
4f7f33f7b7 | ||
|
|
32b21e1d20 | ||
|
|
6062bfdb8f | ||
|
|
93a5c5c15c | ||
|
|
99310e099e | ||
|
|
85dda53779 | ||
|
|
d7c5793803 | ||
|
|
08e67d04bb | ||
|
|
ec197b1855 | ||
|
|
23d4e3561f | ||
|
|
de6bfab124 | ||
|
|
d7fb2b1d6b | ||
|
|
cdb534076f | ||
|
|
c38d80cab2 | ||
|
|
45e02bb62b | ||
|
|
b3fdabdf45 | ||
|
|
1c3f9f1e3b | ||
|
|
f0ea1d898b | ||
|
|
3ba7fa15a4 | ||
|
|
370867836c | ||
|
|
682f09480c | ||
|
|
cd8807bc97 | ||
|
|
41c44ae92e | ||
|
|
6865d66d37 | ||
|
|
aeecd809cc | ||
|
|
3360678d60 | ||
|
|
177eddfc20 | ||
|
|
d735a69b6e | ||
|
|
a2bd2854e1 | ||
|
|
c32b6880e7 | ||
|
|
c6fe5e38f1 | ||
|
|
1c8b52f07b | ||
|
|
f544c5dd31 | ||
|
|
eba533da4f | ||
|
|
404211d4fb | ||
|
|
5d7832c8a5 | ||
|
|
4eba83fdc9 | ||
|
|
6906a5f912 | ||
|
|
69fd80e9f2 |
@@ -41,9 +41,14 @@ pip install lancedb
|
||||
```python
|
||||
import lancedb
|
||||
|
||||
uri = "/tmp/lancedb"
|
||||
db = lancedb.connect(uri)
|
||||
table = db.create_table("my_table",
|
||||
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
result = table.search([100, 100]).limit(2).to_df()
|
||||
```
|
||||
|
||||
## Blogs, Tutorials & Videos
|
||||
* 📈 <a href="https://blog.eto.ai/benchmarking-random-access-in-lance-ed690757a826">2000x better performance with Lance over Parquet</a>
|
||||
* 🤖 <a href="https://github.com/lancedb/lancedb/blob/main/notebooks/youtube_transcript_search.ipynb">Build a question and answer bot with LanceDB</a>
|
||||
|
||||
@@ -3,6 +3,8 @@ docs_dir: src
|
||||
|
||||
theme:
|
||||
name: "material"
|
||||
features:
|
||||
- content.code.copy
|
||||
|
||||
plugins:
|
||||
- search
|
||||
@@ -11,4 +13,16 @@ plugins:
|
||||
|
||||
nav:
|
||||
- Home: index.md
|
||||
- Basics: basic.md
|
||||
- Embeddings: embedding.md
|
||||
- Integrations: integrations.md
|
||||
- Python API: python.md
|
||||
|
||||
markdown_extensions:
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
line_spans: __span
|
||||
pygments_lang_class: true
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.snippets
|
||||
- pymdownx.superfences
|
||||
|
||||
80
docs/src/ann_indexes.md
Normal file
80
docs/src/ann_indexes.md
Normal file
@@ -0,0 +1,80 @@
|
||||
# ANN (Approximate Nearest Neighbor) Indexes
|
||||
|
||||
You can create an index over your vector data to make search faster. Vector indexes are faster but less accurate than exhaustive search. LanceDB provides many parameters to fine-tune the index's size, the speed of queries, and the accuracy of results.
|
||||
|
||||
Currently, LanceDB does not automatically create the ANN index. In the future we will look to improve this experience and automate index creation and configuration.
|
||||
|
||||
## Creating an ANN Index
|
||||
|
||||
Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) function.
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
import numpy as np
|
||||
uri = "~/.lancedb"
|
||||
db = lancedb.connect(uri)
|
||||
|
||||
# Create 10,000 sample vectors
|
||||
data = [{"vector": row, "item": f"item {i}"}
|
||||
for i, row in enumerate(np.random.random((10_000, 768)).astype('float32'))]
|
||||
|
||||
# Add the vectors to a table
|
||||
tbl = db.create_table("my_vectors", data=data)
|
||||
|
||||
# Create and train the index - you need to have enough data in the table for an effective training step
|
||||
tbl.create_index(num_partitions=256, num_sub_vectors=96)
|
||||
```
|
||||
|
||||
Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
|
||||
creation by providing the following parameters:
|
||||
|
||||
- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table
|
||||
with ~1M vectors should use 256 partitions. You can specify an arbitrary number of partitions, but powers of 2 are most conventional.
|
||||
A higher number leads to faster queries, but it makes index generation slower.
|
||||
- **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes
|
||||
search more accurate, but also makes the index larger and slower to build.
|
||||
|
||||
## Querying an ANN Index
|
||||
|
||||
Querying vector indexes is done via the [search](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.search) function.
|
||||
|
||||
There are a couple of parameters that can be used to fine-tune the search:
|
||||
|
||||
- **limit** (default: 10): The number of results that will be returned.
|
||||
- **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower.
|
||||
- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes
|
||||
search more accurate but also slower.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.random((768))) \
|
||||
.limit(2) \
|
||||
.nprobes(20) \
|
||||
.refine_factor(20) \
|
||||
.to_df()
|
||||
|
||||
vector item score
|
||||
0 [0.44949695, 0.8444449, 0.06281311, 0.23338133... item 1141 103.575333
|
||||
1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867
|
||||
```
|
||||
|
||||
The search will return the data requested in addition to the score of each item. The score is the distance between the query vector and the element. A lower number means that the result is more relevant.
|
||||
|
||||
### Filtering (where clause)
|
||||
|
||||
You can further filter the elements returned by a search using a where clause.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.random((768))).where("item != 'item 1141'").to_df()
|
||||
```
|
||||
|
||||
### Projections (select clause)
|
||||
|
||||
You can select the columns returned by the query using a select clause.
|
||||
|
||||
```python
|
||||
tbl.search(np.random.random((768))).select(["vector"]).to_df()
|
||||
vector score
|
||||
0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092
|
||||
1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485
|
||||
...
|
||||
```
|
||||
77
docs/src/basic.md
Normal file
77
docs/src/basic.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Basic LanceDB Functionality
|
||||
|
||||
## How to connect to a database
|
||||
|
||||
In local mode, LanceDB stores data in a directory on your local machine. To connect to a local database, you can use the following code:
|
||||
```python
|
||||
import lancedb
|
||||
uri = "~/.lancedb"
|
||||
db = lancedb.connect(uri)
|
||||
```
|
||||
|
||||
LanceDB will create the directory if it doesn't exist (including parent directories).
|
||||
|
||||
If you need a reminder of the uri, use the `db.uri` property.
|
||||
|
||||
## How to create a table
|
||||
|
||||
To create a table, you can use the following code:
|
||||
```python
|
||||
tbl = db.create_table("my_table",
|
||||
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
```
|
||||
|
||||
Under the hood, LanceDB is converting the input data into an Apache Arrow table
|
||||
and persisting it to disk in [Lance format](https://github.com/eto-ai/lance).
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
If you want to overwrite the table, you can pass in `mode="overwrite"`
|
||||
to the `create_table` method.
|
||||
|
||||
You can also pass in a pandas DataFrame directly:
|
||||
```python
|
||||
import pandas as pd
|
||||
df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
tbl = db.create_table("table_from_df", data=df)
|
||||
```
|
||||
|
||||
## How to open an existing table
|
||||
|
||||
Once created, you can open a table using the following code:
|
||||
```python
|
||||
tbl = db.open_table("my_table")
|
||||
```
|
||||
|
||||
If you forget the name of your table, you can always get a listing of all table names:
|
||||
|
||||
```python
|
||||
db.table_names()
|
||||
```
|
||||
|
||||
## How to add data to a table
|
||||
|
||||
After a table has been created, you can always add more data to it using the `add` method:
|
||||
|
||||
```python
|
||||
df = pd.DataFrame([{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
||||
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}])
|
||||
tbl.add(df)
|
||||
```
|
||||
|
||||
## How to search for (approximate) nearest neighbors
|
||||
|
||||
Once you've embedded the query, you can find its nearest neighbors using the following code:
|
||||
|
||||
```python
|
||||
tbl.search([100, 100]).limit(2).to_df()
|
||||
```
|
||||
|
||||
This returns a pandas DataFrame with the results.
|
||||
|
||||
## What's next
|
||||
|
||||
This section covered the very basics of the LanceDB API.
|
||||
LanceDB supports many additional features when creating indices to speed up search and options for search.
|
||||
These are contained in the next section of the documentation.
|
||||
97
docs/src/embedding.md
Normal file
97
docs/src/embedding.md
Normal file
@@ -0,0 +1,97 @@
|
||||
# Embedding Functions
|
||||
|
||||
Embeddings are high dimensional floating-point vector representations of your data or query.
|
||||
Anything can be embedded using some embedding model or function.
|
||||
For a given embedding function, the output will always have the same number of dimensions.
|
||||
|
||||
## Creating an embedding function
|
||||
|
||||
Any function that takes as input a batch (list) of data and outputs a batch (list) of embeddings
|
||||
can be used by LanceDB as an embedding function. The input and output batch sizes should be the same.
|
||||
|
||||
### HuggingFace example
|
||||
|
||||
One popular free option would be to use the [sentence-transformers](https://www.sbert.net/) library from HuggingFace.
|
||||
You can install this using pip: `pip install sentence-transformers`.
|
||||
|
||||
```python
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
name="paraphrase-albert-small-v2"
|
||||
model = SentenceTransformer(name)
|
||||
|
||||
# used for both training and querying
|
||||
def embed_func(batch):
|
||||
return [model.encode(sentence) for sentence in batch]
|
||||
```
|
||||
|
||||
### OpenAI example
|
||||
|
||||
You can also use an external API like OpenAI to generate embeddings:
|
||||
|
||||
```python
|
||||
import openai
|
||||
import os
|
||||
|
||||
# Configuring the environment variable OPENAI_API_KEY
|
||||
if "OPENAI_API_KEY" not in os.environ:
|
||||
# OR set the key here as a variable
|
||||
openai.api_key = "sk-..."
|
||||
|
||||
# verify that the API key is working
|
||||
assert len(openai.Model.list()["data"]) > 0
|
||||
|
||||
def embed_func(c):
|
||||
rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
|
||||
return [record["embedding"] for record in rs["data"]]
|
||||
```
|
||||
|
||||
## Applying an embedding function
|
||||
|
||||
Using an embedding function, you can apply it to raw data
|
||||
to generate embeddings for each row.
|
||||
|
||||
Say you have a pandas DataFrame with a `text` column that you want to embed;
|
||||
you can use the [with_embeddings](https://lancedb.github.io/lancedb/python/#lancedb.embeddings.with_embeddings)
|
||||
function to generate embeddings and create a combined pyarrow table:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
from lancedb.embeddings import with_embeddings
|
||||
|
||||
df = pd.DataFrame([{"text": "pepperoni"},
|
||||
{"text": "pineapple"}])
|
||||
data = with_embeddings(embed_func, df)
|
||||
|
||||
# The output is used to create / append to a table
|
||||
# db.create_table("my_table", data=data)
|
||||
```
|
||||
|
||||
If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`.
|
||||
|
||||
By default, LanceDB calls the function with batches of 1000 rows. This can be configured
|
||||
using the `batch_size` parameter to `with_embeddings`.
|
||||
|
||||
LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI
|
||||
API call is reliable.
|
||||
|
||||
## Searching with an embedding function
|
||||
|
||||
At inference time, you also need the same embedding function to embed your query text.
|
||||
It's important that you use the same model / function otherwise the embedding vectors don't
|
||||
belong in the same latent space and your results will be nonsensical.
|
||||
|
||||
```python
|
||||
query = "What's the best pizza topping?"
|
||||
query_vector = embed_func([query])[0]
|
||||
tbl.search(query_vector).limit(10).to_df()
|
||||
```
|
||||
|
||||
The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
|
||||
|
||||
## Roadmap
|
||||
|
||||
In the near future, we'll be integrating the embedding functions deeper into LanceDB.<br/>
|
||||
The goal is that you just have to configure the function once when you create the table,
|
||||
and then you'll never have to deal with embeddings / vectors after that unless you want to.
|
||||
We'll also integrate more popular models and APIs.
|
||||
@@ -33,7 +33,14 @@ table = db.create_table("my_table",
|
||||
result = table.search([100, 100]).limit(2).to_df()
|
||||
```
|
||||
|
||||
## Complete Demos
|
||||
|
||||
We will be adding completed demo apps built using LanceDB.
|
||||
- [YouTube Transcript Search](../notebooks/youtube_transcript_search.ipynb)
|
||||
|
||||
|
||||
## Documentation Quick Links
|
||||
|
||||
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
|
||||
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
|
||||
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with the Python data tooling ecosystem.
|
||||
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.
|
||||
|
||||
111
docs/src/integrations.md
Normal file
111
docs/src/integrations.md
Normal file
@@ -0,0 +1,111 @@
|
||||
# Integrations
|
||||
|
||||
Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB.
|
||||
|
||||
## Pandas and PyArrow
|
||||
|
||||
First, we need to connect to a `LanceDB` database.
|
||||
|
||||
``` py
|
||||
|
||||
import lancedb
|
||||
|
||||
db = lancedb.connect("/tmp/lancedb")
|
||||
```
|
||||
|
||||
And write a `Pandas DataFrame` to LanceDB directly.
|
||||
|
||||
```py
|
||||
import pandas as pd
|
||||
|
||||
data = pd.DataFrame({
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0]
|
||||
})
|
||||
table = db.create_table("pd_table", data=data)
|
||||
|
||||
# Optionally, create an IVF_PQ index
|
||||
table.create_index(num_partitions=256, num_sub_vectors=96)
|
||||
```
|
||||
|
||||
You will find detailed instructions for creating a dataset and an index in the [Basic Operations](basic.md) and [Indexing](indexing.md)
|
||||
sections.
|
||||
|
||||
|
||||
We can now perform similarity searches via `LanceDB`.
|
||||
|
||||
```py
|
||||
# Open the table previously created.
|
||||
table = db.open_table("pd_table")
|
||||
|
||||
query_vector = [100, 100]
|
||||
# Pandas DataFrame
|
||||
df = table.search(query_vector).limit(1).to_df()
|
||||
print(df)
|
||||
```
|
||||
|
||||
```
|
||||
vector item price score
|
||||
0 [5.9, 26.5] bar 20.0 14257.05957
|
||||
```
|
||||
|
||||
If you have a simple filter, it's faster to provide a where clause to `LanceDB`'s search query.
|
||||
If you have more complex criteria, you can always apply the filter to the resulting pandas `DataFrame` from the search query.
|
||||
|
||||
```python
|
||||
|
||||
# Apply the filter via LanceDB
|
||||
results = table.search([100, 100]).where("price < 15").to_df()
|
||||
assert len(results) == 1
|
||||
assert results["item"].iloc[0] == "foo"
|
||||
|
||||
# Apply the filter via Pandas
|
||||
df = results = table.search([100, 100]).to_df()
|
||||
results = df[df.price < 15]
|
||||
assert len(results) == 1
|
||||
assert results["item"].iloc[0] == "foo"
|
||||
```
|
||||
|
||||
## DuckDB
|
||||
|
||||
`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).
|
||||
|
||||
Let us start with installing `duckdb` and `lancedb`.
|
||||
|
||||
```shell
|
||||
pip install duckdb lancedb
|
||||
```
|
||||
|
||||
We will re-use the dataset created previously:
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
|
||||
db = lancedb.connect("/tmp/lancedb")
|
||||
table = db.open_table("pd_table")
|
||||
arrow_table = table.to_arrow()
|
||||
```
|
||||
|
||||
`DuckDB` can directly query the `arrow_table`:
|
||||
|
||||
```python
|
||||
In [15]: duckdb.query("SELECT * FROM arrow_table")
|
||||
Out[15]:
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
|
||||
In [16]: duckdb.query("SELECT mean(price) FROM arrow_table")
|
||||
Out[16]:
|
||||
┌─────────────┐
|
||||
│ mean(price) │
|
||||
│ double │
|
||||
├─────────────┤
|
||||
│ 15.0 │
|
||||
└─────────────┘
|
||||
```
|
||||
@@ -10,3 +10,5 @@ pip install lancedb
|
||||
::: lancedb.db
|
||||
::: lancedb.table
|
||||
::: lancedb.query
|
||||
::: lancedb.embeddings
|
||||
::: lancedb.context
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -55,7 +55,11 @@ class LanceDBConnection:
|
||||
return self.open_table(name)
|
||||
|
||||
def create_table(
|
||||
self, name: str, data: DATA = None, schema: pa.Schema = None
|
||||
self,
|
||||
name: str,
|
||||
data: DATA = None,
|
||||
schema: pa.Schema = None,
|
||||
mode: str = "create",
|
||||
) -> LanceTable:
|
||||
"""Create a table in the database.
|
||||
|
||||
@@ -67,6 +71,10 @@ class LanceDBConnection:
|
||||
The data to insert into the table.
|
||||
schema: pyarrow.Schema; optional
|
||||
The schema of the table.
|
||||
mode: str; default "create"
|
||||
The mode to use when creating the table.
|
||||
By default, if the table already exists, an exception is raised.
|
||||
If you want to overwrite the table, use mode="overwrite".
|
||||
|
||||
Note
|
||||
----
|
||||
@@ -78,7 +86,7 @@ class LanceDBConnection:
|
||||
A LanceTable object representing the table.
|
||||
"""
|
||||
if data is not None:
|
||||
tbl = LanceTable.create(self, name, data, schema)
|
||||
tbl = LanceTable.create(self, name, data, schema, mode=mode)
|
||||
else:
|
||||
tbl = LanceTable(self, name)
|
||||
return tbl
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import ratelimiter
|
||||
from retry import retry
|
||||
from typing import Callable, Union
|
||||
|
||||
@@ -32,11 +31,12 @@ def with_embeddings(
|
||||
):
|
||||
func = EmbeddingFunction(func)
|
||||
if wrap_api:
|
||||
func = func.retry().rate_limit().batch_size(batch_size)
|
||||
func = func.retry().rate_limit()
|
||||
func = func.batch_size(batch_size)
|
||||
if show_progress:
|
||||
func = func.show_progress()
|
||||
if isinstance(data, pd.DataFrame):
|
||||
data = pa.Table.from_pandas(data)
|
||||
data = pa.Table.from_pandas(data, preserve_index=False)
|
||||
embeddings = func(data[column].to_numpy())
|
||||
table = vec_to_table(np.array(embeddings))
|
||||
return data.append_column("vector", table["vector"])
|
||||
@@ -52,23 +52,38 @@ class EmbeddingFunction:
|
||||
|
||||
def __call__(self, text):
|
||||
# Get the embedding with retry
|
||||
@retry(**self.retry_kwargs)
|
||||
def embed_func(c):
|
||||
return self.func(c.tolist())
|
||||
if len(self.retry_kwargs) > 0:
|
||||
|
||||
max_calls = self.rate_limiter_kwargs["max_calls"]
|
||||
limiter = ratelimiter.RateLimiter(
|
||||
max_calls, period=self.rate_limiter_kwargs["period"]
|
||||
)
|
||||
rate_limited = limiter(embed_func)
|
||||
@retry(**self.retry_kwargs)
|
||||
def embed_func(c):
|
||||
return self.func(c.tolist())
|
||||
|
||||
else:
|
||||
|
||||
def embed_func(c):
|
||||
return self.func(c.tolist())
|
||||
|
||||
if len(self.rate_limiter_kwargs) > 0:
|
||||
import ratelimiter
|
||||
|
||||
max_calls = self.rate_limiter_kwargs["max_calls"]
|
||||
limiter = ratelimiter.RateLimiter(
|
||||
max_calls, period=self.rate_limiter_kwargs["period"]
|
||||
)
|
||||
embed_func = limiter(embed_func)
|
||||
batches = self.to_batches(text)
|
||||
embeds = [emb for c in batches for emb in rate_limited(c)]
|
||||
embeds = [emb for c in batches for emb in embed_func(c)]
|
||||
return embeds
|
||||
|
||||
def __repr__(self):
|
||||
return f"EmbeddingFunction(func={self.func})"
|
||||
|
||||
def rate_limit(self, max_calls=0.9, period=1.0):
|
||||
import sys
|
||||
|
||||
v = int(sys.version_info.minor)
|
||||
if v >= 11:
|
||||
raise ValueError("rate limit only support up to 3.10")
|
||||
self.rate_limiter_kwargs = dict(max_calls=max_calls, period=period)
|
||||
return self
|
||||
|
||||
@@ -102,4 +117,4 @@ class EmbeddingFunction:
|
||||
|
||||
yield from tqdm(_chunker(arr), total=math.ceil(length / self._batch_size))
|
||||
else:
|
||||
return _chunker(arr)
|
||||
yield from _chunker(arr)
|
||||
|
||||
@@ -46,15 +46,62 @@ class LanceTable:
|
||||
A table in a LanceDB database.
|
||||
"""
|
||||
|
||||
def __init__(self, connection: "lancedb.db.LanceDBConnection", name: str):
|
||||
def __init__(
|
||||
self, connection: "lancedb.db.LanceDBConnection", name: str, version: int = None
|
||||
):
|
||||
self._conn = connection
|
||||
self.name = name
|
||||
self._version = version
|
||||
|
||||
def _reset_dataset(self):
|
||||
try:
|
||||
del self.__dict__["_dataset"]
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
"""Return the schema of the table."""
|
||||
return self._dataset.schema
|
||||
|
||||
def list_versions(self):
|
||||
"""List all versions of the table"""
|
||||
return self._dataset.versions()
|
||||
|
||||
@property
|
||||
def version(self):
|
||||
"""Get the current version of the table"""
|
||||
return self._dataset.version
|
||||
|
||||
def checkout(self, version: int):
|
||||
"""Checkout a version of the table"""
|
||||
max_ver = max([v["version"] for v in self._dataset.versions()])
|
||||
if version < 1 or version > max_ver:
|
||||
raise ValueError(f"Invalid version {version}")
|
||||
self._version = version
|
||||
self._reset_dataset()
|
||||
|
||||
def __len__(self):
|
||||
return self._dataset.count_rows()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"LanceTable({self.name})"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def head(self, n=5) -> pa.Table:
|
||||
"""Return the first n rows of the table."""
|
||||
return self._dataset.head(n)
|
||||
|
||||
def to_pandas(self) -> pd.DataFrame:
|
||||
"""Return the table as a pandas DataFrame."""
|
||||
return self.to_arrow().to_pandas()
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
"""Return the table as a pyarrow Table."""
|
||||
return self._dataset.to_table()
|
||||
|
||||
@property
|
||||
def _dataset_uri(self) -> str:
|
||||
return os.path.join(self._conn.uri, f"{self.name}.lance")
|
||||
@@ -71,16 +118,17 @@ class LanceTable:
|
||||
The number of PQ sub-vectors to use when creating the index.
|
||||
Default is 96.
|
||||
"""
|
||||
return self._dataset.create_index(
|
||||
self._dataset.create_index(
|
||||
column=VECTOR_COLUMN_NAME,
|
||||
index_type="IVF_PQ",
|
||||
num_partitions=num_partitions,
|
||||
num_sub_vectors=num_sub_vectors,
|
||||
)
|
||||
self._reset_dataset()
|
||||
|
||||
@cached_property
|
||||
def _dataset(self) -> LanceDataset:
|
||||
return lance.dataset(self._dataset_uri)
|
||||
return lance.dataset(self._dataset_uri, version=self._version)
|
||||
|
||||
def to_lance(self) -> LanceDataset:
|
||||
"""Return the LanceDataset backing this table."""
|
||||
@@ -102,8 +150,9 @@ class LanceTable:
|
||||
The number of vectors added to the table.
|
||||
"""
|
||||
data = _sanitize_data(data, self.schema)
|
||||
ds = lance.write_dataset(data, self._dataset_uri, mode=mode)
|
||||
return ds.count_rows()
|
||||
lance.write_dataset(data, self._dataset_uri, mode=mode)
|
||||
self._reset_dataset()
|
||||
return len(self)
|
||||
|
||||
def search(self, query: VEC) -> LanceQueryBuilder:
|
||||
"""Create a search query to find the nearest neighbors
|
||||
@@ -127,10 +176,10 @@ class LanceTable:
|
||||
return LanceQueryBuilder(self, query)
|
||||
|
||||
@classmethod
|
||||
def create(cls, db, name, data, schema=None):
|
||||
def create(cls, db, name, data, schema=None, mode="create"):
|
||||
tbl = LanceTable(db, name)
|
||||
data = _sanitize_data(data, schema)
|
||||
lance.write_dataset(data, tbl._dataset_uri, mode="create")
|
||||
lance.write_dataset(data, tbl._dataset_uri, mode=mode)
|
||||
return tbl
|
||||
|
||||
|
||||
@@ -150,6 +199,7 @@ def _sanitize_schema(data: pa.Table, schema: pa.Schema = None) -> pa.Table:
|
||||
return data
|
||||
# cast the columns to the expected types
|
||||
data = data.combine_chunks()
|
||||
data = _sanitize_vector_column(data, vector_column_name=VECTOR_COLUMN_NAME)
|
||||
return pa.Table.from_arrays(
|
||||
[data[name] for name in schema.names], schema=schema
|
||||
)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.0.2"
|
||||
dependencies = ["pylance", "ratelimiter", "retry", "tqdm"]
|
||||
version = "0.1"
|
||||
dependencies = ["pylance>=0.4.3", "ratelimiter", "retry", "tqdm"]
|
||||
description = "lancedb"
|
||||
authors = [
|
||||
{ name = "Lance Devs", email = "dev@eto.ai" },
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
# limitations under the License.
|
||||
|
||||
import lancedb
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
|
||||
def test_basic(tmp_path):
|
||||
@@ -40,3 +42,57 @@ def test_basic(tmp_path):
|
||||
assert len(db) == 1
|
||||
|
||||
assert db.open_table("test").name == db["test"].name
|
||||
|
||||
|
||||
def test_ingest_pd(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
assert db.uri == str(tmp_path)
|
||||
assert db.table_names() == []
|
||||
|
||||
data = pd.DataFrame(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
}
|
||||
)
|
||||
table = db.create_table("test", data=data)
|
||||
rs = table.search([100, 100]).limit(1).to_df()
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "bar"
|
||||
|
||||
rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
|
||||
assert len(rs) == 1
|
||||
assert rs["item"].iloc[0] == "foo"
|
||||
|
||||
assert db.table_names() == ["test"]
|
||||
assert "test" in db
|
||||
assert len(db) == 1
|
||||
|
||||
assert db.open_table("test").name == db["test"].name
|
||||
|
||||
|
||||
def test_create_mode(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
data = pd.DataFrame(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["foo", "bar"],
|
||||
"price": [10.0, 20.0],
|
||||
}
|
||||
)
|
||||
db.create_table("test", data=data)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
db.create_table("test", data=data)
|
||||
|
||||
new_data = pd.DataFrame(
|
||||
{
|
||||
"vector": [[3.1, 4.1], [5.9, 26.5]],
|
||||
"item": ["fizz", "buzz"],
|
||||
"price": [10.0, 20.0],
|
||||
}
|
||||
)
|
||||
tbl = db.create_table("test", data=new_data, mode="overwrite")
|
||||
assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]
|
||||
|
||||
42
python/tests/test_embeddings.py
Normal file
42
python/tests/test_embeddings.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# Copyright 2023 LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
|
||||
from lancedb.embeddings import with_embeddings
|
||||
|
||||
|
||||
def mock_embed_func(input_data):
|
||||
return [np.random.randn(128).tolist() for _ in range(len(input_data))]
|
||||
|
||||
|
||||
def test_with_embeddings():
|
||||
for wrap_api in [True, False]:
|
||||
if wrap_api and sys.version_info.minor >= 11:
|
||||
# ratelimiter package doesn't work on 3.11
|
||||
continue
|
||||
data = pa.Table.from_arrays(
|
||||
[
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
names=["text", "price"],
|
||||
)
|
||||
data = with_embeddings(mock_embed_func, data, wrap_api=wrap_api)
|
||||
assert data.num_columns == 3
|
||||
assert data.num_rows == 2
|
||||
assert data.column_names == ["text", "price", "vector"]
|
||||
assert data.column("text").to_pylist() == ["foo", "bar"]
|
||||
assert data.column("price").to_pylist() == [10.0, 20.0]
|
||||
@@ -46,17 +46,17 @@ def test_basic(db):
|
||||
assert table.to_lance().to_table() == ds.to_table()
|
||||
|
||||
|
||||
def test_add(db):
|
||||
def test_create_table(db):
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32())),
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("item", pa.string()),
|
||||
pa.field("price", pa.float32()),
|
||||
]
|
||||
)
|
||||
expected = pa.Table.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]]),
|
||||
pa.FixedSizeListArray.from_arrays(pa.array([3.1, 4.1, 5.9, 26.5]), 2),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
@@ -79,3 +79,61 @@ def test_add(db):
|
||||
.to_table()
|
||||
)
|
||||
assert expected == tbl
|
||||
|
||||
|
||||
def test_add(db):
|
||||
table = LanceTable.create(
|
||||
db,
|
||||
"test",
|
||||
data=[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
||||
],
|
||||
)
|
||||
|
||||
# table = LanceTable(db, "test")
|
||||
assert len(table) == 2
|
||||
|
||||
count = table.add([{"vector": [6.3, 100.5], "item": "new", "price": 30.0}])
|
||||
assert count == 3
|
||||
|
||||
expected = pa.Table.from_arrays(
|
||||
[
|
||||
pa.FixedSizeListArray.from_arrays(
|
||||
pa.array([3.1, 4.1, 5.9, 26.5, 6.3, 100.5]), 2
|
||||
),
|
||||
pa.array(["foo", "bar", "new"]),
|
||||
pa.array([10.0, 20.0, 30.0]),
|
||||
],
|
||||
schema=pa.schema(
|
||||
[
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("item", pa.string()),
|
||||
pa.field("price", pa.float64()),
|
||||
]
|
||||
),
|
||||
)
|
||||
assert expected == table.to_arrow()
|
||||
|
||||
|
||||
def test_versioning(db):
|
||||
table = LanceTable.create(
|
||||
db,
|
||||
"test",
|
||||
data=[
|
||||
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
|
||||
],
|
||||
)
|
||||
|
||||
assert len(table.list_versions()) == 1
|
||||
assert table.version == 1
|
||||
|
||||
table.add([{"vector": [6.3, 100.5], "item": "new", "price": 30.0}])
|
||||
assert len(table.list_versions()) == 2
|
||||
assert table.version == 2
|
||||
assert len(table) == 3
|
||||
|
||||
table.checkout(1)
|
||||
assert table.version == 1
|
||||
assert len(table) == 2
|
||||
|
||||
Reference in New Issue
Block a user