Compare commits

...

27 Commits
v0.0.4 ... v0.1

Author SHA1 Message Date
Chang She
342b726ed7 bump version for v0.1 2023-04-19 23:26:46 -07:00
Chang She
159b175316 Merge pull request #34 from lancedb/changhiskhan/overwrite-table
Add mode to overwrite table if already exists
2023-04-19 21:11:56 -07:00
Chang She
7876156d54 Merge pull request #36 from lancedb/gsilvestrin/ann_docs
[DOC] ann indexes documentation
2023-04-19 21:11:26 -07:00
Chang She
d64e85e9d7 Merge pull request #35 from lancedb/changhiskhan/table-versioning
expose methods to work with versioning in tables
2023-04-19 21:09:32 -07:00
gsilvestrin
3e79b4d9cb review comments 2023-04-19 20:33:23 -07:00
gsilvestrin
3eac75e61a review comments 2023-04-19 20:23:18 -07:00
gsilvestrin
b19ce10184 review comments 2023-04-19 19:28:40 -07:00
gsilvestrin
ce34d055af search docs 2023-04-19 19:26:27 -07:00
gsilvestrin
8bf4d169e2 wip ann indexes doc 2023-04-19 17:23:31 -07:00
gsilvestrin
4f7f33f7b7 wip ann indexes doc 2023-04-19 17:23:06 -07:00
gsilvestrin
32b21e1d20 wip ann indexes doc 2023-04-19 17:20:58 -07:00
Chang She
6062bfdb8f fix docs link 2023-04-19 16:59:30 -07:00
Chang She
93a5c5c15c Merge pull request #33 from lancedb/changhiskhan/doc-embedding-function
[DOC] embedding function documentation
2023-04-19 16:55:22 -07:00
Chang She
99310e099e expose methods to work with versioning in tables 2023-04-19 16:48:06 -07:00
Chang She
85dda53779 review comments 2023-04-19 16:35:48 -07:00
Chang She
d7c5793803 Add mode to overwrite table if already exists 2023-04-19 16:22:11 -07:00
Chang She
08e67d04bb [DOC] embedding function documentation 2023-04-19 15:54:32 -07:00
Lei Xu
ec197b1855 Merge pull request #31 from lancedb/lei/doc
[Doc] Pandas, Parrow, DuckDB integration
2023-04-19 14:55:42 -07:00
Lei Xu
23d4e3561f add link to basic and indexing 2023-04-19 14:53:45 -07:00
Lei Xu
de6bfab124 comments 2023-04-19 14:42:43 -07:00
Chang She
d7fb2b1d6b Merge pull request #32 from lancedb/changhiskhan/doc-basic-ops
[DOC] basic operations
2023-04-19 14:39:40 -07:00
Chang She
cdb534076f [DOC] basic operations 2023-04-19 14:29:03 -07:00
Lei Xu
c38d80cab2 remove print 2023-04-19 14:26:07 -07:00
Lei Xu
45e02bb62b no polars for now 2023-04-19 14:24:58 -07:00
Lei Xu
b3fdabdf45 use python and arrow 2023-04-19 14:15:18 -07:00
Chang She
1c3f9f1e3b Merge pull request #26 from lancedb/changhiskhan/invalidate-dataset
invalidate cached dataset after create_index and add
2023-04-19 14:02:29 -07:00
Chang She
f0ea1d898b invalidate cached dataset after create_index and add 2023-04-18 16:51:26 -07:00
12 changed files with 527 additions and 20 deletions


@@ -3,6 +3,8 @@ docs_dir: src
 theme:
   name: "material"
+  features:
+    - content.code.copy
 plugins:
   - search
@@ -11,4 +13,16 @@ plugins:
 nav:
   - Home: index.md
+  - Basics: basic.md
+  - Embeddings: embedding.md
+  - Integrations: integrations.md
   - Python API: python.md
+
+markdown_extensions:
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences

docs/src/ann_indexes.md Normal file

@@ -0,0 +1,80 @@
# ANN (Approximate Nearest Neighbor) Indexes
You can create an index over your vector data to make search faster. Vector indexes are faster but less accurate than exhaustive search. LanceDB provides many parameters to fine-tune the index's size, the speed of queries, and the accuracy of results.
Currently, LanceDB does not automatically create the ANN index. In the future, we plan to improve this experience by automating index creation and configuration.
## Creating an ANN Index
Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) function.
```python
import lancedb
import numpy as np
uri = "~/.lancedb"
db = lancedb.connect(uri)
# Create 10,000 sample vectors
data = [{"vector": row, "item": f"item {i}"}
        for i, row in enumerate(np.random.random((10_000, 768)).astype('float32'))]
# Add the vectors to a table
tbl = db.create_table("my_vectors", data=data)
# Create and train the index - you need to have enough data in the table for an effective training step
tbl.create_index(num_partitions=256, num_sub_vectors=96)
```
Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
creation by providing the following parameters:
- **num_partitions** (default: 256): The number of partitions of the index. Configure the number of partitions so that each partition holds 3-5K vectors; for example, a table with ~1M vectors should use 256 partitions. You can specify an arbitrary number of partitions, but powers of 2 are most conventional (see the sketch after this list). A higher number leads to faster queries, but it makes index generation slower.
- **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes
search more accurate, but also makes the index larger and slower to build.
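As a rule of thumb, you can derive `num_partitions` from the table size. The helper below is a sketch of our own (not part of the LanceDB API), targeting ~4K vectors per partition as described above:

```python
import math

def suggested_num_partitions(num_rows: int, target_per_partition: int = 4000) -> int:
    # Hypothetical helper: nearest power of 2 yielding ~3-5K vectors per partition
    return 2 ** round(math.log2(max(1, num_rows / target_per_partition)))

# ~1M rows -> 256 partitions (about 3.9K vectors each)
print(suggested_num_partitions(1_000_000))  # 256
```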
## Querying an ANN Index
Querying vector indexes is done via the [search](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.search) function.
There are a couple of parameters that can be used to fine-tune the search:
- **limit** (default: 10): The number of results that will be returned.
- **nprobes** (default: 20): The number of partitions to probe. A higher number makes the search more accurate but also slower.
- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes the search more accurate but also slower.
```python
tbl.search(np.random.random((768))) \
    .limit(2) \
    .nprobes(20) \
    .refine_factor(20) \
    .to_df()
```
```
                                              vector       item       score
0  [0.44949695, 0.8444449, 0.06281311, 0.23338133...  item 1141  103.575333
1  [0.48587373, 0.269207, 0.15095535, 0.65531915,...  item 3953  108.393867
```
The search will return the data requested in addition to the score of each item. The score is the distance between the query vector and the element. A lower number means that the result is more relevant.
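To build intuition for the score, you can recompute the distance yourself. This sketch assumes the default L2 metric with squared distances (consistent with the magnitudes shown above); because the index uses PQ, the reported score is an approximation, so expect the two values to be close rather than identical:

```python
import numpy as np

query = np.random.random(768).astype("float32")
top = tbl.search(query).limit(1).to_df()

# Squared L2 distance between the query and the top result (assumption: default metric)
recomputed = float(np.sum((np.asarray(top["vector"][0]) - query) ** 2))
print(recomputed, float(top["score"][0]))  # close, but PQ-approximated
```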
### Filtering (where clause)
You can further filter the elements returned by a search using a where clause.
```python
tbl.search(np.random.random((768))).where("item != 'item 1141'").to_df()
```
### Projections (select clause)
You can select the columns returned by the query using a select clause.
```python
tbl.search(np.random.random((768))).select(["vector"]).to_df()
```
```
                                              vector      score
0  [0.30928212, 0.022668175, 0.1756372, 0.4911822...  93.971092
1  [0.2525465, 0.01723831, 0.261568, 0.002007689,...  95.173485
...
```
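These query-builder clauses compose, so a single search can combine limits, filters, probes, and projections. A brief sketch:

```python
tbl.search(np.random.random((768))) \
    .where("item != 'item 1141'") \
    .select(["vector", "item"]) \
    .nprobes(20) \
    .limit(5) \
    .to_df()
```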

docs/src/basic.md Normal file

@@ -0,0 +1,77 @@
# Basic LanceDB Functionality
## How to connect to a database
In local mode, LanceDB stores data in a directory on your local machine. To connect to a local database, you can use the following code:
```python
import lancedb
uri = "~/.lancedb"
db = lancedb.connect(uri)
```
LanceDB will create the directory if it doesn't exist (including parent directories).
If you need a reminder of the uri, use the `db.uri` property.
## How to create a table
To create a table, you can use the following code:
```python
tbl = db.create_table("my_table",
                      data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
                            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
```
Under the hood, LanceDB converts the input data into an Apache Arrow table
and persists it to disk in [Lance format](https://github.com/eto-ai/lance).
If the table already exists, LanceDB will raise an error by default.
If you want to overwrite the table, you can pass in `mode="overwrite"`
to the `create_table` method.
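For example (a sketch reusing the data from above):

```python
# Raises an exception, since "my_table" already exists:
# tbl = db.create_table("my_table", data=...)

# Replaces the existing table instead:
tbl = db.create_table("my_table",
                      data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
                            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}],
                      mode="overwrite")
```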
You can also pass in a pandas DataFrame directly:
```python
import pandas as pd
df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
                   {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
tbl = db.create_table("table_from_df", data=df)
```
## How to open an existing table
Once created, you can open a table using the following code:
```python
tbl = db.open_table("my_table")
```
If you forget the name of your table, you can always get a listing of all table names:
```python
db.table_names()
```
## How to add data to a table
After a table has been created, you can always add more data to it using:
```python
df = pd.DataFrame([{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
                   {"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}])
tbl.add(df)
```
## How to search for (approximate) nearest neighbors
Once you've embedded the query, you can find its nearest neighbors using the following code:
```python
tbl.search([100, 100]).limit(2).to_df()
```
This returns a pandas DataFrame with the results.
## What's next
This section covered the very basics of the LanceDB API.
LanceDB also supports creating ANN indexes to speed up search, as well as options for fine-tuning queries.
These are covered in the next sections of the documentation.

docs/src/embedding.md Normal file

@@ -0,0 +1,97 @@
# Embedding Functions
Embeddings are high-dimensional floating-point vector representations of your data or queries.
Anything can be embedded using some embedding model or function.
For a given embedding function, the output always has the same number of dimensions.
## Creating an embedding function
Any function that takes as input a batch (list) of data and outputs a batch (list) of embeddings
can be used by LanceDB as an embedding function. The input and output batch sizes should be the same.
### HuggingFace example
One popular free option would be to use the [sentence-transformers](https://www.sbert.net/) library from HuggingFace.
You can install this using pip: `pip install sentence-transformers`.
```python
from sentence_transformers import SentenceTransformer
name = "paraphrase-albert-small-v2"
model = SentenceTransformer(name)

# used for both ingestion and querying
def embed_func(batch):
    return [model.encode(sentence) for sentence in batch]
```
### OpenAI example
You can also use an external API like OpenAI to generate embeddings:
```python
import openai
import os

# Configure the OPENAI_API_KEY environment variable,
if "OPENAI_API_KEY" not in os.environ:
    # or set the key here as a variable
    openai.api_key = "sk-..."

# verify that the API key is working
assert len(openai.Model.list()["data"]) > 0

def embed_func(c):
    rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
    return [record["embedding"] for record in rs["data"]]
```
## Applying an embedding function
You can apply an embedding function to raw data to generate embeddings for each row.
Say you have a pandas DataFrame with a `text` column that you want embedded;
you can use the [with_embeddings](https://lancedb.github.io/lancedb/python/#lancedb.embeddings.with_embeddings)
function to generate embeddings and create a combined pyarrow table:
```python
import pandas as pd
from lancedb.embeddings import with_embeddings
df = pd.DataFrame([{"text": "pepperoni"},
                   {"text": "pineapple"}])
data = with_embeddings(embed_func, df)
# The output is used to create / append to a table
# db.create_table("my_table", data=data)
```
If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`.
By default, LanceDB calls the function with batches of 1000 rows. This can be configured
using the `batch_size` parameter to `with_embeddings`.
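For instance, a sketch assuming a hypothetical `body` column and a smaller batch size:

```python
df = pd.DataFrame([{"body": "pepperoni"},
                   {"body": "pineapple"}])
data = with_embeddings(embed_func, df, column="body", batch_size=500)
```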
LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI
API call is reliable.
## Searching with an embedding function
At query time, you need the same embedding function to embed your query text.
It's important to use the same model / function; otherwise, the embedding vectors won't
live in the same latent space and your results will be nonsensical.
```python
query = "What's the best pizza topping?"
query_vector = embed_func([query])[0]
tbl.search(query_vector).limit(10).to_df()
```
The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
## Roadmap
In the near future, we'll be integrating embedding functions more deeply into LanceDB.
The goal is that you just have to configure the function once when you create the table,
and then you'll never have to deal with embeddings / vectors after that unless you want to.
We'll also integrate more popular models and APIs.


@@ -33,7 +33,14 @@ table = db.create_table("my_table",
result = table.search([100, 100]).limit(2).to_df()
```
## Complete Demos
We will be adding complete demo apps built using LanceDB.
- [YouTube Transcript Search](../notebooks/youtube_transcript_search.ipynb)
## Documentation Quick Links
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with the Python data tooling ecosystem.
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.

docs/src/integrations.md Normal file

@@ -0,0 +1,111 @@
# Integrations
Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB.
## Pandas and PyArrow
First, we need to connect to a `LanceDB` database.
``` py
import lancedb
db = lancedb.connect("/tmp/lancedb")
```
And write a `Pandas DataFrame` to LanceDB directly.
```py
import pandas as pd
data = pd.DataFrame({
    "vector": [[3.1, 4.1], [5.9, 26.5]],
    "item": ["foo", "bar"],
    "price": [10.0, 20.0]
})
table = db.create_table("pd_table", data=data)
# Optionally, create an IVF_PQ index
table.create_index(num_partitions=256, num_sub_vectors=96)
```
You will find detailed instructions on creating datasets and indexes in the [Basic Operations](basic.md) and [ANN Indexes](ann_indexes.md)
sections.
We can now perform similarity searches via `LanceDB`.
```py
# Open the table previously created.
table = db.open_table("pd_table")
query_vector = [100, 100]
# Pandas DataFrame
df = table.search(query_vector).limit(1).to_df()
print(df)
```
```
        vector item  price        score
0  [5.9, 26.5]  bar   20.0  14257.05957
```
If you have a simple filter, it's faster to provide a where clause to `LanceDB`'s search query.
If you have more complex criteria, you can always apply the filter to the resulting pandas `DataFrame` from the search query.
```python
# Apply the filter via LanceDB
results = table.search([100, 100]).where("price < 15").to_df()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"

# Apply the filter via Pandas
df = table.search([100, 100]).to_df()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
```
## DuckDB
`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).
Let us start with installing `duckdb` and `lancedb`.
```shell
pip install duckdb lancedb
```
We will reuse the dataset created previously:
```python
import duckdb
import lancedb

db = lancedb.connect("/tmp/lancedb")
table = db.open_table("pd_table")
arrow_table = table.to_arrow()
```
`DuckDB` can directly query the `arrow_table`:
```python
In [15]: duckdb.query("SELECT * FROM arrow_table")
Out[15]:
┌─────────────┬─────────┬────────┐
│   vector    │  item   │ price  │
│   float[]   │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1]  │ foo     │   10.0 │
│ [5.9, 26.5] │ bar     │   20.0 │
└─────────────┴─────────┴────────┘

In [16]: duckdb.query("SELECT mean(price) FROM arrow_table")
Out[16]:
┌─────────────┐
│ mean(price) │
│   double    │
├─────────────┤
│        15.0 │
└─────────────┘
```
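Any SQL that DuckDB supports over Arrow works here, including filters and aggregations. A small sketch, reusing `arrow_table` from above:

```python
import duckdb

duckdb.query("SELECT item, price FROM arrow_table WHERE price < 15").to_df()
```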


@@ -10,3 +10,5 @@ pip install lancedb
 ::: lancedb.db
 ::: lancedb.table
 ::: lancedb.query
+::: lancedb.embeddings
+::: lancedb.context


@@ -55,7 +55,11 @@ class LanceDBConnection:
         return self.open_table(name)

     def create_table(
-        self, name: str, data: DATA = None, schema: pa.Schema = None
+        self,
+        name: str,
+        data: DATA = None,
+        schema: pa.Schema = None,
+        mode: str = "create",
     ) -> LanceTable:
         """Create a table in the database.

@@ -67,6 +71,10 @@ class LanceDBConnection:
             The data to insert into the table.
         schema: pyarrow.Schema; optional
             The schema of the table.
+        mode: str; default "create"
+            The mode to use when creating the table.
+            By default, if the table already exists, an exception is raised.
+            If you want to overwrite the table, use mode="overwrite".

         Note
         ----
@@ -78,7 +86,7 @@ class LanceDBConnection:
             A LanceTable object representing the table.
         """
         if data is not None:
-            tbl = LanceTable.create(self, name, data, schema)
+            tbl = LanceTable.create(self, name, data, schema, mode=mode)
         else:
             tbl = LanceTable(self, name)
         return tbl


@@ -46,15 +46,41 @@ class LanceTable:
     A table in a LanceDB database.
     """

-    def __init__(self, connection: "lancedb.db.LanceDBConnection", name: str):
+    def __init__(
+        self, connection: "lancedb.db.LanceDBConnection", name: str, version: int = None
+    ):
         self._conn = connection
         self.name = name
+        self._version = version
+
+    def _reset_dataset(self):
+        try:
+            del self.__dict__["_dataset"]
+        except AttributeError:
+            pass

     @property
     def schema(self) -> pa.Schema:
         """Return the schema of the table."""
         return self._dataset.schema

+    def list_versions(self):
+        """List all versions of the table"""
+        return self._dataset.versions()
+
+    @property
+    def version(self):
+        """Get the current version of the table"""
+        return self._dataset.version
+
+    def checkout(self, version: int):
+        """Checkout a version of the table"""
+        max_ver = max([v["version"] for v in self._dataset.versions()])
+        if version < 1 or version > max_ver:
+            raise ValueError(f"Invalid version {version}")
+        self._version = version
+        self._reset_dataset()
+
     def __len__(self):
         return self._dataset.count_rows()
@@ -92,16 +118,17 @@ class LanceTable:
             The number of PQ sub-vectors to use when creating the index.
             Default is 96.
         """
-        return self._dataset.create_index(
+        self._dataset.create_index(
             column=VECTOR_COLUMN_NAME,
             index_type="IVF_PQ",
             num_partitions=num_partitions,
             num_sub_vectors=num_sub_vectors,
         )
+        self._reset_dataset()

     @cached_property
     def _dataset(self) -> LanceDataset:
-        return lance.dataset(self._dataset_uri)
+        return lance.dataset(self._dataset_uri, version=self._version)

     def to_lance(self) -> LanceDataset:
         """Return the LanceDataset backing this table."""
@@ -123,8 +150,9 @@ class LanceTable:
             The number of vectors added to the table.
         """
         data = _sanitize_data(data, self.schema)
-        ds = lance.write_dataset(data, self._dataset_uri, mode=mode)
-        return ds.count_rows()
+        lance.write_dataset(data, self._dataset_uri, mode=mode)
+        self._reset_dataset()
+        return len(self)

     def search(self, query: VEC) -> LanceQueryBuilder:
         """Create a search query to find the nearest neighbors
@@ -148,10 +176,10 @@ class LanceTable:
         return LanceQueryBuilder(self, query)

     @classmethod
-    def create(cls, db, name, data, schema=None):
+    def create(cls, db, name, data, schema=None, mode="create"):
         tbl = LanceTable(db, name)
         data = _sanitize_data(data, schema)
-        lance.write_dataset(data, tbl._dataset_uri, mode="create")
+        lance.write_dataset(data, tbl._dataset_uri, mode=mode)
         return tbl


@@ -1,7 +1,7 @@
 [project]
 name = "lancedb"
-version = "0.0.4"
-dependencies = ["pylance", "ratelimiter", "retry", "tqdm"]
+version = "0.1"
+dependencies = ["pylance>=0.4.3", "ratelimiter", "retry", "tqdm"]
 description = "lancedb"
 authors = [
     { name = "Lance Devs", email = "dev@eto.ai" },


@@ -12,6 +12,8 @@
 # limitations under the License.

 import lancedb
+import pandas as pd
+import pytest


 def test_basic(tmp_path):
@@ -40,3 +42,57 @@ def test_basic(tmp_path):
     assert len(db) == 1
     assert db.open_table("test").name == db["test"].name
+
+
+def test_ingest_pd(tmp_path):
+    db = lancedb.connect(tmp_path)
+    assert db.uri == str(tmp_path)
+    assert db.table_names() == []
+
+    data = pd.DataFrame(
+        {
+            "vector": [[3.1, 4.1], [5.9, 26.5]],
+            "item": ["foo", "bar"],
+            "price": [10.0, 20.0],
+        }
+    )
+    table = db.create_table("test", data=data)
+    rs = table.search([100, 100]).limit(1).to_df()
+    assert len(rs) == 1
+    assert rs["item"].iloc[0] == "bar"
+
+    rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
+    assert len(rs) == 1
+    assert rs["item"].iloc[0] == "foo"
+
+    assert db.table_names() == ["test"]
+    assert "test" in db
+    assert len(db) == 1
+    assert db.open_table("test").name == db["test"].name
+
+
+def test_create_mode(tmp_path):
+    db = lancedb.connect(tmp_path)
+    data = pd.DataFrame(
+        {
+            "vector": [[3.1, 4.1], [5.9, 26.5]],
+            "item": ["foo", "bar"],
+            "price": [10.0, 20.0],
+        }
+    )
+    db.create_table("test", data=data)
+
+    with pytest.raises(Exception):
+        db.create_table("test", data=data)
+
+    new_data = pd.DataFrame(
+        {
+            "vector": [[3.1, 4.1], [5.9, 26.5]],
+            "item": ["fizz", "buzz"],
+            "price": [10.0, 20.0],
+        }
+    )
+    tbl = db.create_table("test", data=new_data, mode="overwrite")
+    assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]


@@ -99,14 +99,41 @@ def test_add(db):
     expected = pa.Table.from_arrays(
         [
-            pa.FixedSizeListArray.from_arrays(pa.array([3.1, 4.1, 5.9, 26.5]), 2),
-            pa.array(["foo", "bar"]),
-            pa.array([10.0, 20.0]),
+            pa.FixedSizeListArray.from_arrays(
+                pa.array([3.1, 4.1, 5.9, 26.5, 6.3, 100.5]), 2
+            ),
+            pa.array(["foo", "bar", "new"]),
+            pa.array([10.0, 20.0, 30.0]),
         ],
-        schema=pa.schema([
-            pa.field("vector", pa.list_(pa.float32(), 2)),
-            pa.field("item", pa.string()),
-            pa.field("price", pa.float64()),
-        ]),
+        schema=pa.schema(
+            [
+                pa.field("vector", pa.list_(pa.float32(), 2)),
+                pa.field("item", pa.string()),
+                pa.field("price", pa.float64()),
+            ]
+        ),
     )
     assert expected == table.to_arrow()
+
+
+def test_versioning(db):
+    table = LanceTable.create(
+        db,
+        "test",
+        data=[
+            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
+            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
+        ],
+    )
+
+    assert len(table.list_versions()) == 1
+    assert table.version == 1
+
+    table.add([{"vector": [6.3, 100.5], "item": "new", "price": 30.0}])
+    assert len(table.list_versions()) == 2
+    assert table.version == 2
+    assert len(table) == 3
+
+    table.checkout(1)
+    assert table.version == 1
+    assert len(table) == 2