Compare commits

..

82 Commits

Author SHA1 Message Date
Chang She
59014a01e0 bump version for v0.1.2 2023-05-05 11:27:09 -07:00
Chang She
47ae17ea05 Merge pull request #58 from lancedb/changhiskhan/parse-schema
Add method to get the URI scheme to support cloud storage
2023-05-04 14:36:45 -07:00
Chang She
b6739f3f66 windows paths 2023-05-04 11:41:05 -07:00
Chang She
3a2df0ce45 Add method to get the URI scheme to support cloud storage 2023-05-04 09:47:03 -07:00
Chang She
c0bc65cdfa Merge pull request #55 from lancedb/jaichopra/update-tagline
update tagline
2023-05-03 21:06:41 -07:00
Jai Chopra
298b81f0b0 update tagline 2023-05-03 19:55:10 -07:00
Jai
fe7a3ccd60 Merge pull request #53 from lancedb/jaichopra/update-major-features-readme
also update docs index
2023-05-03 07:51:54 -07:00
Jai Chopra
baf8d7c1a1 also update docs index 2023-05-03 07:50:44 -07:00
Chang She
2021e1bf6d Merge pull request #52 from lancedb/jaichopra/update-major-features-readme 2023-05-03 07:36:09 -07:00
Jai Chopra
2dbe71cf88 add new feature to readme.md 2023-05-03 07:30:46 -07:00
Lei Xu
afe19ade7f Merge pull request #49 from lancedb/lei/rust_core
Rust core directory
2023-04-27 10:40:21 -07:00
Lei Xu
118efdce73 add cargo metadata 2023-04-27 10:36:01 -07:00
Lei Xu
b0426387e7 initialize the rust core 2023-04-27 10:31:50 -07:00
Chang She
afa7fe19e6 bump version for v0.1.1 2023-04-26 16:55:25 -07:00
Chang She
66080d791b Merge pull request #46 from lancedb/changhiskhan/improve-index-docs 2023-04-25 21:13:51 -07:00
Chang She
5554fddd54 Merge branch 'main' into changhiskhan/improve-index-docs 2023-04-25 21:04:01 -07:00
Chang She
f06ea935fe Merge pull request #47 from lancedb/changhiskhan/expose-metric
Make distance metric configurable in LanceDB
2023-04-25 21:02:59 -07:00
Chang She
a8db7f56d2 tolerance 2023-04-25 20:08:18 -07:00
Chang She
7a375185a1 increment lance version to include cosine distance fix 2023-04-25 19:57:58 -07:00
Chang She
6592b4c13b document metric in create_index 2023-04-24 22:46:21 -07:00
Chang She
72a44eb927 specify metric during index creation 2023-04-24 22:45:37 -07:00
Chang She
b0e578c609 add documentation for metric 2023-04-24 22:42:30 -07:00
Chang She
89e6232aeb Make distance metric configurable during search 2023-04-24 22:40:40 -07:00
Chang She
44ea687984 Merge pull request #45 from lancedb/changhiskhan/notebook-fix
Minor notebook fix. Closes #40
2023-04-24 20:12:03 -07:00
Chang She
4f2dae8a0d Add more detailed docs for the ANN index and search features 2023-04-24 19:19:55 -07:00
Chang She
5e748e6e70 Minor notebook fix. Closes #40 2023-04-24 18:46:05 -07:00
Chang She
177192f852 Merge pull request #37 from lancedb/gsilvestrin/ratelimit_3.11
skipping embeddings rate limit when python version > 3.10
2023-04-22 21:03:18 -07:00
Lei Xu
1fb596942f Merge pull request #39 from wilhelmjung/patch-1
Update index.md
2023-04-22 20:32:36 -07:00
YangWeiliang_DeepNova@Deepexi
73d3cb78e6 Update index.md
Just a typo. Fixed.
2023-04-23 09:52:23 +08:00
gsilvestrin
a1583444ec add ann_index to main doc page 2023-04-20 16:07:25 -07:00
gsilvestrin
78e4f4d1a8 add ann_index to main doc page 2023-04-20 13:19:10 -07:00
gsilvestrin
b92eb988b6 add ann_index to main doc page 2023-04-20 11:51:42 -07:00
gsilvestrin
0cd092814d skipping rate limit when python version > 3.10 2023-04-20 10:28:14 -07:00
Jai
a6294925df Update README.md 2023-04-20 10:22:03 -07:00
Chang She
342b726ed7 bump version for v0.1 2023-04-19 23:26:46 -07:00
Chang She
159b175316 Merge pull request #34 from lancedb/changhiskhan/overwrite-table
Add mode to overwrite table if already exists
2023-04-19 21:11:56 -07:00
Chang She
7876156d54 Merge pull request #36 from lancedb/gsilvestrin/ann_docs
[DOC] ann indexes documentation
2023-04-19 21:11:26 -07:00
Chang She
d64e85e9d7 Merge pull request #35 from lancedb/changhiskhan/table-versioning
expose methods to work with versioning in tables
2023-04-19 21:09:32 -07:00
gsilvestrin
3e79b4d9cb review comments 2023-04-19 20:33:23 -07:00
gsilvestrin
3eac75e61a review comments 2023-04-19 20:23:18 -07:00
gsilvestrin
b19ce10184 review comments 2023-04-19 19:28:40 -07:00
gsilvestrin
ce34d055af search docs 2023-04-19 19:26:27 -07:00
gsilvestrin
8bf4d169e2 wip ann indexes doc 2023-04-19 17:23:31 -07:00
gsilvestrin
4f7f33f7b7 wip ann indexes doc 2023-04-19 17:23:06 -07:00
gsilvestrin
32b21e1d20 wip ann indexes doc 2023-04-19 17:20:58 -07:00
Chang She
6062bfdb8f fix docs link 2023-04-19 16:59:30 -07:00
Chang She
93a5c5c15c Merge pull request #33 from lancedb/changhiskhan/doc-embedding-function
[DOC] embedding function documentation
2023-04-19 16:55:22 -07:00
Chang She
99310e099e expose methods to work with versioning in tables 2023-04-19 16:48:06 -07:00
Chang She
85dda53779 review comments 2023-04-19 16:35:48 -07:00
Chang She
d7c5793803 Add mode to overwrite table if already exists 2023-04-19 16:22:11 -07:00
Chang She
08e67d04bb [DOC] embedding function documentation 2023-04-19 15:54:32 -07:00
Lei Xu
ec197b1855 Merge pull request #31 from lancedb/lei/doc
[Doc] Pandas, Parrow, DuckDB integration
2023-04-19 14:55:42 -07:00
Lei Xu
23d4e3561f add link to basic and indexing 2023-04-19 14:53:45 -07:00
Lei Xu
de6bfab124 comments 2023-04-19 14:42:43 -07:00
Chang She
d7fb2b1d6b Merge pull request #32 from lancedb/changhiskhan/doc-basic-ops
[DOC] basic operations
2023-04-19 14:39:40 -07:00
Chang She
cdb534076f [DOC] basic operations 2023-04-19 14:29:03 -07:00
Lei Xu
c38d80cab2 remove print 2023-04-19 14:26:07 -07:00
Lei Xu
45e02bb62b no polars for now 2023-04-19 14:24:58 -07:00
Lei Xu
b3fdabdf45 use python and arrow 2023-04-19 14:15:18 -07:00
Chang She
1c3f9f1e3b Merge pull request #26 from lancedb/changhiskhan/invalidate-dataset
invalidate cached dataset after create_index and add
2023-04-19 14:02:29 -07:00
Chang She
f0ea1d898b invalidate cached dataset after create_index and add 2023-04-18 16:51:26 -07:00
Chang She
3ba7fa15a4 bump version for v0.0.4 2023-04-18 09:20:53 -07:00
Chang She
370867836c Merge pull request #25 from lancedb/gsilvestrin-patch-1
Update README.md
2023-04-18 09:19:40 -07:00
gsilvestrin
682f09480c Update README.md 2023-04-17 12:30:04 -07:00
gsilvestrin
cd8807bc97 bugfix for LanceTable.add to convert python lists 2023-04-17 08:48:56 -07:00
gsilvestrin
41c44ae92e Update README.md 2023-04-14 16:57:51 -07:00
gsilvestrin
6865d66d37 renaming test case 2023-04-14 16:32:31 -07:00
gsilvestrin
aeecd809cc bugfix for LanceTable.add to convert python lists into arrow fixed size lists
- Fixed `add` unit test to create the correct expected result
- Added a unit test for LanceTable.add
- Need to discuss if len(LanceTable) is handled correctly
2023-04-14 14:13:01 -07:00
Chang She
3360678d60 Merge pull request #19 from lancedb/jaichopra/notebook-imp-2
add more explanations to the notebook
2023-03-31 14:01:11 -07:00
Jai Chopra
177eddfc20 add more explanations to the notebook 2023-03-31 13:05:24 -07:00
Chang She
d735a69b6e Merge pull request #18 from lancedb/changhiskhan/notebook-updates 2023-03-30 20:16:13 -07:00
Chang She
a2bd2854e1 update tutorial notebook based on feedback 2023-03-30 19:44:39 -07:00
Chang She
c32b6880e7 bump version for v0.0.3 2023-03-30 19:19:29 -07:00
Chang She
c6fe5e38f1 Merge pull request #16 from lancedb/jaichopra/readme-lance-benchmarking
add Lance benchmarking blog
2023-03-30 19:04:44 -07:00
Jai
1c8b52f07b add Lance benchmarking blog 2023-03-29 11:44:36 -07:00
Chang She
f544c5dd31 Merge pull request #14 from lancedb/changhiskhan/updates
update for release
2023-03-24 19:50:35 -07:00
Chang She
eba533da4f fix 3.11 2023-03-24 19:45:46 -07:00
Chang She
404211d4fb fix 3.11 2023-03-24 19:00:22 -07:00
Chang She
5d7832c8a5 update for release 2023-03-24 18:16:29 -07:00
Jai
4eba83fdc9 Merge pull request #13 from lancedb/jaichopra/add-tutorials-readme
Update README.md
2023-03-24 12:17:32 -07:00
Jai
6906a5f912 Update README.md 2023-03-24 12:16:44 -07:00
Jai
69fd80e9f2 Update README.md 2023-03-24 12:09:16 -07:00
25 changed files with 1189 additions and 117 deletions

2
.gitignore vendored
View File

@@ -15,3 +15,5 @@ python/build
python/dist
notebooks/.ipynb_checkpoints
**/.hypothesis

View File

@@ -3,12 +3,12 @@
<img width="275" alt="LanceDB Logo" src="https://user-images.githubusercontent.com/917119/226205734-6063d87a-1ecc-45fe-85be-1dea6383a3d8.png">
**Serverless, low-latency vector database for AI applications**
**Developer-friendly, serverless vector database for AI applications**
<a href="https://lancedb.github.io/lancedb/">Documentation</a>
<a href="https://blog.eto.ai/">Blog</a>
<a href="https://discord.gg/zMM32dvNtd">Discord</a>
<a href="https://twitter.com/etodotai">Twitter</a>
<a href="https://twitter.com/lancedb">Twitter</a>
</p>
</div>
@@ -21,6 +21,10 @@ The key features of LanceDB include:
* Production-scale vector search with no servers to manage.
* Optimized for multi-modal data (text, images, videos, point clouds and more).
* Native Python and Javascript/Typescript support (coming soon).
* Combine attribute-based information with vectors and store them as a single source-of-truth.
* Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure.
@@ -41,9 +45,14 @@ pip install lancedb
```python
import lancedb
uri = "/tmp/lancedb"
db = lancedb.connect(uri)
table = db.create_table("my_table",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
result = table.search([100, 100]).limit(2).to_df()
```
## Blogs, Tutorials & Videos
* 📈 <a href="https://blog.eto.ai/benchmarking-random-access-in-lance-ed690757a826">2000x better performance with Lance over Parquet</a>
* 🤖 <a href="https://github.com/lancedb/lancedb/blob/main/notebooks/youtube_transcript_search.ipynb">Build a question and answer bot with LanceDB</a>

View File

@@ -3,6 +3,8 @@ docs_dir: src
theme:
name: "material"
features:
- content.code.copy
plugins:
- search
@@ -11,4 +13,17 @@ plugins:
nav:
- Home: index.md
- Basics: basic.md
- Embeddings: embedding.md
- Indexing: ann_indexes.md
- Integrations: integrations.md
- Python API: python.md
markdown_extensions:
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets
- pymdownx.superfences

95
docs/src/ann_indexes.md Normal file
View File

@@ -0,0 +1,95 @@
# ANN (Approximate Nearest Neighbor) Indexes
You can create an index over your vector data to make search faster.
Vector indexes are faster but less accurate than exhaustive search.
LanceDB provides many parameters to fine-tune the index's size, the speed of queries, and the accuracy of results.
Currently, LanceDB does *not* automatically create the ANN index.
LanceDB has optimized code for KNN as well. For many use-cases, datasets under 100K vectors won't require index creation at all.
If you can live with <100ms latency, skipping index creation is a simpler workflow while guaranteeing 100% recall.
In the future we will look to automatically create and configure the ANN index.
## Creating an ANN Index
Creating indexes is done via the [create_index](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.create_index) method.
```python
import lancedb
import numpy as np
uri = "~/.lancedb"
db = lancedb.connect(uri)
# Create 10,000 sample vectors
data = [{"vector": row, "item": f"item {i}"}
for i, row in enumerate(np.random.random((10_000, 768)).astype('float32'))]
# Add the vectors to a table
tbl = db.create_table("my_vectors", data=data)
# Create and train the index - you need to have enough data in the table for an effective training step
tbl.create_index(num_partitions=256, num_sub_vectors=96)
```
Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
creation by providing the following parameters:
- **metric** (default: "L2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance.
- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table
with ~1M vectors should use 256 partitions. You can specify an arbitrary number of partitions, but powers of 2 are most conventional.
A higher number leads to faster queries, but it makes index generation slower.
- **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes
search more accurate, but also makes the index larger and slower to build.
## Querying an ANN Index
Querying vector indexes is done via the [search](https://lancedb.github.io/lancedb/python/#lancedb.table.LanceTable.search) function.
There are a couple of parameters that can be used to fine-tune the search:
- **limit** (default: 10): The number of results that will be returned.
- **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower.<br/>
Most of the time, setting nprobes to cover 5-10% of the dataset should achieve high recall with low latency.<br/>
e.g., for 1M vectors divided up into 256 partitions, nprobes should be set to ~20-40.<br/>
Note: nprobes is only applicable if an ANN index is present. If specified on a table without an ANN index, it is ignored.
- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory.<br/>
A higher number makes search more accurate but also slower. If you find the recall is less than ideal, try refine_factor=10 to start.<br/>
e.g., for 1M vectors divided into 256 partitions, if you're looking for top 20, then refine_factor=200 reranks the whole partition.<br/>
Note: refine_factor is only applicable if an ANN index is present. If specified on a table without an ANN index, it is ignored.
```python
tbl.search(np.random.random((768))) \
.limit(2) \
.nprobes(20) \
.refine_factor(10) \
.to_df()
vector item score
0 [0.44949695, 0.8444449, 0.06281311, 0.23338133... item 1141 103.575333
1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867
```
The search will return the data requested in addition to the score of each item.
**Note:** The score is the distance between the query vector and the element. A lower number means that the result is more relevant.
### Filtering (where clause)
You can further filter the elements returned by a search using a where clause.
```python
tbl.search(np.random.random((768))).where("item != 'item 1141'").to_df()
```
### Projections (select clause)
You can select the columns returned by the query using a select clause.
```python
tbl.search(np.random.random((768))).select(["vector"]).to_df()
vector score
0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092
1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485
...
```

77
docs/src/basic.md Normal file
View File

@@ -0,0 +1,77 @@
# Basic LanceDB Functionality
## How to connect to a database
In local mode, LanceDB stores data in a directory on your local machine. To connect to a local database, you can use the following code:
```python
import lancedb
uri = "~/.lancedb"
db = lancedb.connect(uri)
```
LanceDB will create the directory if it doesn't exist (including parent directories).
If you need a reminder of the uri, use the `db.uri` property.
## How to create a table
To create a table, you can use the following code:
```python
tbl = db.create_table("my_table",
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
```
Under the hood, LanceDB is converting the input data into an Apache Arrow table
and persisting it to disk in [Lance format](https://github.com/eto-ai/lance).
If the table already exists, LanceDB will raise an error by default.
If you want to overwrite the table, you can pass in `mode="overwrite"`
to the `create_table` method.
You can also pass in a pandas DataFrame directly:
```python
import pandas as pd
df = pd.DataFrame([{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
tbl = db.create_table("table_from_df", data=df)
```
## How to open an existing table
Once created, you can open a table using the following code:
```python
tbl = db.open_table("my_table")
```
If you forget the name of your table, you can always get a listing of all table names:
```python
db.table_names()
```
## How to add data to a table
After a table has been created, you can always add more data to it using
```python
df = pd.DataFrame([{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}])
tbl.add(df)
```
## How to search for (approximate) nearest neighbors
Once you've embedded the query, you can find its nearest neighbors using the following code:
```python
tbl.search([100, 100]).limit(2).to_df()
```
This returns a pandas DataFrame with the results.
## What's next
This section covered the very basics of the LanceDB API.
LanceDB supports many additional options for creating indices to speed up search, as well as options for the search itself.
These are contained in the next section of the documentation.

97
docs/src/embedding.md Normal file
View File

@@ -0,0 +1,97 @@
# Embedding Functions
Embeddings are high dimensional floating-point vector representations of your data or query.
Anything can be embedded using some embedding model or function.
For a given embedding function, the output will always have the same number of dimensions.
## Creating an embedding function
Any function that takes as input a batch (list) of data and outputs a batch (list) of embeddings
can be used by LanceDB as an embedding function. The input and output batch sizes should be the same.
### HuggingFace example
One popular free option would be to use the [sentence-transformers](https://www.sbert.net/) library from HuggingFace.
You can install this using pip: `pip install sentence-transformers`.
```python
from sentence_transformers import SentenceTransformer
name="paraphrase-albert-small-v2"
model = SentenceTransformer(name)
# used for both training and querying
def embed_func(batch):
return [model.encode(sentence) for sentence in batch]
```
### OpenAI example
You can also use an external API like OpenAI to generate embeddings
```python
import openai
import os
# Configuring the environment variable OPENAI_API_KEY
if "OPENAI_API_KEY" not in os.environ:
# OR set the key here as a variable
openai.api_key = "sk-..."
# verify that the API key is working
assert len(openai.Model.list()["data"]) > 0
def embed_func(c):
rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
return [record["embedding"] for record in rs["data"]]
```
## Applying an embedding function
Using an embedding function, you can apply it to raw data
to generate embeddings for each row.
For example, if you have a pandas DataFrame with a `text` column that you want to embed,
you can use the [with_embeddings](https://lancedb.github.io/lancedb/python/#lancedb.embeddings.with_embeddings)
function to generate embeddings and create a combined pyarrow table:
```python
import pandas as pd
from lancedb.embeddings import with_embeddings
df = pd.DataFrame([{"text": "pepperoni"},
{"text": "pineapple"}])
data = with_embeddings(embed_func, df)
# The output is used to create / append to a table
# db.create_table("my_table", data=data)
```
If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`.
By default, LanceDB calls the function with batches of 1000 rows. This can be configured
using the `batch_size` parameter to `with_embeddings`.
LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI
API call is reliable.
## Searching with an embedding function
At inference time, you also need the same embedding function to embed your query text.
It's important that you use the same model / function otherwise the embedding vectors don't
belong in the same latent space and your results will be nonsensical.
```python
query = "What's the best pizza topping?"
query_vector = embed_func([query])[0]
tbl.search(query_vector).limit(10).to_df()
```
The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
## Roadmap
In the near future, we'll be integrating the embedding functions deeper into LanceDB.<br/>
The goal is that you just have to configure the function once when you create the table,
and then you'll never have to deal with embeddings / vectors after that unless you want to.
We'll also integrate more popular models and APIs.

View File

@@ -1,11 +1,15 @@
# Welcome to LanceDB's Documentation
LanceDB is an open-source database for vector-search built with persistent storage, which greatly simplifies retrevial, filtering and management of embeddings.
LanceDB is an open-source database for vector-search built with persistent storage, which greatly simplifies retrieval, filtering and management of embeddings.
The key features of LanceDB include:
* Production-scale vector search with no servers to manage.
* Optimized for multi-modal data (text, images, videos, point clouds and more).
* Native Python and Javascript/Typescript support (coming soon).
* Combine attribute-based information with vectors and store them as a single source-of-truth.
* Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure.
@@ -33,7 +37,15 @@ table = db.create_table("my_table",
result = table.search([100, 100]).limit(2).to_df()
```
## Complete Demos
We will be adding completed demo apps built using LanceDB.
- [YouTube Transcript Search](../notebooks/youtube_transcript_search.ipynb)
## Documentation Quick Links
* [`Basic Operations`](basic.md) - basic functionality of LanceDB.
* [`Embedding Functions`](embedding.md) - functions for working with embeddings.
* [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
* [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
* [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.

111
docs/src/integrations.md Normal file
View File

@@ -0,0 +1,111 @@
# Integrations
Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB.
## Pandas and PyArrow
First, we need to connect to a `LanceDB` database.
``` py
import lancedb
db = lancedb.connect("/tmp/lancedb")
```
And write a `Pandas DataFrame` to LanceDB directly.
```py
import pandas as pd
data = pd.DataFrame({
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0]
})
table = db.create_table("pd_table", data=data)
# Optionally, create a IVF_PQ index
table.create_index(num_partitions=256, num_sub_vectors=96)
```
You will find detailed instructions for creating a dataset and an index in the [Basic Operations](basic.md) and [Indexing](ann_indexes.md)
sections.
We can now perform similarity searches via `LanceDB`.
```py
# Open the table previously created.
table = db.open_table("pd_table")
query_vector = [100, 100]
# Pandas DataFrame
df = table.search(query_vector).limit(1).to_df()
print(df)
```
```
vector item price score
0 [5.9, 26.5] bar 20.0 14257.05957
```
If you have a simple filter, it's faster to provide a where clause to `LanceDB`'s search query.
If you have more complex criteria, you can always apply the filter to the resulting pandas `DataFrame` from the search query.
```python
# Apply the filter via LanceDB
results = table.search([100, 100]).where("price < 15").to_df()
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
# Apply the filter via Pandas
df = results = table.search([100, 100]).to_df()
results = df[df.price < 15]
assert len(results) == 1
assert results["item"].iloc[0] == "foo"
```
## DuckDB
`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).
Let us start with installing `duckdb` and `lancedb`.
```shell
pip install duckdb lancedb
```
We will re-use the dataset created previously
```python
import lancedb
db = lancedb.connect("/tmp/lancedb")
table = db.open_table("pd_table")
arrow_table = table.to_arrow()
```
`DuckDB` can directly query the `arrow_table`:
```python
In [15]: duckdb.query("SELECT * FROM t")
Out[15]:
┌─────────────┬─────────┬────────┐
│ vector │ item │ price │
│ float[] │ varchar │ double │
├─────────────┼─────────┼────────┤
│ [3.1, 4.1] │ foo │ 10.0 │
│ [5.9, 26.5] │ bar │ 20.0 │
└─────────────┴─────────┴────────┘
In [16]: duckdb.query("SELECT mean(price) FROM t")
Out[16]:
┌─────────────┐
│ mean(price) │
│ double │
├─────────────┤
│ 15.0 │
└─────────────┘
```

View File

@@ -10,3 +10,5 @@ pip install lancedb
::: lancedb.db
::: lancedb.table
::: lancedb.query
::: lancedb.embeddings
::: lancedb.context

File diff suppressed because one or more lines are too long

View File

@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .db import LanceDBConnection, URI
from .db import URI, LanceDBConnection
def connect(uri: URI) -> LanceDBConnection:

View File

@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Union, List
from typing import List, Union
import numpy as np
import pandas as pd

View File

@@ -14,10 +14,12 @@
from __future__ import annotations
from pathlib import Path
import pyarrow as pa
from .common import URI, DATA
from .common import DATA, URI
from .table import LanceTable
from .util import get_uri_scheme
class LanceDBConnection:
@@ -26,10 +28,12 @@ class LanceDBConnection:
"""
def __init__(self, uri: URI):
if isinstance(uri, str):
uri = Path(uri)
uri = uri.expanduser().absolute()
Path(uri).mkdir(parents=True, exist_ok=True)
is_local = isinstance(uri, Path) or get_uri_scheme(uri) == "file"
if is_local:
if isinstance(uri, str):
uri = Path(uri)
uri = uri.expanduser().absolute()
Path(uri).mkdir(parents=True, exist_ok=True)
self._uri = str(uri)
@property
@@ -43,7 +47,11 @@ class LanceDBConnection:
-------
A list of table names.
"""
return [p.stem for p in Path(self.uri).glob("*.lance")]
if get_uri_scheme(self.uri) == "file":
return [p.stem for p in Path(self.uri).glob("*.lance")]
raise NotImplementedError(
"List table_names is only supported for local filesystem for now"
)
def __len__(self) -> int:
return len(self.table_names())
@@ -55,7 +63,11 @@ class LanceDBConnection:
return self.open_table(name)
def create_table(
self, name: str, data: DATA = None, schema: pa.Schema = None
self,
name: str,
data: DATA = None,
schema: pa.Schema = None,
mode: str = "create",
) -> LanceTable:
"""Create a table in the database.
@@ -67,6 +79,10 @@ class LanceDBConnection:
The data to insert into the table.
schema: pyarrow.Schema; optional
The schema of the table.
mode: str; default "create"
The mode to use when creating the table.
By default, if the table already exists, an exception is raised.
If you want to overwrite the table, use mode="overwrite".
Note
----
@@ -78,7 +94,7 @@ class LanceDBConnection:
A LanceTable object representing the table.
"""
if data is not None:
tbl = LanceTable.create(self, name, data, schema)
tbl = LanceTable.create(self, name, data, schema, mode=mode)
else:
tbl = LanceTable(self, name)
return tbl

View File

@@ -12,14 +12,14 @@
# limitations under the License.
import math
import ratelimiter
from retry import retry
import sys
from typing import Callable, Union
from lance.vector import vec_to_table
import numpy as np
import pandas as pd
import pyarrow as pa
from lance.vector import vec_to_table
from retry import retry
def with_embeddings(
@@ -32,11 +32,12 @@ def with_embeddings(
):
func = EmbeddingFunction(func)
if wrap_api:
func = func.retry().rate_limit().batch_size(batch_size)
func = func.retry().rate_limit()
func = func.batch_size(batch_size)
if show_progress:
func = func.show_progress()
if isinstance(data, pd.DataFrame):
data = pa.Table.from_pandas(data)
data = pa.Table.from_pandas(data, preserve_index=False)
embeddings = func(data[column].to_numpy())
table = vec_to_table(np.array(embeddings))
return data.append_column("vector", table["vector"])
@@ -52,17 +53,33 @@ class EmbeddingFunction:
def __call__(self, text):
# Get the embedding with retry
@retry(**self.retry_kwargs)
def embed_func(c):
return self.func(c.tolist())
if len(self.retry_kwargs) > 0:
max_calls = self.rate_limiter_kwargs["max_calls"]
limiter = ratelimiter.RateLimiter(
max_calls, period=self.rate_limiter_kwargs["period"]
)
rate_limited = limiter(embed_func)
@retry(**self.retry_kwargs)
def embed_func(c):
return self.func(c.tolist())
else:
def embed_func(c):
return self.func(c.tolist())
if len(self.rate_limiter_kwargs) > 0:
v = int(sys.version_info.minor)
if v >= 11:
print(
"WARNING: rate limit only support up to 3.10, proceeding without rate limiter"
)
else:
import ratelimiter
max_calls = self.rate_limiter_kwargs["max_calls"]
limiter = ratelimiter.RateLimiter(
max_calls, period=self.rate_limiter_kwargs["period"]
)
embed_func = limiter(embed_func)
batches = self.to_batches(text)
embeds = [emb for c in batches for emb in rate_limited(c)]
embeds = [emb for c in batches for emb in embed_func(c)]
return embeds
def __repr__(self):
@@ -102,4 +119,4 @@ class EmbeddingFunction:
yield from tqdm(_chunker(arr), total=math.ceil(length / self._batch_size))
else:
return _chunker(arr)
yield from _chunker(arr)

View File

@@ -24,6 +24,7 @@ class LanceQueryBuilder:
"""
def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray):
self._metric = "L2"
self._nprobes = 20
self._refine_factor = None
self._table = table
@@ -77,6 +78,21 @@ class LanceQueryBuilder:
self._where = where
return self
def metric(self, metric: str) -> LanceQueryBuilder:
"""Set the distance metric to use.
Parameters
----------
metric: str
The distance metric to use. By default "l2" is used.
Returns
-------
The LanceQueryBuilder object.
"""
self._metric = metric
return self
def nprobes(self, nprobes: int) -> LanceQueryBuilder:
"""Set the number of probes to use.
@@ -108,7 +124,12 @@ class LanceQueryBuilder:
return self
def to_df(self) -> pd.DataFrame:
"""Execute the query and return the results as a pandas DataFrame."""
"""
Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector
and also the "score" column which is the distance between the query
vector and the returned vector.
"""
ds = self._table.to_lance()
# TODO indexed search
tbl = ds.to_table(
@@ -118,6 +139,7 @@ class LanceQueryBuilder:
"column": VECTOR_COLUMN_NAME,
"q": self._query,
"k": self._limit,
"metric": self._metric,
"nprobes": self._nprobes,
"refine_factor": self._refine_factor,
},

View File

@@ -19,12 +19,12 @@ from functools import cached_property
import lance
import numpy as np
import pandas as pd
from lance import LanceDataset
import pyarrow as pa
from lance import LanceDataset
from lance.vector import vec_to_table
from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .query import LanceQueryBuilder
from .common import DATA, VECTOR_COLUMN_NAME, VEC
def _sanitize_data(data, schema):
@@ -46,24 +46,74 @@ class LanceTable:
A table in a LanceDB database.
"""
def __init__(self, connection: "lancedb.db.LanceDBConnection", name: str):
def __init__(
self, connection: "lancedb.db.LanceDBConnection", name: str, version: int = None
):
self._conn = connection
self.name = name
self._version = version
def _reset_dataset(self):
try:
del self.__dict__["_dataset"]
except AttributeError:
pass
@property
def schema(self) -> pa.Schema:
"""Return the schema of the table."""
return self._dataset.schema
def list_versions(self):
"""List all versions of the table"""
return self._dataset.versions()
@property
def version(self):
"""Get the current version of the table"""
return self._dataset.version
def checkout(self, version: int):
"""Checkout a version of the table"""
max_ver = max([v["version"] for v in self._dataset.versions()])
if version < 1 or version > max_ver:
raise ValueError(f"Invalid version {version}")
self._version = version
self._reset_dataset()
def __len__(self):
return self._dataset.count_rows()
def __repr__(self) -> str:
return f"LanceTable({self.name})"
def __str__(self) -> str:
return self.__repr__()
def head(self, n=5) -> pa.Table:
    """Return the first ``n`` rows of the table (5 by default)."""
    dataset = self._dataset
    return dataset.head(n)
def to_pandas(self) -> pd.DataFrame:
    """Return the table as a pandas DataFrame, converted via ``to_arrow``."""
    arrow_table = self.to_arrow()
    return arrow_table.to_pandas()
def to_arrow(self) -> pa.Table:
    """Return the table contents as a pyarrow Table."""
    dataset = self._dataset
    return dataset.to_table()
@property
def _dataset_uri(self) -> str:
return os.path.join(self._conn.uri, f"{self.name}.lance")
def create_index(self, num_partitions=256, num_sub_vectors=96):
def create_index(self, metric="L2", num_partitions=256, num_sub_vectors=96):
"""Create an index on the table.
Parameters
----------
metric: str, default "L2"
The distance metric to use when creating the index. Valid values are "L2" or "cosine".
L2 is euclidean distance.
num_partitions: int
The number of IVF partitions to use when creating the index.
Default is 256.
@@ -71,16 +121,18 @@ class LanceTable:
The number of PQ sub-vectors to use when creating the index.
Default is 96.
"""
return self._dataset.create_index(
self._dataset.create_index(
column=VECTOR_COLUMN_NAME,
index_type="IVF_PQ",
metric=metric,
num_partitions=num_partitions,
num_sub_vectors=num_sub_vectors,
)
self._reset_dataset()
@cached_property
def _dataset(self) -> LanceDataset:
return lance.dataset(self._dataset_uri)
return lance.dataset(self._dataset_uri, version=self._version)
def to_lance(self) -> LanceDataset:
"""Return the LanceDataset backing this table."""
@@ -102,8 +154,9 @@ class LanceTable:
The number of vectors added to the table.
"""
data = _sanitize_data(data, self.schema)
ds = lance.write_dataset(data, self._dataset_uri, mode=mode)
return ds.count_rows()
lance.write_dataset(data, self._dataset_uri, mode=mode)
self._reset_dataset()
return len(self)
def search(self, query: VEC) -> LanceQueryBuilder:
"""Create a search query to find the nearest neighbors
@@ -117,6 +170,9 @@ class LanceTable:
Returns
-------
A LanceQueryBuilder object representing the query.
Once executed, the query returns selected columns, the vector,
and also the "score" column which is the distance between the query
vector and the returned vector.
"""
if isinstance(query, list):
query = np.array(query)
@@ -127,10 +183,10 @@ class LanceTable:
return LanceQueryBuilder(self, query)
@classmethod
def create(cls, db, name, data, schema=None):
def create(cls, db, name, data, schema=None, mode="create"):
tbl = LanceTable(db, name)
data = _sanitize_data(data, schema)
lance.write_dataset(data, tbl._dataset_uri, mode="create")
lance.write_dataset(data, tbl._dataset_uri, mode=mode)
return tbl
@@ -150,6 +206,7 @@ def _sanitize_schema(data: pa.Table, schema: pa.Schema = None) -> pa.Table:
return data
# cast the columns to the expected types
data = data.combine_chunks()
data = _sanitize_vector_column(data, vector_column_name=VECTOR_COLUMN_NAME)
return pa.Table.from_arrays(
[data[name] for name in schema.names], schema=schema
)

43
python/lancedb/util.py Normal file
View File

@@ -0,0 +1,43 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from urllib.parse import ParseResult, urlparse
from pyarrow import fs
def get_uri_scheme(uri: str) -> str:
    """
    Return the scheme of a URI, treating scheme-less URIs as file URIs.

    Parameters
    ----------
    uri : str
        The URI to inspect.

    Returns
    -------
    str: The scheme of the URI. "s3a" and "s3n" are reported as "s3";
    a missing or single-character scheme is reported as "file".
    """
    scheme = urlparse(uri).scheme

    # No scheme at all: a plain relative or absolute path.
    if not scheme:
        return "file"

    # Treat the s3a/s3n variants as plain s3.
    if scheme in ["s3a", "s3n"]:
        return "s3"

    # Windows drive names are parsed as the scheme,
    # e.g. "c:\path" -> ParseResult(scheme="c", netloc="", path="\path", ...),
    # so any single-character scheme is handled as a local file path.
    if len(scheme) == 1:
        return "file"

    return scheme

View File

@@ -1,10 +1,10 @@
[project]
name = "lancedb"
version = "0.0.2"
dependencies = ["pylance", "ratelimiter", "retry", "tqdm"]
version = "0.1.2"
dependencies = ["pylance>=0.4.6", "ratelimiter", "retry", "tqdm"]
description = "lancedb"
authors = [
{ name = "Lance Devs", email = "dev@eto.ai" },
{ name = "LanceDB Devs", email = "dev@lancedb.com" },
]
license = { file = "LICENSE" }
readme = "README.md"

View File

@@ -11,6 +11,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import pytest
import lancedb
@@ -40,3 +43,57 @@ def test_basic(tmp_path):
assert len(db) == 1
assert db.open_table("test").name == db["test"].name
def test_ingest_pd(tmp_path):
    """Create a table from a pandas DataFrame and run searches against it."""
    db = lancedb.connect(tmp_path)

    # A freshly connected database reports its URI and has no tables.
    assert db.uri == str(tmp_path)
    assert db.table_names() == []

    frame = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    table = db.create_table("test", data=frame)

    # Unfiltered search limited to a single result.
    nearest = table.search([100, 100]).limit(1).to_df()
    assert len(nearest) == 1
    assert nearest["item"].iloc[0] == "bar"

    # The price filter leaves only the "foo" row even with limit 2.
    filtered = table.search([100, 100]).where("price < 15").limit(2).to_df()
    assert len(filtered) == 1
    assert filtered["item"].iloc[0] == "foo"

    # The new table is visible through every lookup path on the connection.
    assert db.table_names() == ["test"]
    assert "test" in db
    assert len(db) == 1
    assert db.open_table("test").name == db["test"].name
def test_create_mode(tmp_path):
    """Re-creating an existing table fails unless mode="overwrite" is passed."""
    db = lancedb.connect(tmp_path)

    original = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["foo", "bar"],
            "price": [10.0, 20.0],
        }
    )
    db.create_table("test", data=original)

    # Creating the same table a second time must raise.
    with pytest.raises(Exception):
        db.create_table("test", data=original)

    replacement = pd.DataFrame(
        {
            "vector": [[3.1, 4.1], [5.9, 26.5]],
            "item": ["fizz", "buzz"],
            "price": [10.0, 20.0],
        }
    )
    tbl = db.create_table("test", data=replacement, mode="overwrite")
    items = tbl.to_pandas().item.tolist()
    assert items == ["fizz", "buzz"]

View File

@@ -0,0 +1,42 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
import pyarrow as pa
from lancedb.embeddings import with_embeddings
def mock_embed_func(input_data):
    """Return one random 128-element vector (as a list) per input item."""
    vectors = []
    for _ in range(len(input_data)):
        vectors.append(np.random.randn(128).tolist())
    return vectors
def test_with_embeddings():
    """with_embeddings appends a "vector" column and keeps the input columns."""
    for wrap_api in [True, False]:
        # Compare the full version tuple: checking only ``minor`` would give
        # the wrong answer on any release whose major version is not 3.
        if wrap_api and sys.version_info >= (3, 11):
            # ratelimiter package doesn't work on 3.11
            continue
        data = pa.Table.from_arrays(
            [
                pa.array(["foo", "bar"]),
                pa.array([10.0, 20.0]),
            ],
            names=["text", "price"],
        )
        data = with_embeddings(mock_embed_func, data, wrap_api=wrap_api)
        assert data.num_columns == 3
        assert data.num_rows == 2
        assert data.column_names == ["text", "price", "vector"]
        assert data.column("text").to_pylist() == ["foo", "bar"]
        assert data.column("price").to_pylist() == [10.0, 20.0]

View File

@@ -12,13 +12,14 @@
# limitations under the License.
import lance
from lancedb.query import LanceQueryBuilder
import numpy as np
import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest
from lancedb.query import LanceQueryBuilder
class MockTable:
def __init__(self, tmp_path):
@@ -60,3 +61,21 @@ def test_query_builder_with_filter(table):
df = LanceQueryBuilder(table, [0, 0]).where("id = 2").to_df()
assert df["id"].values[0] == 2
assert all(df["vector"].values[0] == [3, 4])
def test_query_builder_with_metric(table):
    """The default metric matches "l2", and "cosine" yields cosine distances."""
    query = [4, 8]

    # Not specifying a metric must give the same frame as asking for "l2".
    default_df = LanceQueryBuilder(table, query).to_df()
    l2_df = LanceQueryBuilder(table, query).metric("l2").to_df()
    tm.assert_frame_equal(default_df, l2_df)

    cosine_df = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df()
    top_score = cosine_df.score[0]
    expected = cosine_distance(query, cosine_df.vector[0])
    assert top_score == pytest.approx(expected, abs=1e-6)
    assert 0 <= top_score <= 1
def cosine_distance(vec1, vec2):
    """Return one minus the cosine similarity of ``vec1`` and ``vec2``."""
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return 1 - dot_product / norm_product

View File

@@ -46,17 +46,17 @@ def test_basic(db):
assert table.to_lance().to_table() == ds.to_table()
def test_add(db):
def test_create_table(db):
schema = pa.schema(
[
pa.field("vector", pa.list_(pa.float32())),
pa.field("vector", pa.list_(pa.float32(), 2)),
pa.field("item", pa.string()),
pa.field("price", pa.float32()),
]
)
expected = pa.Table.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]]),
pa.FixedSizeListArray.from_arrays(pa.array([3.1, 4.1, 5.9, 26.5]), 2),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
@@ -79,3 +79,61 @@ def test_add(db):
.to_table()
)
assert expected == tbl
def test_add(db):
    """Adding a row grows the table and ``add`` returns the new row count."""
    initial_rows = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]
    table = LanceTable.create(db, "test", data=initial_rows)
    assert len(table) == 2

    extra_rows = [{"vector": [6.3, 100.5], "item": "new", "price": 30.0}]
    count = table.add(extra_rows)
    assert count == 3

    # Expected contents: the original two rows followed by the added one.
    expected_schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32(), 2)),
            pa.field("item", pa.string()),
            pa.field("price", pa.float64()),
        ]
    )
    vectors = pa.FixedSizeListArray.from_arrays(
        pa.array([3.1, 4.1, 5.9, 26.5, 6.3, 100.5]), 2
    )
    items = pa.array(["foo", "bar", "new"])
    prices = pa.array([10.0, 20.0, 30.0])
    expected = pa.Table.from_arrays([vectors, items, prices], schema=expected_schema)
    assert expected == table.to_arrow()
def test_versioning(db):
    """Each write adds a version, and checkout() returns to an earlier one."""
    initial_rows = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]
    table = LanceTable.create(db, "test", data=initial_rows)

    # Freshly created table: a single version holding the initial rows.
    assert len(table.list_versions()) == 1
    assert table.version == 1

    # Adding a row produces version 2 with three rows.
    extra_rows = [{"vector": [6.3, 100.5], "item": "new", "price": 30.0}]
    table.add(extra_rows)
    assert len(table.list_versions()) == 2
    assert table.version == 2
    assert len(table) == 3

    # Checking out version 1 exposes the original two rows again.
    table.checkout(1)
    assert table.version == 1
    assert len(table) == 2

30
python/tests/test_util.py Normal file
View File

@@ -0,0 +1,30 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from lancedb.util import get_uri_scheme
def test_normalize_uri():
    """get_uri_scheme reports "file" for local paths and keeps cloud schemes."""
    cases = [
        ("relative/path", "file"),
        ("/absolute/path", "file"),
        ("file:///absolute/path", "file"),
        ("s3://bucket/path", "s3"),
        ("gs://bucket/path", "gs"),
        ("c:\\windows\\path", "file"),
    ]
    for uri, expected_scheme in cases:
        assert get_uri_scheme(uri) == expected_scheme

12
rust/Cargo.toml Normal file
View File

@@ -0,0 +1,12 @@
[package]
name = "vectordb"
version = "0.0.1"
edition = "2021"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
repository = "https://github.com/lancedb/lancedb"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
lance = "0.4.3"

14
rust/src/lib.rs Normal file
View File

@@ -0,0 +1,14 @@
/// Returns the sum of `left` and `right`.
///
/// # Examples
///
/// ```
/// assert_eq!(vectordb::add(2, 2), 4);
/// ```
pub fn add(left: usize, right: usize) -> usize {
    left + right
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sanity check: `add(2, 2)` is 4.
    #[test]
    fn it_works() {
        let sum = add(2, 2);
        assert_eq!(sum, 4);
    }
}