mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 21:39:57 +00:00
Compare commits
4 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a022368426 | ||
|
|
8b815ef5a8 | ||
|
|
e4c3a9346c | ||
|
|
1d1f8964d2 |
@@ -5,8 +5,8 @@ exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.7.5", "features" = ["dynamodb"] }
|
||||
lance-linalg = { "version" = "=0.7.5" }
|
||||
lance = { "version" = "=0.8.1", "features" = ["dynamodb"] }
|
||||
lance-linalg = { "version" = "=0.8.1" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "43.0.0", optional = false }
|
||||
arrow-array = "43.0"
|
||||
|
||||
@@ -6,7 +6,7 @@ LanceDB provides many parameters to fine-tune the index's size, the speed of que
|
||||
|
||||
Currently, LanceDB does *not* automatically create the ANN index.
|
||||
LanceDB has optimized code for KNN as well. For many use-cases, datasets under 100K vectors won't require index creation at all.
|
||||
If you can live with <100ms latency, skipping index creation is a simpler workflow while guaranteeing 100% recall.
|
||||
If you can live with < 100ms latency, skipping index creation is a simpler workflow while guaranteeing 100% recall.
|
||||
|
||||
In the future we will look to automatically create and configure the ANN index.
|
||||
|
||||
@@ -154,28 +154,28 @@ You can select the columns returned by the query using a select clause.
|
||||
|
||||
## FAQ
|
||||
|
||||
### When is it necessary to create an ANN vector index.
|
||||
### When is it necessary to create an ANN vector index?
|
||||
|
||||
`LanceDB` has manually tuned SIMD code for computing vector distances.
|
||||
In our benchmarks, computing 100K pairs of 1K dimension vectors only take less than 20ms.
|
||||
For small dataset (<100K rows) or the applications which can accept 100ms latency, vector indices are usually not necessary.
|
||||
`LanceDB` has manually-tuned SIMD code for computing vector distances.
|
||||
In our benchmarks, computing 100K pairs of 1K dimension vectors takes **less than 20ms**.
|
||||
For small datasets (< 100K rows) or applications that can accept 100ms latency, vector indices are usually not necessary.
|
||||
|
||||
For large-scale or higher dimension vectors, it is beneficial to create vector index.
|
||||
|
||||
### How big is my index, and how many memory will it take.
|
||||
### How big is my index, and how many memory will it take?
|
||||
|
||||
In LanceDB, all vector indices are disk-based, meaning that when responding to a vector query, only the relevant pages from the index file are loaded from disk and cached in memory. Additionally, each sub-vector is usually encoded into 1 byte PQ code.
|
||||
In LanceDB, all vector indices are **disk-based**, meaning that when responding to a vector query, only the relevant pages from the index file are loaded from disk and cached in memory. Additionally, each sub-vector is usually encoded into 1 byte PQ code.
|
||||
|
||||
For example, with a 1024-dimension dataset, if we choose `num_sub_vectors=64`, each sub-vector has `1024 / 64 = 16` float32 numbers.
|
||||
Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` times of space reduction.
|
||||
|
||||
### How to choose `num_partitions` and `num_sub_vectors` for `IVF_PQ` index.
|
||||
### How to choose `num_partitions` and `num_sub_vectors` for `IVF_PQ` index?
|
||||
|
||||
`num_partitions` is used to decide how many partitions the first level `IVF` index uses.
|
||||
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
||||
On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows lead to a good latency / recall.
|
||||
|
||||
`num_sub_vectors` decides how many Product Quantization code to generate on each vector. Because
|
||||
Product Quantization is a lossy compression of the original vector, the more `num_sub_vectors` usually results to
|
||||
less space distortion, and thus yield better accuracy. However, similarly, more `num_sub_vectors` causes heavier I/O and
|
||||
more PQ computation, thus, higher latency. `dimension / num_sub_vectors` should be aligned with 8 for better SIMD efficiency.
|
||||
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
|
||||
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
|
||||
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||
@@ -123,9 +123,15 @@ After a table has been created, you can always add more data to it using
|
||||
|
||||
=== "Python"
|
||||
```python
|
||||
df = pd.DataFrame([{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
||||
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}])
|
||||
tbl.add(df)
|
||||
|
||||
# Option 1: Add a list of dicts to a table
|
||||
data = [{"vector": [1.3, 1.4], "item": "fizz", "price": 100.0},
|
||||
{"vector": [9.5, 56.2], "item": "buzz", "price": 200.0}]
|
||||
tbl.add(data)
|
||||
|
||||
# Option 2: Add a pandas DataFrame to a table
|
||||
df = pd.DataFrame(data)
|
||||
tbl.add(data)
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
|
||||
@@ -6,17 +6,19 @@ to make this available for JS as well.
|
||||
|
||||
## Installation
|
||||
|
||||
To use full text search, you must install optional dependency tantivy-py:
|
||||
To use full text search, you must install the dependency `tantivy-py`:
|
||||
|
||||
# tantivy 0.19.2
|
||||
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
|
||||
# tantivy 0.20.1
|
||||
```sh
|
||||
pip install tantivy==0.20.1
|
||||
```
|
||||
|
||||
|
||||
## Quickstart
|
||||
|
||||
Assume:
|
||||
1. `table` is a LanceDB Table
|
||||
2. `text` is the name of the Table column that we want to index
|
||||
2. `text` is the name of the `Table` column that we want to index
|
||||
|
||||
For example,
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
import pandas as pd
|
||||
|
||||
data = pd.DataFrame({
|
||||
"vector": [[1.1, 1.2], [0.2, 1.8]],
|
||||
"vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]],
|
||||
"lat": [45.5, 40.1],
|
||||
"long": [-122.7, -74.1]
|
||||
})
|
||||
@@ -56,7 +56,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
|
||||
```python
|
||||
custom_schema = pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("vector", pa.list_(pa.float32(), 4)),
|
||||
pa.field("lat", pa.float32()),
|
||||
pa.field("long", pa.float32())
|
||||
])
|
||||
@@ -70,8 +70,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
```python
|
||||
table = pa.Table.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]],
|
||||
pa.list_(pa.float32(), 2)),
|
||||
pa.array([[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
|
||||
pa.list_(pa.float32(), 4)),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
@@ -131,8 +131,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
for i in range(5):
|
||||
yield pa.RecordBatch.from_arrays(
|
||||
[
|
||||
pa.array([[3.1, 4.1], [5.9, 26.5]],
|
||||
pa.list_(pa.float32(), 2)),
|
||||
pa.array([[3.1, 4.1, 5.1, 6.1], [5.9, 26.5, 4.7, 32.8]],
|
||||
pa.list_(pa.float32(), 4)),
|
||||
pa.array(["foo", "bar"]),
|
||||
pa.array([10.0, 20.0]),
|
||||
],
|
||||
@@ -140,7 +140,7 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
)
|
||||
|
||||
schema = pa.schema([
|
||||
pa.field("vector", pa.list_(pa.float32(), 2)),
|
||||
pa.field("vector", pa.list_(pa.float32(), 4)),
|
||||
pa.field("item", pa.utf8()),
|
||||
pa.field("price", pa.float32()),
|
||||
])
|
||||
|
||||
@@ -25,8 +25,8 @@ Currently, we support the following metrics:
|
||||
|
||||
### Flat Search
|
||||
|
||||
If LanceDB does not create a vector index, LanceDB would need to scan (`Flat Search`) the entire vector column
|
||||
and compute the distance for each vector in order to find the closest matches.
|
||||
If you do not create a vector index, LanceDB would need to exhaustively scan the entire vector column (via `Flat Search`)
|
||||
and compute the distance for *every* vector in order to find the closest matches. This is effectively a KNN search.
|
||||
|
||||
|
||||
<!-- Setup Code
|
||||
@@ -110,7 +110,7 @@ as well.
|
||||
|
||||
To accelerate vector retrievals, it is common to build vector indices.
|
||||
A vector index is a data structure specifically designed to efficiently organize and
|
||||
search vector data based on their similarity or distance metrics.
|
||||
search vector data based on their similarity via the chosen distance metric.
|
||||
By constructing a vector index, you can reduce the search space and avoid the need
|
||||
for brute-force scanning of the entire vector column.
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.2.6
|
||||
current_version = 0.3.0
|
||||
commit = True
|
||||
message = [python] Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.2.6"
|
||||
version = "0.3.0"
|
||||
dependencies = [
|
||||
"pylance==0.8.0",
|
||||
"pylance==0.8.1",
|
||||
"ratelimiter~=1.0",
|
||||
"retry>=0.9.2",
|
||||
"tqdm>=4.1.0",
|
||||
|
||||
@@ -12,8 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use lance::index::vector::ivf::IvfBuildParams;
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::index::vector::{ivf::IvfBuildParams, pq::PQBuildParams};
|
||||
use lance_linalg::distance::MetricType;
|
||||
use neon::context::FunctionContext;
|
||||
use neon::prelude::*;
|
||||
@@ -79,11 +78,9 @@ fn get_index_params_builder(
|
||||
|
||||
num_partitions.map(|np| {
|
||||
let max_iters = max_iters.unwrap_or(50);
|
||||
let ivf_params = IvfBuildParams {
|
||||
num_partitions: np,
|
||||
max_iters,
|
||||
centroids: None,
|
||||
};
|
||||
let mut ivf_params = IvfBuildParams::default();
|
||||
ivf_params.num_partitions = np;
|
||||
ivf_params.max_iters = max_iters;
|
||||
index_builder.ivf_params(ivf_params)
|
||||
});
|
||||
|
||||
|
||||
@@ -190,9 +190,8 @@ impl Table {
|
||||
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
|
||||
use lance::index::DatasetIndexExt;
|
||||
|
||||
let dataset = self
|
||||
.dataset
|
||||
.create_index(
|
||||
let mut dataset = self.dataset.as_ref().clone();
|
||||
dataset.create_index(
|
||||
&[index_builder
|
||||
.get_column()
|
||||
.unwrap_or(VECTOR_COLUMN_NAME.to_string())
|
||||
|
||||
Reference in New Issue
Block a user