mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-22 21:09:58 +00:00
feat: support binary vector and IVF_FLAT in TypeScript (#2221)
resolve #2218 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
@@ -126,6 +126,37 @@ the vectors.
|
||||
|
||||
***
|
||||
|
||||
### ivfFlat()
|
||||
|
||||
```ts
|
||||
static ivfFlat(options?): Index
|
||||
```
|
||||
|
||||
Create an IvfFlat index
|
||||
|
||||
This index groups vectors into partitions of similar vectors. Each partition keeps track of
|
||||
a centroid which is the average value of all vectors in the group.
|
||||
|
||||
During a query the centroids are compared with the query vector to find the closest
|
||||
partitions. The vectors in these partitions are then searched to find
|
||||
the closest vectors.
|
||||
|
||||
The partitioning process is called IVF and the `numPartitions` option controls how
|
||||
many groups to create.
|
||||
|
||||
Note that training an IVF FLAT index on a large dataset is a slow operation and
|
||||
currently is also a memory intensive operation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **options?**: `Partial`<[`IvfFlatOptions`](../interfaces/IvfFlatOptions.md)>
|
||||
|
||||
#### Returns
|
||||
|
||||
[`Index`](Index.md)
|
||||
|
||||
***
|
||||
|
||||
### ivfPq()
|
||||
|
||||
```ts
|
||||
|
||||
19
docs/src/js/functions/packBits.md
Normal file
19
docs/src/js/functions/packBits.md
Normal file
@@ -0,0 +1,19 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / packBits
|
||||
|
||||
# Function: packBits()
|
||||
|
||||
```ts
|
||||
function packBits(data): number[]
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
* **data**: `number`[]
|
||||
|
||||
## Returns
|
||||
|
||||
`number`[]
|
||||
@@ -39,6 +39,7 @@
|
||||
- [IndexConfig](interfaces/IndexConfig.md)
|
||||
- [IndexOptions](interfaces/IndexOptions.md)
|
||||
- [IndexStatistics](interfaces/IndexStatistics.md)
|
||||
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
|
||||
- [IvfPqOptions](interfaces/IvfPqOptions.md)
|
||||
- [OpenTableOptions](interfaces/OpenTableOptions.md)
|
||||
- [OptimizeOptions](interfaces/OptimizeOptions.md)
|
||||
@@ -66,3 +67,4 @@
|
||||
|
||||
- [connect](functions/connect.md)
|
||||
- [makeArrowTable](functions/makeArrowTable.md)
|
||||
- [packBits](functions/packBits.md)
|
||||
|
||||
112
docs/src/js/interfaces/IvfFlatOptions.md
Normal file
112
docs/src/js/interfaces/IvfFlatOptions.md
Normal file
@@ -0,0 +1,112 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / IvfFlatOptions
|
||||
|
||||
# Interface: IvfFlatOptions
|
||||
|
||||
Options to create an `IVF_FLAT` index
|
||||
|
||||
## Properties
|
||||
|
||||
### distanceType?
|
||||
|
||||
```ts
|
||||
optional distanceType: "l2" | "cosine" | "dot" | "hamming";
|
||||
```
|
||||
|
||||
Distance type to use to build the index.
|
||||
|
||||
Default value is "l2".
|
||||
|
||||
This is used when training the index to calculate the IVF partitions
|
||||
(vectors are grouped in partitions with similar vectors according to this
|
||||
distance type).
|
||||
|
||||
The distance type used to train an index MUST match the distance type used
|
||||
to search the index. Failure to do so will yield inaccurate results.
|
||||
|
||||
The following distance types are available:
|
||||
|
||||
"l2" - Euclidean distance. This is a very common distance metric that
|
||||
accounts for both magnitude and direction when determining the distance
|
||||
between vectors. l2 distance has a range of [0, ∞).
|
||||
|
||||
"cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
calculated from the cosine similarity between two vectors. Cosine
|
||||
similarity is a measure of similarity between two non-zero vectors of an
|
||||
inner product space. It is defined to equal the cosine of the angle
|
||||
between them. Unlike l2, the cosine distance is not affected by the
|
||||
magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
|
||||
Note: the cosine distance is undefined when one (or both) of the vectors
|
||||
are all zeros (there is no direction). These vectors are invalid and may
|
||||
never be returned from a vector search.
|
||||
|
||||
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
l2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
|
||||
"hamming" - Hamming distance. Hamming distance is a distance metric
|
||||
calculated from the number of bits that are different between two vectors.
|
||||
Hamming distance has a range of [0, dimension]. Note that the hamming distance
|
||||
is only valid for binary vectors.
|
||||
|
||||
***
|
||||
|
||||
### maxIterations?
|
||||
|
||||
```ts
|
||||
optional maxIterations: number;
|
||||
```
|
||||
|
||||
The maximum number of kmeans iterations to run when training the IVF partitions.
|
||||
|
||||
When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
|
||||
controls how many iterations of kmeans to run.
|
||||
|
||||
Increasing this might improve the quality of the index but in most cases these extra
|
||||
iterations have diminishing returns.
|
||||
|
||||
The default value is 50.
|
||||
|
||||
***
|
||||
|
||||
### numPartitions?
|
||||
|
||||
```ts
|
||||
optional numPartitions: number;
|
||||
```
|
||||
|
||||
The number of IVF partitions to create.
|
||||
|
||||
This value should generally scale with the number of rows in the dataset.
|
||||
By default the number of partitions is the square root of the number of
|
||||
rows.
|
||||
|
||||
If this value is too large then the first part of the search (picking the
|
||||
right partition) will be slow. If this value is too small then the second
|
||||
part of the search (searching within a partition) will be slow.
|
||||
|
||||
***
|
||||
|
||||
### sampleRate?
|
||||
|
||||
```ts
|
||||
optional sampleRate: number;
|
||||
```
|
||||
|
||||
The number of vectors, per partition, to sample when training IVF kmeans.
|
||||
|
||||
When an IVF FLAT index is trained, we need to calculate partitions. These are groups
|
||||
of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
||||
|
||||
Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
||||
random sample of the data. This parameter controls the size of the sample. The total
|
||||
number of vectors used to train the index is `sampleRate * numPartitions`.
|
||||
|
||||
Increasing this value might improve the quality of the index but in most cases the
|
||||
default should be sufficient.
|
||||
|
||||
The default value is 256.
|
||||
@@ -138,6 +138,19 @@ LanceDB supports binary vectors as a data type, and has the ability to search bi
|
||||
--8<-- "python/python/tests/docs/test_binary_vector.py:async_binary_vector"
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```ts
|
||||
--8<-- "nodejs/examples/search.test.ts:import"
|
||||
|
||||
--8<-- "nodejs/examples/search.test.ts:import_bin_util"
|
||||
|
||||
--8<-- "nodejs/examples/search.test.ts:ingest_binary_data"
|
||||
|
||||
--8<-- "nodejs/examples/search.test.ts:search_binary_data"
|
||||
```
|
||||
|
||||
|
||||
## Multivector type
|
||||
|
||||
LanceDB supports the multivector type, which is useful when you have multiple vectors for a single item (e.g. with ColBERT and ColPali).
|
||||
|
||||
@@ -21,6 +21,7 @@ import {
|
||||
Int64,
|
||||
List,
|
||||
Schema,
|
||||
Uint8,
|
||||
Utf8,
|
||||
makeArrowTable,
|
||||
} from "../lancedb/arrow";
|
||||
@@ -740,6 +741,38 @@ describe("When creating an index", () => {
|
||||
expect(stats).toBeUndefined();
|
||||
});
|
||||
|
||||
test("create ivf_flat with binary vectors", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const binarySchema = new Schema([
|
||||
new Field("id", new Int32(), true),
|
||||
new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
|
||||
]);
|
||||
const tbl = await db.createTable(
|
||||
"binary",
|
||||
makeArrowTable(
|
||||
Array(300)
|
||||
.fill(1)
|
||||
.map((_, i) => ({
|
||||
id: i,
|
||||
vec: Array(32)
|
||||
.fill(1)
|
||||
.map(() => Math.floor(Math.random() * 255)),
|
||||
})),
|
||||
{ schema: binarySchema },
|
||||
),
|
||||
);
|
||||
await tbl.createIndex("vec", {
|
||||
config: Index.ivfFlat({ numPartitions: 10, distanceType: "hamming" }),
|
||||
});
|
||||
|
||||
// query with binary vectors
|
||||
const queryVec = Array(32)
|
||||
.fill(1)
|
||||
.map(() => Math.floor(Math.random() * 255));
|
||||
const rst = await tbl.query().limit(5).nearestTo(queryVec).toArrow();
|
||||
expect(rst.numRows).toBe(5);
|
||||
});
|
||||
|
||||
// TODO: Move this test to the query API test (making sure we can reject queries
|
||||
// when the dimension is incorrect)
|
||||
test("two columns with different dimensions", async () => {
|
||||
|
||||
@@ -4,9 +4,12 @@ import { expect, test } from "@jest/globals";
|
||||
// --8<-- [start:import]
|
||||
import * as lancedb from "@lancedb/lancedb";
|
||||
// --8<-- [end:import]
|
||||
// --8<-- [start:import_bin_util]
|
||||
import { Field, FixedSizeList, Int32, Schema, Uint8 } from "apache-arrow";
|
||||
// --8<-- [end:import_bin_util]
|
||||
import { withTempDirectory } from "./util.ts";
|
||||
|
||||
test("full text search", async () => {
|
||||
test("vector search", async () => {
|
||||
await withTempDirectory(async (databaseDir) => {
|
||||
{
|
||||
const db = await lancedb.connect(databaseDir);
|
||||
@@ -14,8 +17,6 @@ test("full text search", async () => {
|
||||
const data = Array.from({ length: 10_000 }, (_, i) => ({
|
||||
vector: Array(128).fill(i),
|
||||
id: `${i}`,
|
||||
content: "",
|
||||
longId: `${i}`,
|
||||
}));
|
||||
|
||||
await db.createTable("my_vectors", data);
|
||||
@@ -52,5 +53,41 @@ test("full text search", async () => {
|
||||
expect(r.distance).toBeGreaterThanOrEqual(0.1);
|
||||
expect(r.distance).toBeLessThan(0.2);
|
||||
}
|
||||
|
||||
{
|
||||
// --8<-- [start:ingest_binary_data]
|
||||
const schema = new Schema([
|
||||
new Field("id", new Int32(), true),
|
||||
new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
|
||||
]);
|
||||
const data = lancedb.makeArrowTable(
|
||||
Array(1_000)
|
||||
.fill(0)
|
||||
.map((_, i) => ({
|
||||
// the 256 bits are stored in 32 bytes,
|
||||
// if your data is already in this format, you can skip the packBits step
|
||||
id: i,
|
||||
vec: lancedb.packBits(Array(256).fill(i % 2)),
|
||||
})),
|
||||
{ schema: schema },
|
||||
);
|
||||
|
||||
const tbl = await db.createTable("binary_table", data);
|
||||
await tbl.createIndex("vec", {
|
||||
config: lancedb.Index.ivfFlat({
|
||||
numPartitions: 10,
|
||||
distanceType: "hamming",
|
||||
}),
|
||||
});
|
||||
// --8<-- [end:ingest_binary_data]
|
||||
|
||||
// --8<-- [start:search_binary_data]
|
||||
const query = Array(32)
|
||||
.fill(1)
|
||||
.map(() => Math.floor(Math.random() * 255));
|
||||
const results = await tbl.query().nearestTo(query).limit(10).toArrow();
|
||||
// --8<-- [end:search_binary_data]
|
||||
expect(results.numRows).toBe(10);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -53,6 +53,7 @@ export {
|
||||
Index,
|
||||
IndexOptions,
|
||||
IvfPqOptions,
|
||||
IvfFlatOptions,
|
||||
HnswPqOptions,
|
||||
HnswSqOptions,
|
||||
FtsOptions,
|
||||
@@ -79,7 +80,7 @@ export {
|
||||
DataLike,
|
||||
IntoVector,
|
||||
} from "./arrow";
|
||||
export { IntoSql } from "./util";
|
||||
export { IntoSql, packBits } from "./util";
|
||||
|
||||
/**
|
||||
* Connect to a LanceDB instance at the given URI.
|
||||
|
||||
@@ -327,6 +327,94 @@ export interface HnswSqOptions {
|
||||
efConstruction?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options to create an `IVF_FLAT` index
|
||||
*/
|
||||
export interface IvfFlatOptions {
|
||||
/**
|
||||
* The number of IVF partitions to create.
|
||||
*
|
||||
* This value should generally scale with the number of rows in the dataset.
|
||||
* By default the number of partitions is the square root of the number of
|
||||
* rows.
|
||||
*
|
||||
* If this value is too large then the first part of the search (picking the
|
||||
* right partition) will be slow. If this value is too small then the second
|
||||
* part of the search (searching within a partition) will be slow.
|
||||
*/
|
||||
numPartitions?: number;
|
||||
|
||||
/**
|
||||
* Distance type to use to build the index.
|
||||
*
|
||||
* Default value is "l2".
|
||||
*
|
||||
* This is used when training the index to calculate the IVF partitions
|
||||
* (vectors are grouped in partitions with similar vectors according to this
|
||||
* distance type).
|
||||
*
|
||||
* The distance type used to train an index MUST match the distance type used
|
||||
* to search the index. Failure to do so will yield inaccurate results.
|
||||
*
|
||||
* The following distance types are available:
|
||||
*
|
||||
* "l2" - Euclidean distance. This is a very common distance metric that
|
||||
* accounts for both magnitude and direction when determining the distance
|
||||
* between vectors. l2 distance has a range of [0, ∞).
|
||||
*
|
||||
* "cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
* calculated from the cosine similarity between two vectors. Cosine
|
||||
* similarity is a measure of similarity between two non-zero vectors of an
|
||||
* inner product space. It is defined to equal the cosine of the angle
|
||||
* between them. Unlike l2, the cosine distance is not affected by the
|
||||
* magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
*
|
||||
* Note: the cosine distance is undefined when one (or both) of the vectors
|
||||
* are all zeros (there is no direction). These vectors are invalid and may
|
||||
* never be returned from a vector search.
|
||||
*
|
||||
* "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
* distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
* l2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
*
|
||||
* "hamming" - Hamming distance. Hamming distance is a distance metric
|
||||
* calculated from the number of bits that are different between two vectors.
|
||||
* Hamming distance has a range of [0, dimension]. Note that the hamming distance
|
||||
* is only valid for binary vectors.
|
||||
*/
|
||||
distanceType?: "l2" | "cosine" | "dot" | "hamming";
|
||||
|
||||
/**
|
||||
* The maximum number of kmeans iterations to run when training the IVF partitions.
|
||||
*
|
||||
* When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
|
||||
* controls how many iterations of kmeans to run.
|
||||
*
|
||||
* Increasing this might improve the quality of the index but in most cases these extra
|
||||
* iterations have diminishing returns.
|
||||
*
|
||||
* The default value is 50.
|
||||
*/
|
||||
maxIterations?: number;
|
||||
|
||||
/**
|
||||
* The number of vectors, per partition, to sample when training IVF kmeans.
|
||||
*
|
||||
* When an IVF FLAT index is trained, we need to calculate partitions. These are groups
|
||||
* of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
||||
*
|
||||
* Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
||||
* random sample of the data. This parameter controls the size of the sample. The total
|
||||
* number of vectors used to train the index is `sampleRate * numPartitions`.
|
||||
*
|
||||
* Increasing this value might improve the quality of the index but in most cases the
|
||||
* default should be sufficient.
|
||||
*
|
||||
* The default value is 256.
|
||||
*/
|
||||
sampleRate?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options to create a full text search index
|
||||
*/
|
||||
@@ -426,6 +514,33 @@ export class Index {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Create an IvfFlat index
 *
 * An IVF_FLAT index splits the vectors into partitions of similar vectors.
 * Every partition records a centroid: the average of the vectors assigned
 * to it.
 *
 * At query time the query vector is first compared against the centroids to
 * pick the closest partitions; only the vectors stored in those partitions
 * are then scanned to find the nearest neighbours.
 *
 * This partitioning step is what "IVF" refers to. The number of partitions
 * is set with the `numPartitions` option.
 *
 * Note that training an IVF FLAT index on a large dataset is a slow operation
 * and is currently also memory intensive.
 *
 * @param options - optional settings; any field left unset is forwarded as
 *   `undefined` to the native binding.
 */
static ivfFlat(options?: Partial<IvfFlatOptions>) {
  // Pull the four settings out up front (in the same order the native
  // factory expects them); a missing `options` object yields all-undefined.
  const { distanceType, numPartitions, maxIterations, sampleRate } =
    options ?? {};
  const nativeIndex = LanceDbIndex.ivfFlat(
    distanceType,
    numPartitions,
    maxIterations,
    sampleRate,
  );
  return new Index(nativeIndex);
}
|
||||
|
||||
/**
|
||||
* Create a btree index
|
||||
*
|
||||
|
||||
@@ -35,6 +35,16 @@ export function toSQL(value: IntoSql): string {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Pack an array of bits into an array of bytes, eight bits per byte.
 *
 * Input element `i` is written to bit `i % 8` of output byte
 * `Math.floor(i / 8)`, i.e. the first element of each group of eight lands
 * in the least-significant bit of its byte.
 *
 * If `data.length` is not a multiple of 8 the final byte is padded with
 * zero bits in its high positions.
 *
 * @param data - the bits to pack. Each element contributes exactly one bit:
 *   1 when the element is truthy (non-zero), 0 otherwise.
 * @returns an array of `Math.ceil(data.length / 8)` numbers, each in the
 *   range 0..255.
 */
export function packBits(data: Array<number>): Array<number> {
  // Round up so a trailing partial group of bits still gets its own byte
  // instead of relying on implicit array growth.
  const packed: Array<number> = Array(Math.ceil(data.length / 8)).fill(0);
  for (let i = 0; i < data.length; i++) {
    // Set exactly one bit per element so a stray value such as 2 cannot
    // spill into the neighbouring bit or push the byte past 255.
    if (data[i]) {
      packed[i >> 3] |= 1 << (i & 7);
    }
  }
  return packed;
}
|
||||
|
||||
export class TTLCache {
|
||||
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
||||
private readonly cache: Map<string, { value: any; expires: number }>;
|
||||
|
||||
@@ -4,7 +4,9 @@
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
|
||||
use lancedb::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder};
|
||||
use lancedb::index::vector::{
|
||||
IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
|
||||
};
|
||||
use lancedb::index::Index as LanceDbIndex;
|
||||
use napi_derive::napi;
|
||||
|
||||
@@ -63,6 +65,32 @@ impl Index {
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn ivf_flat(
|
||||
distance_type: Option<String>,
|
||||
num_partitions: Option<u32>,
|
||||
max_iterations: Option<u32>,
|
||||
sample_rate: Option<u32>,
|
||||
) -> napi::Result<Self> {
|
||||
let mut ivf_flat_builder = IvfFlatIndexBuilder::default();
|
||||
if let Some(distance_type) = distance_type {
|
||||
let distance_type = parse_distance_type(distance_type)?;
|
||||
ivf_flat_builder = ivf_flat_builder.distance_type(distance_type);
|
||||
}
|
||||
if let Some(num_partitions) = num_partitions {
|
||||
ivf_flat_builder = ivf_flat_builder.num_partitions(num_partitions);
|
||||
}
|
||||
if let Some(max_iterations) = max_iterations {
|
||||
ivf_flat_builder = ivf_flat_builder.max_iterations(max_iterations);
|
||||
}
|
||||
if let Some(sample_rate) = sample_rate {
|
||||
ivf_flat_builder = ivf_flat_builder.sample_rate(sample_rate);
|
||||
}
|
||||
Ok(Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::IvfFlat(ivf_flat_builder))),
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn btree() -> Self {
|
||||
Self {
|
||||
|
||||
Reference in New Issue
Block a user