From bdb6c09c3b0432bc77c49e230a63d5c41a52ecd3 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Sat, 22 Mar 2025 01:57:08 +0800 Subject: [PATCH] feat: support binary vector and IVF_FLAT in TypeScript (#2221) resolve #2218 --------- Signed-off-by: BubbleCal --- docs/src/js/classes/Index.md | 31 ++++++ docs/src/js/functions/packBits.md | 19 ++++ docs/src/js/globals.md | 2 + docs/src/js/interfaces/IvfFlatOptions.md | 112 ++++++++++++++++++++++ docs/src/search.md | 13 +++ nodejs/__test__/table.test.ts | 33 +++++++ nodejs/examples/search.test.ts | 43 ++++++++- nodejs/lancedb/index.ts | 3 +- nodejs/lancedb/indices.ts | 115 +++++++++++++++++++++++ nodejs/lancedb/util.ts | 10 ++ nodejs/src/index.rs | 30 +++++- 11 files changed, 406 insertions(+), 5 deletions(-) create mode 100644 docs/src/js/functions/packBits.md create mode 100644 docs/src/js/interfaces/IvfFlatOptions.md diff --git a/docs/src/js/classes/Index.md b/docs/src/js/classes/Index.md index 1b73766b..0ec245b2 100644 --- a/docs/src/js/classes/Index.md +++ b/docs/src/js/classes/Index.md @@ -126,6 +126,37 @@ the vectors. *** +### ivfFlat() + +```ts +static ivfFlat(options?): Index +``` + +Create an IvfFlat index + +This index groups vectors into partitions of similar vectors. Each partition keeps track of +a centroid which is the average value of all vectors in the group. + +During a query the centroids are compared with the query vector to find the closest +partitions. The vectors in these partitions are then searched to find +the closest vectors. + +The partitioning process is called IVF and the `numPartitions` option controls how +many groups to create. + +Note that training an IVF FLAT index on a large dataset is a slow operation and +currently is also a memory intensive operation.
+ +#### Parameters + +* **options?**: `Partial`<[`IvfFlatOptions`](../interfaces/IvfFlatOptions.md)> + +#### Returns + +[`Index`](Index.md) + +*** + ### ivfPq() ```ts diff --git a/docs/src/js/functions/packBits.md b/docs/src/js/functions/packBits.md new file mode 100644 index 00000000..63e9e0aa --- /dev/null +++ b/docs/src/js/functions/packBits.md @@ -0,0 +1,19 @@ +[**@lancedb/lancedb**](../README.md) • **Docs** + +*** + +[@lancedb/lancedb](../globals.md) / packBits + +# Function: packBits() + +```ts +function packBits(data): number[] +``` + +## Parameters + +* **data**: `number`[] + +## Returns + +`number`[] diff --git a/docs/src/js/globals.md b/docs/src/js/globals.md index 13810a5c..f57acf68 100644 --- a/docs/src/js/globals.md +++ b/docs/src/js/globals.md @@ -39,6 +39,7 @@ - [IndexConfig](interfaces/IndexConfig.md) - [IndexOptions](interfaces/IndexOptions.md) - [IndexStatistics](interfaces/IndexStatistics.md) +- [IvfFlatOptions](interfaces/IvfFlatOptions.md) - [IvfPqOptions](interfaces/IvfPqOptions.md) - [OpenTableOptions](interfaces/OpenTableOptions.md) - [OptimizeOptions](interfaces/OptimizeOptions.md) @@ -66,3 +67,4 @@ - [connect](functions/connect.md) - [makeArrowTable](functions/makeArrowTable.md) +- [packBits](functions/packBits.md) diff --git a/docs/src/js/interfaces/IvfFlatOptions.md b/docs/src/js/interfaces/IvfFlatOptions.md new file mode 100644 index 00000000..36da9890 --- /dev/null +++ b/docs/src/js/interfaces/IvfFlatOptions.md @@ -0,0 +1,112 @@ +[**@lancedb/lancedb**](../README.md) • **Docs** + +*** + +[@lancedb/lancedb](../globals.md) / IvfFlatOptions + +# Interface: IvfFlatOptions + +Options to create an `IVF_FLAT` index + +## Properties + +### distanceType? + +```ts +optional distanceType: "l2" | "cosine" | "dot" | "hamming"; +``` + +Distance type to use to build the index. + +Default value is "l2". 
+ +This is used when training the index to calculate the IVF partitions +(vectors are grouped in partitions with similar vectors according to this +distance type). + +The distance type used to train an index MUST match the distance type used +to search the index. Failure to do so will yield inaccurate results. + +The following distance types are available: + +"l2" - Euclidean distance. This is a very common distance metric that +accounts for both magnitude and direction when determining the distance +between vectors. l2 distance has a range of [0, ∞). + +"cosine" - Cosine distance. Cosine distance is a distance metric +calculated from the cosine similarity between two vectors. Cosine +similarity is a measure of similarity between two non-zero vectors of an +inner product space. It is defined to equal the cosine of the angle +between them. Unlike l2, the cosine distance is not affected by the +magnitude of the vectors. Cosine distance has a range of [0, 2]. + +Note: the cosine distance is undefined when one (or both) of the vectors +are all zeros (there is no direction). These vectors are invalid and may +never be returned from a vector search. + +"dot" - Dot product. Dot distance is the dot product of two vectors. Dot +distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their +l2 norm is 1), then dot distance is equivalent to the cosine distance. + +"hamming" - Hamming distance. Hamming distance is a distance metric +calculated from the number of bits that are different between two vectors. +Hamming distance has a range of [0, dimension]. Note that the hamming distance +is only valid for binary vectors. + +*** + +### maxIterations? + +```ts +optional maxIterations: number; +``` + +Max iteration to train IVF kmeans. + +When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter +controls how many iterations of kmeans to run. 
+ +Increasing this might improve the quality of the index but in most cases these extra +iterations have diminishing returns. + +The default value is 50. + +*** + +### numPartitions? + +```ts +optional numPartitions: number; +``` + +The number of IVF partitions to create. + +This value should generally scale with the number of rows in the dataset. +By default the number of partitions is the square root of the number of +rows. + +If this value is too large then the first part of the search (picking the +right partition) will be slow. If this value is too small then the second +part of the search (searching within a partition) will be slow. + +*** + +### sampleRate? + +```ts +optional sampleRate: number; +``` + +The number of vectors, per partition, to sample when training IVF kmeans. + +When an IVF FLAT index is trained, we need to calculate partitions. These are groups +of vectors that are similar to each other. To do this we use an algorithm called kmeans. + +Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a +random sample of the data. This parameter controls the size of the sample. The total +number of vectors used to train the index is `sampleRate * numPartitions`. + +Increasing this value might improve the quality of the index but in most cases the +default should be sufficient. + +The default value is 256.
diff --git a/docs/src/search.md b/docs/src/search.md index 7703a2bd..7ffabd5d 100644 --- a/docs/src/search.md +++ b/docs/src/search.md @@ -138,6 +138,19 @@ LanceDB supports binary vectors as a data type, and has the ability to search bi --8<-- "python/python/tests/docs/test_binary_vector.py:async_binary_vector" ``` + === "TypeScript" + + ```ts + --8<-- "nodejs/examples/search.test.ts:import" + + --8<-- "nodejs/examples/search.test.ts:import_bin_util" + + --8<-- "nodejs/examples/search.test.ts:ingest_binary_data" + + --8<-- "nodejs/examples/search.test.ts:search_binary_data" + ``` + + ## Multivector type LanceDB supports multivector type, this is useful when you have multiple vectors for a single item (e.g. with ColBert and ColPali). diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 821fa983..034042e5 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -21,6 +21,7 @@ import { Int64, List, Schema, + Uint8, Utf8, makeArrowTable, } from "../lancedb/arrow"; @@ -740,6 +741,38 @@ describe("When creating an index", () => { expect(stats).toBeUndefined(); }); + test("create ivf_flat with binary vectors", async () => { + const db = await connect(tmpDir.name); + const binarySchema = new Schema([ + new Field("id", new Int32(), true), + new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))), + ]); + const tbl = await db.createTable( + "binary", + makeArrowTable( + Array(300) + .fill(1) + .map((_, i) => ({ + id: i, + vec: Array(32) + .fill(1) + .map(() => Math.floor(Math.random() * 255)), + })), + { schema: binarySchema }, + ), + ); + await tbl.createIndex("vec", { + config: Index.ivfFlat({ numPartitions: 10, distanceType: "hamming" }), + }); + + // query with binary vectors + const queryVec = Array(32) + .fill(1) + .map(() => Math.floor(Math.random() * 255)); + const rst = await tbl.query().limit(5).nearestTo(queryVec).toArrow(); + expect(rst.numRows).toBe(5); + }); + // TODO: Move this test to the 
query API test (making sure we can reject queries // when the dimension is incorrect) test("two columns with different dimensions", async () => { diff --git a/nodejs/examples/search.test.ts b/nodejs/examples/search.test.ts index d188f7e3..b04e5ffa 100644 --- a/nodejs/examples/search.test.ts +++ b/nodejs/examples/search.test.ts @@ -4,9 +4,12 @@ import { expect, test } from "@jest/globals"; // --8<-- [start:import] import * as lancedb from "@lancedb/lancedb"; // --8<-- [end:import] +// --8<-- [start:import_bin_util] +import { Field, FixedSizeList, Int32, Schema, Uint8 } from "apache-arrow"; +// --8<-- [end:import_bin_util] import { withTempDirectory } from "./util.ts"; -test("full text search", async () => { +test("vector search", async () => { await withTempDirectory(async (databaseDir) => { { const db = await lancedb.connect(databaseDir); @@ -14,8 +17,6 @@ test("full text search", async () => { const data = Array.from({ length: 10_000 }, (_, i) => ({ vector: Array(128).fill(i), id: `${i}`, - content: "", - longId: `${i}`, })); await db.createTable("my_vectors", data); @@ -52,5 +53,41 @@ test("full text search", async () => { expect(r.distance).toBeGreaterThanOrEqual(0.1); expect(r.distance).toBeLessThan(0.2); } + + { + // --8<-- [start:ingest_binary_data] + const schema = new Schema([ + new Field("id", new Int32(), true), + new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))), + ]); + const data = lancedb.makeArrowTable( + Array(1_000) + .fill(0) + .map((_, i) => ({ + // the 256 bits are stored in 32 bytes; + // if your data is already in this format, you can skip the packBits step + id: i, + vec: lancedb.packBits(Array(256).fill(i % 2)), + })), + { schema: schema }, + ); + + const tbl = await db.createTable("binary_table", data); + await tbl.createIndex("vec", { + config: lancedb.Index.ivfFlat({ + numPartitions: 10, + distanceType: "hamming", + }), + }); + // --8<-- [end:ingest_binary_data] + + // --8<-- [start:search_binary_data] + const
query = Array(32) + .fill(1) + .map(() => Math.floor(Math.random() * 255)); + const results = await tbl.query().nearestTo(query).limit(10).toArrow(); + // --8<-- [end:search_binary_data] + expect(results.numRows).toBe(10); + } }); }); diff --git a/nodejs/lancedb/index.ts b/nodejs/lancedb/index.ts index 482910e3..76eef118 100644 --- a/nodejs/lancedb/index.ts +++ b/nodejs/lancedb/index.ts @@ -53,6 +53,7 @@ export { Index, IndexOptions, IvfPqOptions, + IvfFlatOptions, HnswPqOptions, HnswSqOptions, FtsOptions, @@ -79,7 +80,7 @@ export { DataLike, IntoVector, } from "./arrow"; -export { IntoSql } from "./util"; +export { IntoSql, packBits } from "./util"; /** * Connect to a LanceDB instance at the given URI. diff --git a/nodejs/lancedb/indices.ts b/nodejs/lancedb/indices.ts index 1f3af0d6..ff37723d 100644 --- a/nodejs/lancedb/indices.ts +++ b/nodejs/lancedb/indices.ts @@ -327,6 +327,94 @@ export interface HnswSqOptions { efConstruction?: number; } +/** + * Options to create an `IVF_FLAT` index + */ +export interface IvfFlatOptions { + /** + * The number of IVF partitions to create. + * + * This value should generally scale with the number of rows in the dataset. + * By default the number of partitions is the square root of the number of + * rows. + * + * If this value is too large then the first part of the search (picking the + * right partition) will be slow. If this value is too small then the second + * part of the search (searching within a partition) will be slow. + */ + numPartitions?: number; + + /** + * Distance type to use to build the index. + * + * Default value is "l2". + * + * This is used when training the index to calculate the IVF partitions + * (vectors are grouped in partitions with similar vectors according to this + * distance type). + * + * The distance type used to train an index MUST match the distance type used + * to search the index. Failure to do so will yield inaccurate results.
+ * + * The following distance types are available: + * + * "l2" - Euclidean distance. This is a very common distance metric that + * accounts for both magnitude and direction when determining the distance + * between vectors. l2 distance has a range of [0, ∞). + * + * "cosine" - Cosine distance. Cosine distance is a distance metric + * calculated from the cosine similarity between two vectors. Cosine + * similarity is a measure of similarity between two non-zero vectors of an + * inner product space. It is defined to equal the cosine of the angle + * between them. Unlike l2, the cosine distance is not affected by the + * magnitude of the vectors. Cosine distance has a range of [0, 2]. + * + * Note: the cosine distance is undefined when one (or both) of the vectors + * are all zeros (there is no direction). These vectors are invalid and may + * never be returned from a vector search. + * + * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot + * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their + * l2 norm is 1), then dot distance is equivalent to the cosine distance. + * + * "hamming" - Hamming distance. Hamming distance is a distance metric + * calculated from the number of bits that are different between two vectors. + * Hamming distance has a range of [0, dimension]. Note that the hamming distance + * is only valid for binary vectors. + */ + distanceType?: "l2" | "cosine" | "dot" | "hamming"; + + /** + * Max iteration to train IVF kmeans. + * + * When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter + * controls how many iterations of kmeans to run. + * + * Increasing this might improve the quality of the index but in most cases these extra + * iterations have diminishing returns. + * + * The default value is 50. + */ + maxIterations?: number; + + /** + * The number of vectors, per partition, to sample when training IVF kmeans. 
+ * + * When an IVF FLAT index is trained, we need to calculate partitions. These are groups + * of vectors that are similar to each other. To do this we use an algorithm called kmeans. + * + * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a + * random sample of the data. This parameter controls the size of the sample. The total + * number of vectors used to train the index is `sampleRate * numPartitions`. + * + * Increasing this value might improve the quality of the index but in most cases the + * default should be sufficient. + * + * The default value is 256. + */ + sampleRate?: number; +} + /** * Options to create a full text search index */ @@ -426,6 +514,33 @@ export class Index { ); } + /** + * Create an IvfFlat index + * + * This index groups vectors into partitions of similar vectors. Each partition keeps track of + * a centroid which is the average value of all vectors in the group. + * + * During a query the centroids are compared with the query vector to find the closest + * partitions. The vectors in these partitions are then searched to find + * the closest vectors. + * + * The partitioning process is called IVF and the `numPartitions` option controls how + * many groups to create. + * + * Note that training an IVF FLAT index on a large dataset is a slow operation and + * currently is also a memory intensive operation.
+ */ + static ivfFlat(options?: Partial) { + return new Index( + LanceDbIndex.ivfFlat( + options?.distanceType, + options?.numPartitions, + options?.maxIterations, + options?.sampleRate, + ), + ); + } + /** * Create a btree index * diff --git a/nodejs/lancedb/util.ts b/nodejs/lancedb/util.ts index 275d5e15..9219f3d1 100644 --- a/nodejs/lancedb/util.ts +++ b/nodejs/lancedb/util.ts @@ -35,6 +35,16 @@ export function toSQL(value: IntoSql): string { } } +export function packBits(data: Array): Array { + const packed = Array(data.length >> 3).fill(0); + for (let i = 0; i < data.length; i++) { + const byte = i >> 3; + const bit = i & 7; + packed[byte] |= data[i] << bit; + } + return packed; +} + export class TTLCache { // biome-ignore lint/suspicious/noExplicitAny: private readonly cache: Map; diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs index d91b22f4..900505ce 100644 --- a/nodejs/src/index.rs +++ b/nodejs/src/index.rs @@ -4,7 +4,9 @@ use std::sync::Mutex; use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder}; -use lancedb::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder}; +use lancedb::index::vector::{ + IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, +}; use lancedb::index::Index as LanceDbIndex; use napi_derive::napi; @@ -63,6 +65,32 @@ impl Index { }) } + #[napi(factory)] + pub fn ivf_flat( + distance_type: Option, + num_partitions: Option, + max_iterations: Option, + sample_rate: Option, + ) -> napi::Result { + let mut ivf_flat_builder = IvfFlatIndexBuilder::default(); + if let Some(distance_type) = distance_type { + let distance_type = parse_distance_type(distance_type)?; + ivf_flat_builder = ivf_flat_builder.distance_type(distance_type); + } + if let Some(num_partitions) = num_partitions { + ivf_flat_builder = ivf_flat_builder.num_partitions(num_partitions); + } + if let Some(max_iterations) = max_iterations { + ivf_flat_builder = 
ivf_flat_builder.max_iterations(max_iterations); + } + if let Some(sample_rate) = sample_rate { + ivf_flat_builder = ivf_flat_builder.sample_rate(sample_rate); + } + Ok(Self { + inner: Mutex::new(Some(LanceDbIndex::IvfFlat(ivf_flat_builder))), + }) + } + #[napi(factory)] pub fn btree() -> Self { Self {