mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-10 13:52:58 +00:00
feat: support binary vector and IVF_FLAT in TypeScript (#2221)
resolve #2218 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
@@ -53,6 +53,7 @@ export {
|
||||
Index,
|
||||
IndexOptions,
|
||||
IvfPqOptions,
|
||||
IvfFlatOptions,
|
||||
HnswPqOptions,
|
||||
HnswSqOptions,
|
||||
FtsOptions,
|
||||
@@ -79,7 +80,7 @@ export {
|
||||
DataLike,
|
||||
IntoVector,
|
||||
} from "./arrow";
|
||||
export { IntoSql } from "./util";
|
||||
export { IntoSql, packBits } from "./util";
|
||||
|
||||
/**
|
||||
* Connect to a LanceDB instance at the given URI.
|
||||
|
||||
@@ -327,6 +327,94 @@ export interface HnswSqOptions {
|
||||
efConstruction?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options to create an `IVF_FLAT` index
|
||||
*/
|
||||
export interface IvfFlatOptions {
|
||||
/**
|
||||
* The number of IVF partitions to create.
|
||||
*
|
||||
* This value should generally scale with the number of rows in the dataset.
|
||||
* By default the number of partitions is the square root of the number of
|
||||
* rows.
|
||||
*
|
||||
* If this value is too large then the first part of the search (picking the
|
||||
* right partition) will be slow. If this value is too small then the second
|
||||
* part of the search (searching within a partition) will be slow.
|
||||
*/
|
||||
numPartitions?: number;
|
||||
|
||||
/**
|
||||
* Distance type to use to build the index.
|
||||
*
|
||||
* Default value is "l2".
|
||||
*
|
||||
* This is used when training the index to calculate the IVF partitions
|
||||
* (vectors are grouped in partitions with similar vectors according to this
|
||||
* distance type).
|
||||
*
|
||||
* The distance type used to train an index MUST match the distance type used
|
||||
* to search the index. Failure to do so will yield inaccurate results.
|
||||
*
|
||||
* The following distance types are available:
|
||||
*
|
||||
* "l2" - Euclidean distance. This is a very common distance metric that
|
||||
* accounts for both magnitude and direction when determining the distance
|
||||
* between vectors. l2 distance has a range of [0, ∞).
|
||||
*
|
||||
* "cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
* calculated from the cosine similarity between two vectors. Cosine
|
||||
* similarity is a measure of similarity between two non-zero vectors of an
|
||||
* inner product space. It is defined to equal the cosine of the angle
|
||||
* between them. Unlike l2, the cosine distance is not affected by the
|
||||
* magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
*
|
||||
* Note: the cosine distance is undefined when one (or both) of the vectors
|
||||
* are all zeros (there is no direction). These vectors are invalid and may
|
||||
* never be returned from a vector search.
|
||||
*
|
||||
* "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
* distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
* l2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
*
|
||||
* "hamming" - Hamming distance. Hamming distance is a distance metric
|
||||
* calculated from the number of bits that are different between two vectors.
|
||||
* Hamming distance has a range of [0, dimension]. Note that the hamming distance
|
||||
* is only valid for binary vectors.
|
||||
*/
|
||||
distanceType?: "l2" | "cosine" | "dot" | "hamming";
|
||||
|
||||
/**
|
||||
* Max iteration to train IVF kmeans.
|
||||
*
|
||||
* When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
|
||||
* controls how many iterations of kmeans to run.
|
||||
*
|
||||
* Increasing this might improve the quality of the index but in most cases these extra
|
||||
* iterations have diminishing returns.
|
||||
*
|
||||
* The default value is 50.
|
||||
*/
|
||||
maxIterations?: number;
|
||||
|
||||
/**
|
||||
* The number of vectors, per partition, to sample when training IVF kmeans.
|
||||
*
|
||||
* When an IVF FLAT index is trained, we need to calculate partitions. These are groups
|
||||
* of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
||||
*
|
||||
* Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
||||
* random sample of the data. This parameter controls the size of the sample. The total
|
||||
* number of vectors used to train the index is `sample_rate * num_partitions`.
|
||||
*
|
||||
* Increasing this value might improve the quality of the index but in most cases the
|
||||
* default should be sufficient.
|
||||
*
|
||||
* The default value is 256.
|
||||
*/
|
||||
sampleRate?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options to create a full text search index
|
||||
*/
|
||||
@@ -426,6 +514,33 @@ export class Index {
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an IvfFlat index
|
||||
*
|
||||
* This index groups vectors into partitions of similar vectors. Each partition keeps track of
|
||||
* a centroid which is the average value of all vectors in the group.
|
||||
*
|
||||
* During a query the centroids are compared with the query vector to find the closest
|
||||
* partitions. The vectors in these partitions are then searched to find
|
||||
* the closest vectors.
|
||||
*
|
||||
* The partitioning process is called IVF and the `num_partitions` parameter controls how
|
||||
* many groups to create.
|
||||
*
|
||||
* Note that training an IVF FLAT index on a large dataset is a slow operation and
|
||||
* currently is also a memory intensive operation.
|
||||
*/
|
||||
static ivfFlat(options?: Partial<IvfFlatOptions>) {
|
||||
return new Index(
|
||||
LanceDbIndex.ivfFlat(
|
||||
options?.distanceType,
|
||||
options?.numPartitions,
|
||||
options?.maxIterations,
|
||||
options?.sampleRate,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a btree index
|
||||
*
|
||||
|
||||
@@ -35,6 +35,16 @@ export function toSQL(value: IntoSql): string {
|
||||
}
|
||||
}
|
||||
|
||||
export function packBits(data: Array<number>): Array<number> {
|
||||
const packed = Array(data.length >> 3).fill(0);
|
||||
for (let i = 0; i < data.length; i++) {
|
||||
const byte = i >> 3;
|
||||
const bit = i & 7;
|
||||
packed[byte] |= data[i] << bit;
|
||||
}
|
||||
return packed;
|
||||
}
|
||||
|
||||
export class TTLCache {
|
||||
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
||||
private readonly cache: Map<string, { value: any; expires: number }>;
|
||||
|
||||
Reference in New Issue
Block a user