// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import { Index as LanceDbIndex } from "./native";

/**
 * Options to create an `IVF_PQ` index
 */
export interface IvfPqOptions {
  /**
   * The number of IVF partitions to create.
   *
   * This value should generally scale with the number of rows in the dataset.
   * By default the number of partitions is the square root of the number of
   * rows.
   *
   * If this value is too large then the first part of the search (picking the
   * right partition) will be slow. If this value is too small then the second
   * part of the search (searching within a partition) will be slow.
   */
  numPartitions?: number;

  /**
   * Number of sub-vectors of PQ.
   *
   * This value controls how much the vector is compressed during the quantization step.
   * The more sub-vectors there are the less the vector is compressed. The default is
   * the dimension of the vector divided by 16. If the dimension is not evenly divisible
   * by 16 we use the dimension divided by 8.
   *
   * The above two cases are highly preferred. Having 8 or 16 values per sub-vector allows
   * us to use efficient SIMD instructions.
   *
   * If the dimension is not divisible by 8 then we use 1 sub-vector. This is not ideal and
   * will likely result in poor performance.
   */
  numSubVectors?: number;

  /**
   * Number of bits per sub-vector.
   *
   * This value controls how much each sub-vector is compressed. The more bits the more
   * accurate the index will be, but the slower the search. The default is 8 bits.
   *
   * The number of bits must be 4 or 8.
   */
  numBits?: number;

  /**
   * Distance type to use to build the index.
   *
   * Default value is "l2".
   *
   * This is used when training the index to calculate the IVF partitions
   * (vectors are grouped in partitions with similar vectors according to this
   * distance type) and to calculate a sub-vector's code during quantization.
   *
   * The distance type used to train an index MUST match the distance type used
   * to search the index. Failure to do so will yield inaccurate results.
   *
   * The following distance types are available:
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * Note: the cosine distance is undefined when one (or both) of the vectors
   * are all zeros (there is no direction). These vectors are invalid and may
   * never be returned from a vector search.
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   */
  distanceType?: "l2" | "cosine" | "dot";

  /**
   * Max iterations to train IVF kmeans.
   *
   * When training an IVF PQ index we use kmeans to calculate the partitions. This parameter
   * controls how many iterations of kmeans to run.
   *
   * Increasing this might improve the quality of the index but in most cases these extra
   * iterations have diminishing returns.
   *
   * The default value is 50.
   */
  maxIterations?: number;

  /**
   * The number of vectors, per partition, to sample when training IVF kmeans.
   *
   * When an IVF PQ index is trained, we need to calculate partitions. These are groups
   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
   *
   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
   * random sample of the data. This parameter controls the size of the sample. The total
   * number of vectors used to train the index is `sampleRate * numPartitions`.
   *
   * Increasing this value might improve the quality of the index but in most cases the
   * default should be sufficient.
   *
   * The default value is 256.
   */
  sampleRate?: number;
}
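
// Worked example of the `numSubVectors` default described above (illustrative only;
// `defaultNumSubVectors` is not part of the public API, it just mirrors the documented rule):
//
//   function defaultNumSubVectors(dimension: number): number {
//     if (dimension % 16 === 0) return dimension / 16; // e.g. 1536 -> 96 sub-vectors
//     if (dimension % 8 === 0) return dimension / 8; // e.g. 760 -> 95 sub-vectors
//     return 1; // not ideal; likely poor performance
//   }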

/**
 * Options to create an `HNSW_PQ` index
 */
export interface HnswPqOptions {
  /**
   * The distance metric used to train the index.
   *
   * Default value is "l2".
   *
   * The following distance types are available:
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   */
  distanceType?: "l2" | "cosine" | "dot";

  /**
   * The number of IVF partitions to create.
   *
   * For HNSW, we recommend a small number of partitions. Setting this to 1 works
   * well for most tables. For very large tables, training just one HNSW graph
   * will require too much memory. Each partition becomes its own HNSW graph, so
   * setting this value higher reduces the peak memory use of training.
   */
  numPartitions?: number;

  /**
   * Number of sub-vectors of PQ.
   *
   * This value controls how much the vector is compressed during the quantization step.
   * The more sub-vectors there are the less the vector is compressed. The default is
   * the dimension of the vector divided by 16. If the dimension is not evenly divisible
   * by 16 we use the dimension divided by 8.
   *
   * The above two cases are highly preferred. Having 8 or 16 values per sub-vector allows
   * us to use efficient SIMD instructions.
   *
   * If the dimension is not divisible by 8 then we use 1 sub-vector. This is not ideal and
   * will likely result in poor performance.
   */
  numSubVectors?: number;

  /**
   * Max iterations to train kmeans.
   *
   * The default value is 50.
   *
   * When training an IVF index we use kmeans to calculate the partitions. This parameter
   * controls how many iterations of kmeans to run.
   *
   * Increasing this might improve the quality of the index but in most cases the parameter
   * is unused because kmeans will converge with fewer iterations. The parameter is only
   * used in cases where kmeans does not appear to converge. In those cases it is unlikely
   * that setting this larger will lead to the index converging anyway.
   */
  maxIterations?: number;

  /**
   * The rate used to calculate the number of training vectors for kmeans.
   *
   * Default value is 256.
   *
   * When an IVF index is trained, we need to calculate partitions. These are groups
   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
   *
   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
   * random sample of the data. This parameter controls the size of the sample. The total
   * number of vectors used to train the index is `sampleRate * numPartitions`.
   *
   * Increasing this value might improve the quality of the index but in most cases the
   * default should be sufficient.
   */
  sampleRate?: number;

  /**
   * The number of neighbors to select for each vector in the HNSW graph.
   *
   * The default value is 20.
   *
   * This value controls the tradeoff between search speed and accuracy.
   * The higher the value the more accurate the search but the slower it will be.
   */
  m?: number;

  /**
   * The number of candidates to evaluate during the construction of the HNSW graph.
   *
   * The default value is 300.
   *
   * This value controls the tradeoff between build speed and accuracy.
   * The higher the value the more accurate the build but the slower it will be.
   * 150 to 300 is the typical range. 100 is a minimum for good quality search
   * results. In most cases, there is no benefit to setting this higher than 500.
   * This value should be set to a value that is not less than `ef` in the search phase.
   */
  efConstruction?: number;
}

/**
 * Options to create an `HNSW_SQ` index
 */
export interface HnswSqOptions {
  /**
   * The distance metric used to train the index.
   *
   * Default value is "l2".
   *
   * The following distance types are available:
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   */
  distanceType?: "l2" | "cosine" | "dot";

  /**
   * The number of IVF partitions to create.
   *
   * For HNSW, we recommend a small number of partitions. Setting this to 1 works
   * well for most tables. For very large tables, training just one HNSW graph
   * will require too much memory. Each partition becomes its own HNSW graph, so
   * setting this value higher reduces the peak memory use of training.
   */
  numPartitions?: number;

  /**
   * Max iterations to train kmeans.
   *
   * The default value is 50.
   *
   * When training an IVF index we use kmeans to calculate the partitions. This parameter
   * controls how many iterations of kmeans to run.
   *
   * Increasing this might improve the quality of the index but in most cases the parameter
   * is unused because kmeans will converge with fewer iterations. The parameter is only
   * used in cases where kmeans does not appear to converge. In those cases it is unlikely
   * that setting this larger will lead to the index converging anyway.
   */
  maxIterations?: number;

  /**
   * The rate used to calculate the number of training vectors for kmeans.
   *
   * Default value is 256.
   *
   * When an IVF index is trained, we need to calculate partitions. These are groups
   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
   *
   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
   * random sample of the data. This parameter controls the size of the sample. The total
   * number of vectors used to train the index is `sampleRate * numPartitions`.
   *
   * Increasing this value might improve the quality of the index but in most cases the
   * default should be sufficient.
   */
  sampleRate?: number;

  /**
   * The number of neighbors to select for each vector in the HNSW graph.
   *
   * The default value is 20.
   *
   * This value controls the tradeoff between search speed and accuracy.
   * The higher the value the more accurate the search but the slower it will be.
   */
  m?: number;

  /**
   * The number of candidates to evaluate during the construction of the HNSW graph.
   *
   * The default value is 300.
   *
   * This value controls the tradeoff between build speed and accuracy.
   * The higher the value the more accurate the build but the slower it will be.
   * 150 to 300 is the typical range. 100 is a minimum for good quality search
   * results. In most cases, there is no benefit to setting this higher than 500.
   * This value should be set to a value that is not less than `ef` in the search phase.
   */
  efConstruction?: number;
}

/**
 * Options to create an `IVF_FLAT` index
 */
export interface IvfFlatOptions {
  /**
   * The number of IVF partitions to create.
   *
   * This value should generally scale with the number of rows in the dataset.
   * By default the number of partitions is the square root of the number of
   * rows.
   *
   * If this value is too large then the first part of the search (picking the
   * right partition) will be slow. If this value is too small then the second
   * part of the search (searching within a partition) will be slow.
   */
  numPartitions?: number;

  /**
   * Distance type to use to build the index.
   *
   * Default value is "l2".
   *
   * This is used when training the index to calculate the IVF partitions
   * (vectors are grouped in partitions with similar vectors according to this
   * distance type).
   *
   * The distance type used to train an index MUST match the distance type used
   * to search the index. Failure to do so will yield inaccurate results.
   *
   * The following distance types are available:
   *
   * "l2" - Euclidean distance. This is a very common distance metric that
   * accounts for both magnitude and direction when determining the distance
   * between vectors. l2 distance has a range of [0, ∞).
   *
   * "cosine" - Cosine distance. Cosine distance is a distance metric
   * calculated from the cosine similarity between two vectors. Cosine
   * similarity is a measure of similarity between two non-zero vectors of an
   * inner product space. It is defined to equal the cosine of the angle
   * between them. Unlike l2, the cosine distance is not affected by the
   * magnitude of the vectors. Cosine distance has a range of [0, 2].
   *
   * Note: the cosine distance is undefined when one (or both) of the vectors
   * are all zeros (there is no direction). These vectors are invalid and may
   * never be returned from a vector search.
   *
   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
   *
   * "hamming" - Hamming distance. Hamming distance is a distance metric
   * calculated from the number of bits that are different between two vectors.
   * Hamming distance has a range of [0, dimension]. Note that the hamming distance
   * is only valid for binary vectors.
   */
  distanceType?: "l2" | "cosine" | "dot" | "hamming";

  /**
   * Max iterations to train IVF kmeans.
   *
   * When training an IVF FLAT index we use kmeans to calculate the partitions. This parameter
   * controls how many iterations of kmeans to run.
   *
   * Increasing this might improve the quality of the index but in most cases these extra
   * iterations have diminishing returns.
   *
   * The default value is 50.
   */
  maxIterations?: number;

  /**
   * The number of vectors, per partition, to sample when training IVF kmeans.
   *
   * When an IVF FLAT index is trained, we need to calculate partitions. These are groups
   * of vectors that are similar to each other. To do this we use an algorithm called kmeans.
   *
   * Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
   * random sample of the data. This parameter controls the size of the sample. The total
   * number of vectors used to train the index is `sampleRate * numPartitions`.
   *
   * Increasing this value might improve the quality of the index but in most cases the
   * default should be sufficient.
   *
   * The default value is 256.
   */
  sampleRate?: number;
}
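
// Hedged sketch of an IVF_FLAT index over binary vectors using hamming distance.
// It assumes a table `tbl` opened elsewhere whose "hash" column stores binary
// vectors, and a `createIndex(column, options)` method that accepts these
// IndexOptions; the names are illustrative, not part of this file.
//
//   await tbl.createIndex("hash", {
//     config: Index.ivfFlat({ distanceType: "hamming", numPartitions: 128 }),
//   });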

/**
 * Options to create a full text search index
 */
export interface FtsOptions {
  /**
   * Whether to build the index with positions.
   * True by default.
   * If set to false, the index will not store the positions of the tokens in the text,
   * which will make the index smaller and faster to build, but will not support phrase queries.
   */
  withPosition?: boolean;

  /**
   * The tokenizer to use when building the index.
   * The default is "simple".
   *
   * The following tokenizers are available:
   *
   * "simple" - Simple tokenizer. This tokenizer splits the text into tokens using whitespace and punctuation as a delimiter.
   *
   * "whitespace" - Whitespace tokenizer. This tokenizer splits the text into tokens using whitespace as a delimiter.
   *
   * "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
   */
  baseTokenizer?: "simple" | "whitespace" | "raw";

  /**
   * Language for stemming and stop words.
   * This is only used when `stem` or `removeStopWords` is true.
   */
  language?: string;

  /**
   * Maximum token length.
   * Tokens longer than this length will be ignored.
   */
  maxTokenLength?: number;

  /**
   * Whether to lowercase tokens.
   */
  lowercase?: boolean;

  /**
   * Whether to stem tokens.
   */
  stem?: boolean;

  /**
   * Whether to remove stop words.
   */
  removeStopWords?: boolean;

  /**
   * Whether to apply ASCII folding, converting accented characters to their
   * ASCII equivalents (for example "é" becomes "e") before indexing.
   */
  asciiFolding?: boolean;
}

export class Index {
  private readonly inner: LanceDbIndex;
  private constructor(inner: LanceDbIndex) {
    this.inner = inner;
  }

  /**
   * Create an IvfPq index
   *
   * This index stores a compressed (quantized) copy of every vector. These vectors
   * are grouped into partitions of similar vectors. Each partition keeps track of
   * a centroid which is the average value of all vectors in the group.
   *
   * During a query the centroids are compared with the query vector to find the closest
   * partitions. The compressed vectors in these partitions are then searched to find
   * the closest vectors.
   *
   * The compression scheme is called product quantization. Each vector is divided into
   * sub-vectors and then each sub-vector is quantized into a small number of bits. The
   * parameters `numBits` and `numSubVectors` control this process, providing a tradeoff
   * between index size (and thus search speed) and index accuracy.
   *
   * The partitioning process is called IVF and the `numPartitions` parameter controls how
   * many groups to create.
   *
   * Note that training an IVF PQ index on a large dataset is a slow operation and
   * currently is also a memory intensive operation.
   */
  static ivfPq(options?: Partial<IvfPqOptions>) {
    return new Index(
      LanceDbIndex.ivfPq(
        options?.distanceType,
        options?.numPartitions,
        options?.numSubVectors,
        options?.maxIterations,
        options?.sampleRate,
      ),
    );
  }
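
  // Hedged usage sketch: assumes a table `tbl` opened elsewhere with a vector
  // column named "vector" and a `createIndex(column, options)` method that
  // accepts these IndexOptions; the names are illustrative.
  //
  //   await tbl.createIndex("vector", {
  //     config: Index.ivfPq({
  //       numPartitions: 256,
  //       numSubVectors: 96,
  //       distanceType: "cosine",
  //     }),
  //   });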

  /**
   * Create an IvfFlat index
   *
   * This index groups vectors into partitions of similar vectors. Each partition keeps track of
   * a centroid which is the average value of all vectors in the group.
   *
   * During a query the centroids are compared with the query vector to find the closest
   * partitions. The vectors in these partitions are then searched to find
   * the closest vectors.
   *
   * The partitioning process is called IVF and the `numPartitions` parameter controls how
   * many groups to create.
   *
   * Note that training an IVF FLAT index on a large dataset is a slow operation and
   * currently is also a memory intensive operation.
   */
  static ivfFlat(options?: Partial<IvfFlatOptions>) {
    return new Index(
      LanceDbIndex.ivfFlat(
        options?.distanceType,
        options?.numPartitions,
        options?.maxIterations,
        options?.sampleRate,
      ),
    );
  }

  /**
   * Create a btree index
   *
   * A btree index is an index on a scalar column. The index stores a copy of the column
   * in sorted order. A header entry is created for each block of rows (currently the
   * block size is fixed at 4096). These header entries are stored in a separate
   * cacheable structure (a btree). To search for data the header is used to determine
   * which blocks need to be read from disk.
   *
   * For example, a btree index in a table with 1Bi rows requires sizeof(Scalar) * 256Ki
   * bytes of memory and will generally need to read sizeof(Scalar) * 4096 bytes to find
   * the correct row ids.
   *
   * This index is good for scalar columns with mostly distinct values and does best when
   * the query is highly selective.
   *
   * The btree index does not currently have any parameters, though parameters such as the
   * block size may be added in the future.
   */
  static btree() {
    return new Index(LanceDbIndex.btree());
  }
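
  // Hedged usage sketch: assumes a table `tbl` opened elsewhere with an integer
  // "id" column and a `createIndex(column, options)` method; names are illustrative.
  //
  //   await tbl.createIndex("id", { config: Index.btree() });
  //
  // A highly selective filter such as `id = 12345` can then use the index instead
  // of scanning the whole column.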

  /**
   * Create a bitmap index.
   *
   * A `Bitmap` index stores, for each distinct value in the column, a bitmap of the
   * rows that contain it.
   *
   * This index works best for low-cardinality columns, where the number of unique values
   * is small (i.e., at most a few hundred).
   */
  static bitmap() {
    return new Index(LanceDbIndex.bitmap());
  }

  /**
   * Create a label list index.
   *
   * A `LabelList` index is a scalar index that can be used on `List<T>` columns to
   * support queries with `array_contains_all` and `array_contains_any`
   * using an underlying bitmap index.
   */
  static labelList() {
    return new Index(LanceDbIndex.labelList());
  }
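
  // Hedged usage sketch: assumes a table `tbl` opened elsewhere with a `List<Utf8>`
  // column named "tags" and a `createIndex(column, options)` method; names are illustrative.
  //
  //   await tbl.createIndex("tags", { config: Index.labelList() });
  //
  // Filters that use `array_contains_any` or `array_contains_all` on "tags" can then
  // be served by the underlying bitmap index.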

  /**
   * Create a full text search index
   *
   * A full text search index is an index on a string column, so that you can conduct full
   * text searches on the column.
   *
   * The results of a full text search are ordered by relevance measured by BM25.
   *
   * You can combine filters with full text search.
   */
  static fts(options?: Partial<FtsOptions>) {
    return new Index(
      LanceDbIndex.fts(
        options?.withPosition,
        options?.baseTokenizer,
        options?.language,
        options?.maxTokenLength,
        options?.lowercase,
        options?.stem,
        options?.removeStopWords,
        options?.asciiFolding,
      ),
    );
  }
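
  // Hedged usage sketch: assumes a table `tbl` opened elsewhere with a string
  // column named "text" and a `createIndex(column, options)` method; names are illustrative.
  //
  //   await tbl.createIndex("text", {
  //     config: Index.fts({ stem: true, removeStopWords: true, lowercase: true }),
  //   });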

  /**
   * Create an HNSW_PQ index
   *
   * HNSW-PQ stands for Hierarchical Navigable Small World - Product Quantization.
   * It is a variant of the HNSW algorithm that uses product quantization to compress
   * the vectors.
   */
  static hnswPq(options?: Partial<HnswPqOptions>) {
    return new Index(
      LanceDbIndex.hnswPq(
        options?.distanceType,
        options?.numPartitions,
        options?.numSubVectors,
        options?.maxIterations,
        options?.sampleRate,
        options?.m,
        options?.efConstruction,
      ),
    );
  }

  /**
   * Create an HNSW_SQ index
   *
   * HNSW-SQ stands for Hierarchical Navigable Small World - Scalar Quantization.
   * It is a variant of the HNSW algorithm that uses scalar quantization to compress
   * the vectors.
   */
  static hnswSq(options?: Partial<HnswSqOptions>) {
    return new Index(
      LanceDbIndex.hnswSq(
        options?.distanceType,
        options?.numPartitions,
        options?.maxIterations,
        options?.sampleRate,
        options?.m,
        options?.efConstruction,
      ),
    );
  }
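
  // Hedged usage sketch: assumes a table `tbl` opened elsewhere with a vector
  // column named "vector" and a `createIndex(column, options)` method; names are illustrative.
  //
  //   await tbl.createIndex("vector", {
  //     config: Index.hnswSq({ m: 32, efConstruction: 300, distanceType: "cosine" }),
  //   });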
}

export interface IndexOptions {
  /**
   * Advanced index configuration
   *
   * This option allows you to specify a specific index to create and also
   * allows you to pass in configuration for training the index.
   *
   * See the static methods on Index for details on the various index types.
   *
   * If this is not supplied then column data type(s) and column statistics
   * will be used to determine the most useful kind of index to create.
   */
  config?: Index;
  /**
   * Whether to replace the existing index
   *
   * If this is false, and another index already exists on the same columns
   * and the same name, then an error will be returned. This is true even if
   * that index is out of date.
   *
   * The default is true.
   */
  replace?: boolean;

  /**
   * Timeout, in seconds, to wait for the newly created index to be fully built
   * before returning. If the index is not ready within this time the operation
   * fails with a timeout error. If unset, index creation is not awaited.
   */
  waitTimeoutSeconds?: number;
}
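
// Hedged end-to-end sketch tying IndexOptions together. `connect`, `openTable`
// and `createIndex` live outside this file, and the database path, table name
// and column name are illustrative.
//
//   import * as lancedb from "@lancedb/lancedb";
//
//   const db = await lancedb.connect("/tmp/lancedb");
//   const tbl = await db.openTable("my_table");
//   await tbl.createIndex("vector", {
//     config: Index.ivfPq({ numPartitions: 256 }),
//     replace: true,
//     // Fail with a timeout error if the index is not fully built within 10 minutes.
//     waitTimeoutSeconds: 600,
//   });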