mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-10 13:52:58 +00:00
feat!: migrate FTS from tantivy to lance-index (#1483)
Lance now supports FTS, so add it into lancedb Python, TypeScript and Rust SDKs. For Python, we still use tantivy based FTS by default because the lance FTS index now misses some features of tantivy. For Python: - Support to create lance based FTS index - Support to specify columns for full text search (only available for lance based FTS index) For TypeScript: - Change the search method so that it can accept both string and vector - Support full text search For Rust - Support full text search The others: - Update the FTS doc BREAKING CHANGE: - for Python, this renames the attached score column of FTS from "score" to "_score", this could be a breaking change for users that rely the scores --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
@@ -37,6 +37,13 @@ interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
||||
export class EmbeddingFunctionRegistry {
|
||||
#functions = new Map<string, EmbeddingFunctionConstructor>();
|
||||
|
||||
/**
|
||||
* Get the number of registered functions
|
||||
*/
|
||||
length() {
|
||||
return this.#functions.size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Register an embedding function
|
||||
* @param name The name of the function
|
||||
|
||||
@@ -175,6 +175,22 @@ export class Index {
|
||||
static btree() {
|
||||
return new Index(LanceDbIndex.btree());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a full text search index
|
||||
*
|
||||
* A full text search index is an index on a string column, so that you can conduct full
|
||||
* text searches on the column.
|
||||
*
|
||||
* The results of a full text search are ordered by relevance measured by BM25.
|
||||
*
|
||||
* You can combine filters with full text search.
|
||||
*
|
||||
* For now, the full text search index only supports English, and doesn't support phrase search.
|
||||
*/
|
||||
static fts() {
|
||||
return new Index(LanceDbIndex.fts());
|
||||
}
|
||||
}
|
||||
|
||||
export interface IndexOptions {
|
||||
|
||||
@@ -88,6 +88,19 @@ export interface QueryExecutionOptions {
|
||||
maxBatchLength?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options that control the behavior of a full text search
|
||||
*/
|
||||
export interface FullTextSearchOptions {
|
||||
/**
|
||||
* The columns to search
|
||||
*
|
||||
* If not specified, all indexed columns will be searched.
|
||||
* For now, only one column can be searched.
|
||||
*/
|
||||
columns?: string | string[];
|
||||
}
|
||||
|
||||
/** Common methods supported by all query types */
|
||||
export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
implements AsyncIterable<RecordBatch>
|
||||
@@ -134,6 +147,25 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
return this.where(predicate);
|
||||
}
|
||||
|
||||
fullTextSearch(
|
||||
query: string,
|
||||
options?: Partial<FullTextSearchOptions>,
|
||||
): this {
|
||||
let columns = null;
|
||||
if (options) {
|
||||
if (typeof options.columns === "string") {
|
||||
columns = [options.columns];
|
||||
} else if (Array.isArray(options.columns)) {
|
||||
columns = options.columns;
|
||||
}
|
||||
}
|
||||
|
||||
this.doCall((inner: NativeQueryType) =>
|
||||
inner.fullTextSearch(query, columns),
|
||||
);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return only the specified columns.
|
||||
*
|
||||
|
||||
@@ -270,22 +270,23 @@ export abstract class Table {
|
||||
* @returns {Query} A builder that can be used to parameterize the query
|
||||
*/
|
||||
abstract query(): Query;
|
||||
|
||||
/**
|
||||
* Create a search query to find the nearest neighbors
|
||||
* of the given query vector
|
||||
* @param {string} query - the query. This will be converted to a vector using the table's provided embedding function
|
||||
* @note If no embedding functions are defined in the table, this will error when collecting the results.
|
||||
* of the given query
|
||||
* @param {string | IntoVector} query - the query, a vector or string
|
||||
* @param {string} queryType - the type of the query, "vector", "fts", or "auto"
|
||||
* @param {string | string[]} ftsColumns - the columns to search in for full text search
|
||||
* for now, only one column can be searched at a time.
|
||||
*
|
||||
* This is just a convenience method for calling `.query().nearestTo(await myEmbeddingFunction(query))`
|
||||
* when "auto" is used, if the query is a string and an embedding function is defined, it will be treated as a vector query
|
||||
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
|
||||
*/
|
||||
abstract search(query: string): VectorQuery;
|
||||
/**
|
||||
* Create a search query to find the nearest neighbors
|
||||
* of the given query vector
|
||||
* @param {IntoVector} query - the query vector
|
||||
* This is just a convenience method for calling `.query().nearestTo(query)`
|
||||
*/
|
||||
abstract search(query: IntoVector): VectorQuery;
|
||||
abstract search(
|
||||
query: string | IntoVector,
|
||||
queryType?: string,
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query;
|
||||
/**
|
||||
* Search the table with a given query vector.
|
||||
*
|
||||
@@ -581,27 +582,50 @@ export class LocalTable extends Table {
|
||||
query(): Query {
|
||||
return new Query(this.inner);
|
||||
}
|
||||
search(query: string | IntoVector): VectorQuery {
|
||||
if (typeof query !== "string") {
|
||||
return this.vectorSearch(query);
|
||||
} else {
|
||||
const queryPromise = this.getEmbeddingFunctions().then(
|
||||
async (functions) => {
|
||||
// TODO: Support multiple embedding functions
|
||||
const embeddingFunc: EmbeddingFunctionConfig | undefined = functions
|
||||
.values()
|
||||
.next().value;
|
||||
if (!embeddingFunc) {
|
||||
return Promise.reject(
|
||||
new Error("No embedding functions are defined in the table"),
|
||||
);
|
||||
}
|
||||
return await embeddingFunc.function.computeQueryEmbeddings(query);
|
||||
},
|
||||
);
|
||||
|
||||
return this.query().nearestTo(queryPromise);
|
||||
search(
|
||||
query: string | IntoVector,
|
||||
queryType: string = "auto",
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query {
|
||||
if (typeof query !== "string") {
|
||||
if (queryType === "fts") {
|
||||
throw new Error("Cannot perform full text search on a vector query");
|
||||
}
|
||||
return this.vectorSearch(query);
|
||||
}
|
||||
|
||||
// If the query is a string, we need to determine if it is a vector query or a full text search query
|
||||
if (queryType === "fts") {
|
||||
return this.query().fullTextSearch(query, {
|
||||
columns: ftsColumns,
|
||||
});
|
||||
}
|
||||
|
||||
// The query type is auto or vector
|
||||
// fall back to full text search if no embedding functions are defined and the query is a string
|
||||
if (queryType === "auto" && getRegistry().length() === 0) {
|
||||
return this.query().fullTextSearch(query, {
|
||||
columns: ftsColumns,
|
||||
});
|
||||
}
|
||||
|
||||
const queryPromise = this.getEmbeddingFunctions().then(
|
||||
async (functions) => {
|
||||
// TODO: Support multiple embedding functions
|
||||
const embeddingFunc: EmbeddingFunctionConfig | undefined = functions
|
||||
.values()
|
||||
.next().value;
|
||||
if (!embeddingFunc) {
|
||||
return Promise.reject(
|
||||
new Error("No embedding functions are defined in the table"),
|
||||
);
|
||||
}
|
||||
return await embeddingFunc.function.computeQueryEmbeddings(query);
|
||||
},
|
||||
);
|
||||
|
||||
return this.query().nearestTo(queryPromise);
|
||||
}
|
||||
|
||||
vectorSearch(vector: IntoVector): VectorQuery {
|
||||
|
||||
Reference in New Issue
Block a user