Files
lancedb/nodejs/__test__/rerankers.test.ts
Bert c9f248b058 feat: add hybrid search to node and rust SDKs (#1940)
Support hybrid search in both rust and node SDKs.

- Adds a new rerankers package to rust LanceDB, with the implementation
of the default RRF reranker
- Adds a new hybrid package to lancedb, with some helper methods related
to hybrid search, such as normalizing scores and converting score columns
to rank columns
- Adds the capability for LanceDB's VectorQuery to perform hybrid search
when it has both nearest-vector and full-text search parameters.
- Adds wrappers for reranker implementations to nodejs SDK.

Additional rerankers will be added in follow-up PRs.

https://github.com/lancedb/lancedb/issues/1921

---
Notes about how the rust rerankers are wrapped for calling from JS:

I wanted to keep the core reranker logic, and the invocation of the
reranker by the query code, in Rust. This aligns with the philosophy of
the new Node SDK, which is just a thin wrapper around Rust. However, I
also wanted to support users who want to add custom rerankers
written in JavaScript.

When we add a reranker to the query from JavaScript, the SDK adds a special
Rust reranker that has a callback to the JavaScript code (which could
then turn around and call an underlying Rust reranker implementation if
desired). This adds a bit of complexity, but overall I think it moves us
in the right direction of having the majority of the query logic in the
underlying Rust SDK while keeping the option open to support custom
JavaScript rerankers.
2024-12-30 09:03:41 -05:00

80 lines
2.2 KiB
TypeScript

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import { RecordBatch } from "apache-arrow";
import * as tmp from "tmp";
import { Connection, Index, Table, connect, makeArrowTable } from "../lancedb";
import { RRFReranker } from "../lancedb/rerankers";
/**
 * Tests for attaching rerankers to hybrid (vector + full-text) queries.
 *
 * Every test runs against a fresh two-row table stored in its own temporary
 * directory; the directory is removed again once the test has finished.
 */
describe("rerankers", function () {
  let tmpDir: tmp.DirResult;
  let conn: Connection;
  let table: Table;

  beforeEach(async () => {
    // `unsafeCleanup` lets the directory be removed even though it is not empty.
    tmpDir = tmp.dirSync({ unsafeCleanup: true });
    conn = await connect(tmpDir.name);
    table = await conn.createTable("mytable", [
      { vector: [0.1, 0.1], text: "dog" },
      { vector: [0.2, 0.2], text: "cat" },
    ]);

    // The tests below call `fullTextSearch` on the "text" column, so give it
    // a full-text-search index up front.
    await table.createIndex("text", {
      config: Index.fts(),
      replace: true,
    });
  });

  afterEach(() => {
    // Without this, every test leaves its temporary database directory behind.
    tmpDir.removeCallback();
  });

  it("will query with the custom reranker", async function () {
    const expectedResult = [
      {
        text: "albert",
        // biome-ignore lint/style/useNamingConvention: this is the lance field name
        _relevance_score: 0.99,
      },
    ];

    /** Reranker stub that ignores its inputs and always returns `expectedResult`. */
    class MyCustomReranker {
      async rerankHybrid(
        _query: string,
        _vecResults: RecordBatch,
        _ftsResults: RecordBatch,
      ): Promise<RecordBatch> {
        // no reranker logic, just return some static data
        const staticTable = makeArrowTable(expectedResult);
        return staticTable.batches[0];
      }
    }

    const rows = await table
      .query()
      .nearestTo([0.1, 0.1])
      .fullTextSearch("dog")
      .rerank(new MyCustomReranker())
      .select(["text"])
      .limit(5)
      .toArray();

    // Round-trip through JSON to convert each StructRow into a plain object.
    const plainRows: unknown = JSON.parse(JSON.stringify(rows));

    // The query must return exactly what the custom reranker produced.
    expect(plainRows).toEqual(expectedResult);
  });

  it("will query with RRFReranker", async function () {
    // Smoke test: checks that the TypeScript wrapper around the Rust RRF
    // reranker is wired up correctly.
    const rows = await table
      .query()
      .nearestTo([0.1, 0.1])
      .fullTextSearch("dog")
      .rerank(await RRFReranker.create())
      .select(["text"])
      .limit(5)
      .toArray();

    expect(rows).toHaveLength(2);
  });
});