mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-25 22:29:58 +00:00
BREAKING CHANGE: embedding function implementations in Node need to now call `resolveVariables()` in their constructors and should **not** implement `toJSON()`. This tries to address the handling of secrets. In Node, they are currently lost. In Python, they are currently leaked into the table schema metadata. This PR introduces an in-memory variable store on the function registry. It also allows embedding function definitions to label certain config values as "sensitive", and the preprocessing logic will raise an error if users try to pass in hard-coded values. Closes #2110 Closes #521 --------- Co-authored-by: Weston Pace <weston.pace@gmail.com>
103 lines
2.6 KiB
TypeScript
103 lines
2.6 KiB
TypeScript
// SPDX-License-Identifier: Apache-2.0
|
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
|
|
import type OpenAI from "openai";
|
|
import type { EmbeddingCreateParams } from "openai/resources/index";
|
|
import { Float, Float32 } from "../arrow";
|
|
import { EmbeddingFunction } from "./embedding_function";
|
|
import { register } from "./registry";
|
|
|
|
/**
 * Configuration accepted by {@link OpenAIEmbeddingFunction}.
 */
export type OpenAIOptions = {
  /** API key handed to the OpenAI client when it is constructed. */
  apiKey: string;
  /** Embedding model identifier, typed after the OpenAI SDK's `model` request field. */
  model: EmbeddingCreateParams["model"];
};
|
|
|
|
/**
 * Embedding function backed by the OpenAI embeddings endpoint.
 *
 * Registered under the name "openai". The `openai` package is loaded lazily
 * inside the constructor, so it only has to be installed when this class is
 * actually instantiated.
 */
@register("openai")
export class OpenAIEmbeddingFunction extends EmbeddingFunction<
  string,
  Partial<OpenAIOptions>
> {
  /** OpenAI client instance used for every embeddings request. */
  #openai: OpenAI;
  /** Model name sent with every embeddings request. */
  #modelName: OpenAIOptions["model"];

  /**
   * @param optionsRaw - Optional settings. `apiKey` falls back to the
   *   `OPENAI_API_KEY` environment variable and `model` falls back to
   *   "text-embedding-ada-002".
   * @throws Error if no API key is available from either source.
   * @throws Error if the `openai` package cannot be loaded.
   */
  constructor(
    optionsRaw: Partial<OpenAIOptions> = {
      model: "text-embedding-ada-002",
    },
  ) {
    super();
    // resolveVariables() comes from the base class (not visible here); the
    // resolved options, not the raw ones, are what get read below.
    const options = this.resolveVariables(optionsRaw);

    const openAIKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
    if (!openAIKey) {
      throw new Error("OpenAI API key is required");
    }
    const modelName = options?.model ?? "text-embedding-ada-002";

    // Loaded with require() at construction time rather than a top-level
    // import, so a missing package surfaces as the install hint below.
    // eslint-disable-next-line @typescript-eslint/naming-convention
    let Openai;
    try {
      // eslint-disable-next-line @typescript-eslint/no-var-requires
      Openai = require("openai");
    } catch {
      throw new Error("please install openai@^4.24.1 using npm install openai");
    }

    this.#openai = new Openai({ apiKey: openAIKey });
    this.#modelName = modelName;
  }

  /**
   * Names of the option keys that hold secrets.
   *
   * NOTE(review): presumably consumed by the base class when it processes
   * options; confirm against `EmbeddingFunction`.
   */
  protected getSensitiveKeys(): string[] {
    return ["apiKey"];
  }

  /**
   * Vector length produced by the configured model.
   *
   * @returns 1536 for "text-embedding-ada-002" and "text-embedding-3-small",
   *   3072 for "text-embedding-3-large".
   * @throws Error for any other model name.
   */
  ndims(): number {
    switch (this.#modelName) {
      case "text-embedding-ada-002":
        return 1536;
      case "text-embedding-3-large":
        return 3072;
      case "text-embedding-3-small":
        return 1536;
      default:
        throw new Error(`Unknown model: ${this.#modelName}`);
    }
  }

  /** Arrow data type used for the embedding values (32-bit float). */
  embeddingDataType(): Float {
    return new Float32();
  }

  /**
   * Embeds a batch of source strings with a single embeddings request.
   *
   * @param data - Strings to embed.
   * @returns One embedding per entry of the response, in response order.
   *   An empty `data` array yields an empty result without any request.
   */
  async computeSourceEmbeddings(data: string[]): Promise<number[][]> {
    // Nothing to embed: skip the request instead of sending an empty input.
    if (data.length === 0) {
      return [];
    }

    const response = await this.#openai.embeddings.create({
      model: this.#modelName,
      input: data,
    });

    const embeddings: number[][] = [];
    for (const item of response.data) {
      embeddings.push(item.embedding);
    }
    return embeddings;
  }

  /**
   * Embeds a single query string.
   *
   * @param data - The query text.
   * @returns The embedding of the first (only) entry in the response.
   * @throws Error if `data` is not a string at runtime.
   */
  async computeQueryEmbeddings(data: string): Promise<number[]> {
    // Runtime guard: callers from untyped code may pass a non-string value.
    if (typeof data !== "string") {
      throw new Error("Data must be a string");
    }
    const response = await this.#openai.embeddings.create({
      model: this.#modelName,
      input: data,
    });

    return response.data[0].embedding;
  }
}
|