mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-24 07:20:40 +00:00
feat!: add variable store to embeddings registry (#2112)
BREAKING CHANGE: embedding function implementations in Node need to now call `resolveVariables()` in their constructors and should **not** implement `toJSON()`. This tries to address the handling of secrets. In Node, they are currently lost. In Python, they are currently leaked into the table schema metadata. This PR introduces an in-memory variable store on the function registry. It also allows embedding function definitions to label certain config values as "sensitive", and the preprocessing logic will raise an error if users try to pass in hard-coded values. Closes #2110 Closes #521 --------- Co-authored-by: Weston Pace <weston.pace@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ import {
|
||||
newVectorType,
|
||||
} from "../arrow";
|
||||
import { sanitizeType } from "../sanitize";
|
||||
import { getRegistry } from "./registry";
|
||||
|
||||
/**
|
||||
* Options for a given embedding function
|
||||
@@ -32,6 +33,22 @@ export interface EmbeddingFunctionConstructor<
|
||||
|
||||
/**
|
||||
* An embedding function that automatically creates vector representation for a given column.
|
||||
*
|
||||
* It's important that subclasses call `super()` and then pass the **original**
* options they received to `resolveVariables`, using only the resolved result
* afterwards, so that any `$var:` references are substituted before use.
|
||||
*
|
||||
* @example
|
||||
* ```ts
|
||||
* class MyEmbeddingFunction extends EmbeddingFunction {
|
||||
* constructor(optionsRaw: {model: string, timeout: number}) {
* super();
|
||||
* const options = this.resolveVariables(optionsRaw);
|
||||
* this.model = options.model;
|
||||
* this.timeout = options.timeout;
|
||||
* }
|
||||
* }
|
||||
* ```
|
||||
*/
|
||||
export abstract class EmbeddingFunction<
|
||||
// biome-ignore lint/suspicious/noExplicitAny: we don't know what the implementor will do
|
||||
@@ -44,33 +61,74 @@ export abstract class EmbeddingFunction<
|
||||
*/
|
||||
// biome-ignore lint/style/useNamingConvention: we want to keep the name as it is
|
||||
readonly TOptions!: M;
|
||||
/**
|
||||
* Convert the embedding function to a JSON object
|
||||
* It is used to serialize the embedding function to the schema
|
||||
* It's important that any object returned by this method contains all the necessary
|
||||
* information to recreate the embedding function
|
||||
*
|
||||
* It should return the same object that was passed to the constructor
|
||||
* If it does not, the embedding function will not be able to be recreated, or could be recreated incorrectly
|
||||
*
|
||||
* @example
|
||||
* ```ts
|
||||
* class MyEmbeddingFunction extends EmbeddingFunction {
|
||||
* constructor(options: {model: string, timeout: number}) {
|
||||
* super();
|
||||
* this.model = options.model;
|
||||
* this.timeout = options.timeout;
|
||||
* }
|
||||
* toJSON() {
|
||||
* return {
|
||||
* model: this.model,
|
||||
* timeout: this.timeout,
|
||||
* };
|
||||
* }
|
||||
* ```
|
||||
*/
|
||||
abstract toJSON(): Partial<M>;
|
||||
|
||||
#config: Partial<M>;
|
||||
|
||||
/**
 * Serialize this embedding function's configuration.
 *
 * Returns a copy, produced by a JSON round trip, of the options most
 * recently stored by `resolveVariables` (or `{}` if it was never called).
 * Because the stored options are the unresolved ones, `$var:` references
 * are emitted as written rather than as their resolved values.
 */
// biome-ignore lint/suspicious/noExplicitAny :
toJSON(): Record<string, any> {
  const serialized = JSON.stringify(this.#config);
  return JSON.parse(serialized);
}
|
||||
|
||||
/**
 * Start with an empty stored configuration; `resolveVariables` replaces it
 * with the raw options it is given.
 */
constructor() {
  this.#config = {};
}
|
||||
|
||||
/**
 * List the option keys that must be treated as sensitive. `resolveVariables`
 * rejects raw (non-`$var:`) values supplied for any of these keys.
 *
 * The base implementation declares no sensitive keys; subclasses override
 * this to name theirs.
 */
protected getSensitiveKeys(): string[] {
  const sensitiveKeys: string[] = [];
  return sensitiveKeys;
}
|
||||
|
||||
/**
 * Apply variables to the config.
 *
 * Stores `config` unchanged as the configuration that `toJSON()` serializes,
 * then returns a shallow copy in which every top-level string value of the
 * form `$var:name` or `$var:name:default` is replaced by the registry
 * variable `name`, or by `default` when that variable is unset or empty.
 * Everything after the first colon that follows the name is the default, so
 * a default may itself contain colons (e.g. `$var:url:http://localhost:80`).
 *
 * @param config - The raw options as passed to the subclass constructor.
 * @returns A copy of `config` with variable references substituted.
 * @throws Error if a key listed by `getSensitiveKeys()` holds a value that
 *   is set but is not a `$var:` reference, or if a referenced variable has
 *   no value and no default was supplied.
 */
protected resolveVariables(config: Partial<M>): Partial<M> {
  // Keep the caller's unresolved options so serialization never sees secrets.
  this.#config = config;

  const prefix = "$var:";
  const registry = getRegistry();
  // Loop-invariant: ask the subclass for its sensitive keys once.
  const sensitiveKeys = this.getSensitiveKeys();
  const newConfig = { ...config };

  for (const [key_, value] of Object.entries(newConfig)) {
    const isReference = typeof value === "string" && value.startsWith(prefix);

    if (!isReference) {
      // Only string values can be references. A sensitive key holding any
      // other set value (including numbers or objects) is a hard-coded
      // secret; null/undefined means the option was simply not provided.
      if (value != null && sensitiveKeys.includes(key_)) {
        throw new Error(
          `The key "${key_}" is sensitive and cannot be set directly. Please use the $var: syntax to set it.`,
        );
      }
      continue;
    }

    // Makes TS happy (https://stackoverflow.com/a/78391854)
    const key = key_ as keyof M;

    // Split on the FIRST colon only. `split(":", 2)` would discard anything
    // after a second colon and silently truncate the default value.
    const reference: string = value.slice(prefix.length);
    const separator = reference.indexOf(":");
    const name = separator === -1 ? reference : reference.slice(0, separator);
    const defaultValue =
      separator === -1 ? undefined : reference.slice(separator + 1);

    const variableValue = registry.getVar(name);
    if (variableValue) {
      // biome-ignore lint/suspicious/noExplicitAny:
      newConfig[key] = variableValue as any;
    } else if (defaultValue) {
      // biome-ignore lint/suspicious/noExplicitAny:
      newConfig[key] = defaultValue as any;
    } else {
      throw new Error(`Variable "${name}" not found`);
    }
  }

  return newConfig;
}
|
||||
|
||||
/**
|
||||
* Optionally load any resources needed for the embedding function.
|
||||
*
|
||||
* This method is called after the embedding function has been initialized
|
||||
* but before any embeddings are computed. It is useful for loading local models
|
||||
* or other resources that are needed for the embedding function to work.
|
||||
*/
|
||||
async init?(): Promise<void>;
|
||||
|
||||
/**
|
||||
|
||||
@@ -21,11 +21,13 @@ export class OpenAIEmbeddingFunction extends EmbeddingFunction<
|
||||
#modelName: OpenAIOptions["model"];
|
||||
|
||||
constructor(
|
||||
options: Partial<OpenAIOptions> = {
|
||||
optionsRaw: Partial<OpenAIOptions> = {
|
||||
model: "text-embedding-ada-002",
|
||||
},
|
||||
) {
|
||||
super();
|
||||
const options = this.resolveVariables(optionsRaw);
|
||||
|
||||
const openAIKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
|
||||
if (!openAIKey) {
|
||||
throw new Error("OpenAI API key is required");
|
||||
@@ -52,10 +54,8 @@ export class OpenAIEmbeddingFunction extends EmbeddingFunction<
|
||||
this.#modelName = modelName;
|
||||
}
|
||||
|
||||
toJSON() {
|
||||
return {
|
||||
model: this.#modelName,
|
||||
};
|
||||
protected getSensitiveKeys(): string[] {
|
||||
return ["apiKey"];
|
||||
}
|
||||
|
||||
ndims(): number {
|
||||
|
||||
@@ -23,6 +23,7 @@ export interface EmbeddingFunctionCreate<T extends EmbeddingFunction> {
|
||||
*/
|
||||
export class EmbeddingFunctionRegistry {
|
||||
#functions = new Map<string, EmbeddingFunctionConstructor>();
|
||||
#variables = new Map<string, string>();
|
||||
|
||||
/**
|
||||
* Get the number of registered functions
|
||||
@@ -82,10 +83,7 @@ export class EmbeddingFunctionRegistry {
|
||||
};
|
||||
} else {
|
||||
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
||||
create = function (options?: any) {
|
||||
const instance = new factory(options);
|
||||
return instance;
|
||||
};
|
||||
create = (options?: any) => new factory(options);
|
||||
}
|
||||
|
||||
return {
|
||||
@@ -164,6 +162,37 @@ export class EmbeddingFunctionRegistry {
|
||||
|
||||
return metadata;
|
||||
}
|
||||
|
||||
/**
 * Set a variable. Embedding function configurations refer to variables
 * with the syntax `$var:variable_name`. Referencing a variable that has
 * not been set raises an error naming the missing variable. To supply a
 * fallback, append it to the reference: `$var:variable_name:default_value`.
 * Defaults are meant for non-sensitive runtime settings, such as whether
 * to use a GPU for inference.
 *
 * The name must not contain colons. The default value can contain colons.
 *
 * @param name - Variable name; must be free of `:`.
 * @param value - Value to store under `name`, replacing any previous value.
 * @throws Error if `name` contains a colon.
 */
setVar(name: string, value: string): void {
  const colonFree = !name.includes(":");
  if (colonFree) {
    this.#variables.set(name, value);
    return;
  }
  throw new Error("Variable names cannot contain colons");
}
|
||||
|
||||
/**
 * Look up a variable previously stored with {@link setVar}.
 *
 * @param name - Variable name to look up.
 * @returns The stored value, or `undefined` if no variable has that name.
 */
getVar(name: string): string | undefined {
  const stored = this.#variables.get(name);
  return stored;
}
|
||||
}
|
||||
|
||||
const _REGISTRY = new EmbeddingFunctionRegistry();
|
||||
|
||||
@@ -44,11 +44,12 @@ export class TransformersEmbeddingFunction extends EmbeddingFunction<
|
||||
#ndims?: number;
|
||||
|
||||
constructor(
|
||||
options: Partial<XenovaTransformerOptions> = {
|
||||
optionsRaw: Partial<XenovaTransformerOptions> = {
|
||||
model: "Xenova/all-MiniLM-L6-v2",
|
||||
},
|
||||
) {
|
||||
super();
|
||||
const options = this.resolveVariables(optionsRaw);
|
||||
|
||||
const modelName = options?.model ?? "Xenova/all-MiniLM-L6-v2";
|
||||
this.#tokenizerOptions = {
|
||||
@@ -59,22 +60,6 @@ export class TransformersEmbeddingFunction extends EmbeddingFunction<
|
||||
this.#ndims = options.ndims;
|
||||
this.#modelName = modelName;
|
||||
}
|
||||
toJSON() {
|
||||
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
||||
const obj: Record<string, any> = {
|
||||
model: this.#modelName,
|
||||
};
|
||||
if (this.#ndims) {
|
||||
obj["ndims"] = this.#ndims;
|
||||
}
|
||||
if (this.#tokenizerOptions) {
|
||||
obj["tokenizerOptions"] = this.#tokenizerOptions;
|
||||
}
|
||||
if (this.#tokenizer) {
|
||||
obj["tokenizer"] = this.#tokenizer.name;
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
async init() {
|
||||
let transformers;
|
||||
|
||||
Reference in New Issue
Block a user