diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 71ea0f53..223f20d1 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -68,6 +68,7 @@ nav: - Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md - Javascript examples: - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md + - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md - References: - Vector Search: search.md - SQL filters: sql.md diff --git a/docs/src/examples/transformerjs_embedding_search_nodejs.md b/docs/src/examples/transformerjs_embedding_search_nodejs.md new file mode 100644 index 00000000..77413bb5 --- /dev/null +++ b/docs/src/examples/transformerjs_embedding_search_nodejs.md @@ -0,0 +1,117 @@ +# Vector embedding search using TransformersJS and NodeJS + +This example shows how to use the [transformers.js](https://github.com/xenova/transformers.js) library to perform vector embedding search using LanceDB's JavaScript API. + + +### Setting up +First, install the dependencies: +```bash +npm install vectordb +npm i @xenova/transformers +``` + +We will use the [Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) model, a version of all-MiniLM-L6-v2 that is compatible with Transformers.js. + +Within our `index.js` file we will import the necessary libraries and load our model. The snippets on this page use `await`, so in a CommonJS file they must run inside an `async` function, as the full example linked at the end of this page does: + +```javascript +const lancedb = require('vectordb') +const { pipeline } = await import('@xenova/transformers') +const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); +``` + +### Creating the embedding function + +Next, we will create a function that takes in a batch of strings and returns the vector embedding of each string. We will use the `pipe` function we defined earlier to compute each embedding. + +```javascript +// Define the function. `sourceColumn` is required for LanceDB to know +// which column to use as input. 
+const embed_fun = {} +embed_fun.sourceColumn = 'text' +embed_fun.embed = async function (batch) { + let result = [] + // Given a batch of strings, we will use the `pipe` function to get + // the vector embedding of each string. + for (let text of batch) { + // Mean pooling yields one fixed-size vector per string; normalizing + // scales each vector to unit length. + const res = await pipe(text, { pooling: 'mean', normalize: true }) + result.push(Array.from(res['data'])) + } + return (result) +} +``` + +### Creating the database + +Now, we will connect to a LanceDB database and create a table that uses the embedding function we defined earlier. + +```javascript +// Link a folder and create a table with data +const db = await lancedb.connect('data/sample-lancedb') + +// You can also import any other data, but make sure that you have a column +// for the embedding function to use. +const data = [ + { id: 1, text: 'Cherry', type: 'fruit' }, + { id: 2, text: 'Carrot', type: 'vegetable' }, + { id: 3, text: 'Potato', type: 'vegetable' }, + { id: 4, text: 'Apple', type: 'fruit' }, + { id: 5, text: 'Banana', type: 'fruit' } +] + +// Create the table with the embedding function +const table = await db.createTable('food_table', data, "create", embed_fun) +``` + +### Performing the search + +Now, we can perform the search using the `search` function. LanceDB automatically uses the embedding function we defined earlier to get the vector embedding of the query string. + +```javascript +// Query the table +const results = await table + .search("a sweet fruit to eat") + .metricType("cosine") + .limit(2) + .execute() +console.log(results.map(r => r.text)) +``` +```bash +[ 'Banana', 'Cherry' ] +``` + +Full output of `results`, including each row's embedding vector and `score`: +```bash +[ + { + vector: Float32Array(384) [ + -0.057455405592918396, + 0.03617725893855095, + -0.0367760956287384, + ... 
381 more items + ], + id: 5, + text: 'Banana', + type: 'fruit', + score: 0.4919965863227844 + }, + { + vector: Float32Array(384) [ + 0.0009714411571621895, + 0.008223623037338257, + 0.009571489877998829, + ... 381 more items + ], + id: 1, + text: 'Cherry', + type: 'fruit', + score: 0.5540297031402588 + } +] +``` + +### Wrapping it up + +In this example, we showed how to use the `transformers.js` library to perform vector embedding search using LanceDB's JavaScript API. You can find the full code for this example on [GitHub](https://github.com/lancedb/lancedb/blob/main/node/examples/js-transformers/index.js)! diff --git a/node/examples/js-transformers/index.js b/node/examples/js-transformers/index.js new file mode 100644 index 00000000..ccf21f63 --- /dev/null +++ b/node/examples/js-transformers/index.js @@ -0,0 +1,66 @@ +// Copyright 2023 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +'use strict' + +// Embeds five sample rows with the all-MiniLM-L6-v2 pipeline, stores them in the LanceDB table 'food_table' and prints the `text` of the top two results for a query string. +async function example() { + + const lancedb = require('vectordb') + + // Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2) + const { pipeline } = await import('@xenova/transformers') + const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2'); + + + // Create embedding function from pipeline which returns a list of vectors from batch + // sourceColumn is the name of the column in the data to be embedded + // + // Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector + const embed_fun = {} + embed_fun.sourceColumn = 'text' + embed_fun.embed = async function (batch) { + let result = [] + for (let text of batch) { + const res = await pipe(text, { pooling: 'mean', normalize: true }) + result.push(Array.from(res['data'])) + } + return (result) + } + + // Link a folder and create a table with data + const db = await lancedb.connect('data/sample-lancedb') + + const data = [ + { id: 1, text: 'Cherry', type: 'fruit' }, + { id: 2, text: 'Carrot', type: 'vegetable' }, + { id: 3, text: 'Potato', type: 'vegetable' }, + { id: 4, text: 'Apple', type: 'fruit' }, + { id: 5, text: 'Banana', type: 'fruit' } + ] + + const table = await db.createTable('food_table', data, "create", embed_fun) + + + // Query the table + const results = await table + .search("a sweet fruit to eat") + .metricType("cosine") + .limit(2) + .execute() + console.log(results.map(r => r.text)) + +} + +example().then(_ => { console.log("Done!") }).catch(err => { console.error(err); process.exitCode = 1 }) diff --git a/node/examples/js-transformers/package.json b/node/examples/js-transformers/package.json new file mode 100644 index 00000000..4255e27a --- /dev/null +++ b/node/examples/js-transformers/package.json @@ -0,0 +1,16 @@ +{ + "name": "vectordb-example-js-transformers", + "version": "1.0.0", + "description": "Example for using transformers.js with lancedb", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + 
}, + "author": "Lance Devs", + "license": "Apache-2.0", + "dependencies": { + "@xenova/transformers": "^2.4.1", + "vectordb": "^0.1.12" + } + +}