diff --git a/.gitignore b/.gitignore index f61eaccc..360df464 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ **/__pycache__ .DS_Store venv +.venv .vscode .zed diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bbe6aa53..e907714d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: language: system types: [text] files: "nodejs/.*" - exclude: nodejs/lancedb/native.d.ts|nodejs/dist/.* + exclude: nodejs/lancedb/native.d.ts|nodejs/dist/.*|nodejs/examples/.* diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 37a3cd33..ec4d9713 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -38,13 +38,27 @@ Lance supports `IVF_PQ` index type by default. tbl.create_index(num_partitions=256, num_sub_vectors=96) ``` -=== "Typescript" +=== "TypeScript" - ```typescript - --8<--- "docs/src/ann_indexes.ts:import" + === "@lancedb/lancedb" - --8<-- "docs/src/ann_indexes.ts:ingest" - ``` + Creating indexes is done via the [lancedb.Table.createIndex](../js/classes/Table.md/#createIndex) method. + + ```typescript + --8<--- "nodejs/examples/ann_indexes.ts:import" + + --8<-- "nodejs/examples/ann_indexes.ts:ingest" + ``` + + === "vectordb (deprecated)" + + Creating indexes is done via the [lancedb.Table.createIndex](../javascript/interfaces/Table.md/#createIndex) method. + + ```typescript + --8<--- "docs/src/ann_indexes.ts:import" + + --8<-- "docs/src/ann_indexes.ts:ingest" + ``` === "Rust" @@ -91,27 +105,27 @@ You can specify the GPU device to train IVF partitions via === "Linux" - - ``` { .python .copy } - # Create index using CUDA on Nvidia GPUs. - tbl.create_index( - num_partitions=256, - num_sub_vectors=96, - accelerator="cuda" - ) - ``` + + ``` { .python .copy } + # Create index using CUDA on Nvidia GPUs. + tbl.create_index( + num_partitions=256, + num_sub_vectors=96, + accelerator="cuda" + ) + ``` === "MacOS" - - ```python - # Create index using MPS on Apple Silicon. 
- tbl.create_index( - num_partitions=256, - num_sub_vectors=96, - accelerator="mps" - ) - ``` + + ```python + # Create index using MPS on Apple Silicon. + tbl.create_index( + num_partitions=256, + num_sub_vectors=96, + accelerator="mps" + ) + ``` Troubleshooting: @@ -150,11 +164,19 @@ There are a couple of parameters that can be used to fine-tune the search: 1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867 ``` -=== "Typescript" +=== "TypeScript" - ```typescript - --8<-- "docs/src/ann_indexes.ts:search1" - ``` + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/ann_indexes.ts:search1" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/ann_indexes.ts:search1" + ``` === "Rust" @@ -172,15 +194,23 @@ You can further filter the elements returned by a search using a where clause. === "Python" - ```python - tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas() - ``` + ```python + tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas() + ``` -=== "Typescript" +=== "TypeScript" - ```javascript - --8<-- "docs/src/ann_indexes.ts:search2" - ``` + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/ann_indexes.ts:search2" + ``` + + === "vectordb (deprecated)" + + ```javascript + --8<-- "docs/src/ann_indexes.ts:search2" + ``` ### Projections (select clause) @@ -188,23 +218,31 @@ You can select the columns returned by the query using a select clause. === "Python" - ```python - tbl.search(np.random.random((1536))).select(["vector"]).to_pandas() - ``` + ```python + tbl.search(np.random.random((1536))).select(["vector"]).to_pandas() + ``` - ```text - vector _distance - 0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092 - 1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485 - ... - ``` + ```text + vector _distance + 0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092 + 1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 
95.173485 + ... + ``` -=== "Typescript" +=== "TypeScript" - ```typescript - --8<-- "docs/src/ann_indexes.ts:search3" - ``` + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/ann_indexes.ts:search3" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/ann_indexes.ts:search3" + ``` ## FAQ diff --git a/docs/src/basic.md b/docs/src/basic.md index 6aafc507..02fd5ad4 100644 --- a/docs/src/basic.md +++ b/docs/src/basic.md @@ -16,12 +16,43 @@ pip install lancedb ``` -=== "Typescript" +=== "Typescript[^1]" + === "@lancedb/lancedb" - ```shell - npm install vectordb - ``` + ```shell + npm install @lancedb/lancedb + ``` + !!! note "Bundling `@lancedb/lancedb` apps with Webpack" + Since LanceDB contains a prebuilt Node binary, you must configure `next.config.js` to exclude it from webpack. This is required for both using Next.js and deploying a LanceDB app on Vercel. + + ```javascript + /** @type {import('next').NextConfig} */ + module.exports = ({ + webpack(config) { + config.externals.push({ '@lancedb/lancedb': '@lancedb/lancedb' }) + return config; + } + }) + ``` + === "vectordb (deprecated)" + + ```shell + npm install vectordb + ``` + !!! note "Bundling `vectordb` apps with Webpack" + + Since LanceDB contains a prebuilt Node binary, you must configure `next.config.js` to exclude it from webpack. This is required for both using Next.js and deploying a LanceDB app on Vercel. + + ```javascript + /** @type {import('next').NextConfig} */ + module.exports = ({ + webpack(config) { + config.externals.push({ vectordb: 'vectordb' }) + return config; + } + }) + ``` === "Rust" ```shell @@ -58,14 +89,21 @@ recommend switching to stable releases. 
pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ lancedb ``` -=== "Typescript" +=== "Typescript[^1]" - ```shell - npm install vectordb@preview - ``` + === "@lancedb/lancedb" + + ```shell + npm install @lancedb/lancedb@preview + ``` + === "vectordb (deprecated)" + + ```shell + npm install vectordb@preview + ``` === "Rust" - + We don't push preview releases to crates.io, but you can referent the tag in GitHub within your Cargo dependencies: @@ -93,23 +131,22 @@ recommend switching to stable releases. use the same syntax as the asynchronous API. To help with this migration we have created a [migration guide](migration.md) detailing the differences. -=== "Typescript" +=== "Typescript[^1]" - ```typescript - --8<-- "docs/src/basic_legacy.ts:import" + === "@lancedb/lancedb" - --8<-- "docs/src/basic_legacy.ts:open_db" - ``` + ```typescript + import * as lancedb from "@lancedb/lancedb"; + import * as arrow from "apache-arrow"; - !!! note "`@lancedb/lancedb` vs. `vectordb`" + --8<-- "nodejs/examples/basic.ts:connect" + ``` - The Javascript SDK was originally released as `vectordb`. In an effort to - reduce maintenance we are aligning our SDKs. The new, aligned, Javascript - API is being released as `lancedb`. If you are starting new work we encourage - you to try out `lancedb`. Once the new API is feature complete we will begin - slowly deprecating `vectordb` in favor of `lancedb`. There is a - [migration guide](migration.md) detailing the differences which will assist - you in this process. + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:open_db" + ``` === "Rust" @@ -152,15 +189,23 @@ table. --8<-- "python/python/tests/docs/test_basic.py:create_table_async_pandas" ``` -=== "Typescript" +=== "Typescript[^1]" - ```typescript - --8<-- "docs/src/basic_legacy.ts:create_table" - ``` + === "@lancedb/lancedb" - If the table already exists, LanceDB will raise an error by default. 
- If you want to overwrite the table, you can pass in `mode="overwrite"` - to the `createTable` function. + ```typescript + --8<-- "nodejs/examples/basic.ts:create_table" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:create_table" + ``` + + If the table already exists, LanceDB will raise an error by default. + If you want to overwrite the table, you can pass in `mode:"overwrite"` + to the `createTable` function. === "Rust" @@ -200,11 +245,19 @@ similar to a `CREATE TABLE` statement in SQL. !!! note "You can define schema in Pydantic" LanceDB comes with Pydantic support, which allows you to define the schema of your data using Pydantic models. This makes it easy to work with LanceDB tables and data. Learn more about all supported types in [tables guide](./guides/tables.md). -=== "Typescript" +=== "Typescript[^1]" - ```typescript - --8<-- "docs/src/basic_legacy.ts:create_empty_table" - ``` + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/basic.ts:create_empty_table" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:create_empty_table" + ``` === "Rust" @@ -223,11 +276,19 @@ Once created, you can open a table as follows: --8<-- "python/python/tests/docs/test_basic.py:open_table_async" ``` -=== "Typescript" +=== "Typescript[^1]" + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/basic.ts:open_table" + ``` + + === "vectordb (deprecated)" + + ```typescript + const tbl = await db.openTable("myTable"); + ``` - ```typescript - const tbl = await db.openTable("myTable"); - ``` === "Rust" @@ -244,11 +305,18 @@ If you forget the name of your table, you can always get a listing of all table --8<-- "python/python/tests/docs/test_basic.py:table_names_async" ``` -=== "Javascript" +=== "Typescript[^1]" + === "@lancedb/lancedb" - ```javascript - console.log(await db.tableNames()); - ``` + ```typescript + --8<-- "nodejs/examples/basic.ts:table_names" + ``` + + === 
"vectordb (deprecated)" + + ```typescript + console.log(await db.tableNames()); + ``` === "Rust" @@ -267,11 +335,18 @@ After a table has been created, you can always add more data to it as follows: --8<-- "python/python/tests/docs/test_basic.py:add_data_async" ``` -=== "Typescript" +=== "Typescript[^1]" + === "@lancedb/lancedb" - ```typescript - --8<-- "docs/src/basic_legacy.ts:add" - ``` + ```typescript + --8<-- "nodejs/examples/basic.ts:add_data" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:add" + ``` === "Rust" @@ -292,11 +367,18 @@ Once you've embedded the query, you can find its nearest neighbors as follows: This returns a pandas DataFrame with the results. -=== "Typescript" +=== "Typescript[^1]" + === "@lancedb/lancedb" - ```typescript - --8<-- "docs/src/basic_legacy.ts:search" - ``` + ```typescript + --8<-- "nodejs/examples/basic.ts:vector_search" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:search" + ``` === "Rust" @@ -325,11 +407,18 @@ LanceDB allows you to create an ANN index on a table as follows: --8<-- "python/python/tests/docs/test_basic.py:create_index_async" ``` -=== "Typescript" +=== "Typescript[^1]" + === "@lancedb/lancedb" - ```{.typescript .ignore} - --8<-- "docs/src/basic_legacy.ts:create_index" - ``` + ```typescript + --8<-- "nodejs/examples/basic.ts:create_index" + ``` + + === "vectordb (deprecated)" + + ```{.typescript .ignore} + --8<-- "docs/src/basic_legacy.ts:create_index" + ``` === "Rust" @@ -357,11 +446,19 @@ This can delete any number of rows that match the filter. 
--8<-- "python/python/tests/docs/test_basic.py:delete_rows_async" ``` -=== "Typescript" +=== "Typescript[^1]" - ```typescript - --8<-- "docs/src/basic_legacy.ts:delete" - ``` + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/basic.ts:delete_rows" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:delete" + ``` === "Rust" @@ -378,9 +475,15 @@ simple or complex as needed. To see what expressions are supported, see the Read more: [lancedb.table.Table.delete][] -=== "Javascript" +=== "Typescript[^1]" - Read more: [vectordb.Table.delete](javascript/interfaces/Table.md#delete) + === "@lancedb/lancedb" + + Read more: [lancedb.Table.delete](javascript/interfaces/Table.md#delete) + + === "vectordb (deprecated)" + + Read more: [vectordb.Table.delete](javascript/interfaces/Table.md#delete) === "Rust" @@ -392,23 +495,31 @@ Use the `drop_table()` method on the database to remove a table. === "Python" - ```python - --8<-- "python/python/tests/docs/test_basic.py:drop_table" - --8<-- "python/python/tests/docs/test_basic.py:drop_table_async" - ``` + ```python + --8<-- "python/python/tests/docs/test_basic.py:drop_table" + --8<-- "python/python/tests/docs/test_basic.py:drop_table_async" + ``` - This permanently removes the table and is not recoverable, unlike deleting rows. - By default, if the table does not exist an exception is raised. To suppress this, - you can pass in `ignore_missing=True`. + This permanently removes the table and is not recoverable, unlike deleting rows. + By default, if the table does not exist an exception is raised. To suppress this, + you can pass in `ignore_missing=True`. -=== "Typescript" +=== "Typescript[^1]" - ```typescript - --8<-- "docs/src/basic_legacy.ts:drop_table" - ``` + === "@lancedb/lancedb" - This permanently removes the table and is not recoverable, unlike deleting rows. - If the table does not exist an exception is raised. 
+ ```typescript + --8<-- "nodejs/examples/basic.ts:drop_table" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:drop_table" + ``` + + This permanently removes the table and is not recoverable, unlike deleting rows. + If the table does not exist an exception is raised. === "Rust" @@ -416,19 +527,6 @@ Use the `drop_table()` method on the database to remove a table. --8<-- "rust/lancedb/examples/simple.rs:drop_table" ``` -!!! note "Bundling `vectordb` apps with Webpack" - - If you're using the `vectordb` module in JavaScript, since LanceDB contains a prebuilt Node binary, you must configure `next.config.js` to exclude it from webpack. This is required for both using Next.js and deploying a LanceDB app on Vercel. - - ```javascript - /** @type {import('next').NextConfig} */ - module.exports = ({ - webpack(config) { - config.externals.push({ vectordb: 'vectordb' }) - return config; - } - }) - ``` ## Using the Embedding API You can use the embedding API when working with embedding models. It automatically vectorizes the data at ingestion and query time and comes with built-in integrations with popular embedding models like Openai, Hugging Face, Sentence Transformers, CLIP and more. @@ -440,6 +538,22 @@ You can use the embedding API when working with embedding models. It automatical --8<-- "python/python/tests/docs/test_embeddings_optional.py:openai_embeddings" ``` +=== "Typescript[^1]" + + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/embedding.ts:imports" + --8<-- "nodejs/examples/embedding.ts:openai_embeddings" + ``` + +=== "Rust" + + ```rust + --8<-- "rust/lancedb/examples/openai.rs:imports" + --8<-- "rust/lancedb/examples/openai.rs:openai_embeddings" + ``` + Learn about using the existing integrations and creating custom embedding functions in the [embedding API guide](./embeddings/). 
@@ -448,3 +562,5 @@ Learn about using the existing integrations and creating custom embedding functi This section covered the very basics of using LanceDB. If you're learning about vector databases for the first time, you may want to read the page on [indexing](concepts/index_ivfpq.md) to get familiar with the concepts. If you've already worked with other vector databases, you may want to read the [guides](guides/tables.md) to learn how to work with LanceDB in more detail. + +[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](migration.md) for more information. diff --git a/docs/src/basic_legacy.ts b/docs/src/basic_legacy.ts index 560f7254..646e6002 100644 --- a/docs/src/basic_legacy.ts +++ b/docs/src/basic_legacy.ts @@ -24,6 +24,7 @@ const example = async () => { ); // --8<-- [end:create_table] + // --8<-- [start:add] const newData = Array.from({ length: 500 }, (_, i) => ({ vector: [i, i + 1], diff --git a/docs/src/embeddings/embedding_functions.md b/docs/src/embeddings/embedding_functions.md index 5d9ef9f1..17ce944e 100644 --- a/docs/src/embeddings/embedding_functions.md +++ b/docs/src/embeddings/embedding_functions.md @@ -6,8 +6,8 @@ For this purpose, LanceDB introduces an **embedding functions API**, that allow LanceDB Cloud does not support embedding functions yet. You need to generate embeddings before ingesting into the table or querying. !!! warning - Using the embedding function registry means that you don't have to explicitly generate the embeddings yourself. - However, if your embedding function changes, you'll have to re-configure your table with the new embedding function + Using the embedding function registry means that you don't have to explicitly generate the embeddings yourself. 
+ However, if your embedding function changes, you'll have to re-configure your table with the new embedding function and regenerate the embeddings. In the future, we plan to support the ability to change the embedding function via table metadata and have LanceDB automatically take care of regenerating the embeddings. @@ -16,7 +16,7 @@ For this purpose, LanceDB introduces an **embedding functions API**, that allow === "Python" In the LanceDB python SDK, we define a global embedding function registry with - many different embedding models and even more coming soon. + many different embedding models and even more coming soon. Here's let's an implementation of CLIP as example. ```python @@ -26,20 +26,35 @@ For this purpose, LanceDB introduces an **embedding functions API**, that allow clip = registry.get("open-clip").create() ``` - You can also define your own embedding function by implementing the `EmbeddingFunction` + You can also define your own embedding function by implementing the `EmbeddingFunction` abstract base interface. It subclasses Pydantic Model which can be utilized to write complex schemas simply as we'll see next! -=== "JavaScript"" +=== "TypeScript" In the TypeScript SDK, the choices are more limited. For now, only the OpenAI embedding function is available. ```javascript - const lancedb = require("vectordb"); + import * as lancedb from '@lancedb/lancedb' + import { getRegistry } from '@lancedb/lancedb/embeddings' // You need to provide an OpenAI API key const apiKey = "sk-..." // The embedding function will create embeddings for the 'text' column - const embedding = new lancedb.OpenAIEmbeddingFunction('text', apiKey) + const func = getRegistry().get("openai").create({apiKey}) + ``` +=== "Rust" + In the Rust SDK, the choices are more limited. For now, only the OpenAI + embedding function is available. But unlike the Python and TypeScript SDKs, you need to manually register the OpenAI embedding function. 
+ + ```toml + # Make sure to include the `openai` feature + [dependencies] + lancedb = {version = "*", features = ["openai"]} + ``` + + ```rust + --8<-- "rust/lancedb/examples/openai.rs:imports" + --8<-- "rust/lancedb/examples/openai.rs:openai_embeddings" ``` ## 2. Define the data model or schema @@ -55,14 +70,14 @@ For this purpose, LanceDB introduces an **embedding functions API**, that allow `VectorField` tells LanceDB to use the clip embedding function to generate query embeddings for the `vector` column and `SourceField` ensures that when adding data, we automatically use the specified embedding function to encode `image_uri`. -=== "JavaScript" +=== "TypeScript" For the TypeScript SDK, a schema can be inferred from input data, or an explicit Arrow schema can be provided. ## 3. Create table and add data -Now that we have chosen/defined our embedding function and the schema, +Now that we have chosen/defined our embedding function and the schema, we can create the table and ingest data without needing to explicitly generate the embeddings at all: @@ -74,17 +89,26 @@ the embeddings at all: table.add([{"image_uri": u} for u in uris]) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const db = await lancedb.connect("data/sample-lancedb"); - const data = [ - { text: "pepperoni"}, - { text: "pineapple"} - ] + === "@lancedb/lancedb" - const table = await db.createTable("vectors", data, embedding) - ``` + ```ts + --8<-- "nodejs/examples/embedding.ts:imports" + --8<-- "nodejs/examples/embedding.ts:embedding_function" + ``` + + === "vectordb (deprecated)" + + ```ts + const db = await lancedb.connect("data/sample-lancedb"); + const data = [ + { text: "pepperoni"}, + { text: "pineapple"} + ] + + const table = await db.createTable("vectors", data, embedding) + ``` ## 4. 
Querying your table Not only can you forget about the embeddings during ingestion, you also don't @@ -97,8 +121,8 @@ need to worry about it when you query the table: ```python results = ( table.search("dog") - .limit(10) - .to_pandas() + .limit(10) + .to_pandas() ) ``` @@ -109,22 +133,32 @@ need to worry about it when you query the table: query_image = Image.open(p) results = ( table.search(query_image) - .limit(10) - .to_pandas() + .limit(10) + .to_pandas() ) ``` Both of the above snippet returns a pandas DataFrame with the 10 closest vectors to the query. -=== "JavaScript" +=== "TypeScript" + + === "@lancedb/lancedb" + + ```ts + const results = await table.search("What's the best pizza topping?") + .limit(10) + .toArray() + ``` + + === "vectordb (deprecated)" + + ```ts + const results = await table + .search("What's the best pizza topping?") + .limit(10) + .execute() + ``` - ```javascript - const results = await table - .search("What's the best pizza topping?") - .limit(10) - .execute() - ``` - The above snippet returns an array of records with the top 10 nearest neighbors to the query. --- diff --git a/docs/src/embeddings/index.md b/docs/src/embeddings/index.md index 6faee93f..d46cb85b 100644 --- a/docs/src/embeddings/index.md +++ b/docs/src/embeddings/index.md @@ -1,13 +1,13 @@ -Due to the nature of vector embeddings, they can be used to represent any kind of data, from text to images to audio. -This makes them a very powerful tool for machine learning practitioners. -However, there's no one-size-fits-all solution for generating embeddings - there are many different libraries and APIs +Due to the nature of vector embeddings, they can be used to represent any kind of data, from text to images to audio. +This makes them a very powerful tool for machine learning practitioners. 
+However, there's no one-size-fits-all solution for generating embeddings - there are many different libraries and APIs (both commercial and open source) that can be used to generate embeddings from structured/unstructured data. LanceDB supports 3 methods of working with embeddings. 1. You can manually generate embeddings for the data and queries. This is done outside of LanceDB. 2. You can use the built-in [embedding functions](./embedding_functions.md) to embed the data and queries in the background. -3. For python users, you can define your own [custom embedding function](./custom_embedding_function.md) +3. You can define your own [custom embedding function](./custom_embedding_function.md) that extends the default embedding functions. For python users, there is also a legacy [with_embeddings API](./legacy.md). @@ -18,62 +18,89 @@ It is retained for compatibility and will be removed in a future version. To get started with embeddings, you can use the built-in embedding functions. ### OpenAI Embedding function + LanceDB registers the OpenAI embeddings function in the registry as `openai`. You can pass any supported model name to the `create`. By default it uses `"text-embedding-ada-002"`. 
-```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry +=== "Python" -db = lancedb.connect("/tmp/db") -func = get_registry().get("openai").create(name="text-embedding-ada-002") + ```python + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import get_registry -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() + db = lancedb.connect("/tmp/db") + func = get_registry().get("openai").create(name="text-embedding-ada-002") -table = db.create_table("words", schema=Words, mode="overwrite") -table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) + class Words(LanceModel): + text: str = func.SourceField() + vector: Vector(func.ndims()) = func.VectorField() -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` + table = db.create_table("words", schema=Words, mode="overwrite") + table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) + + query = "greetings" + actual = table.search(query).limit(1).to_pydantic(Words)[0] + print(actual.text) + ``` + +=== "TypeScript" + + ```typescript + --8<--- "nodejs/examples/embedding.ts:imports" + --8<--- "nodejs/examples/embedding.ts:openai_embeddings" + ``` + +=== "Rust" + + ```rust + --8<--- "rust/lancedb/examples/openai.rs:imports" + --8<--- "rust/lancedb/examples/openai.rs:openai_embeddings" + ``` ### Sentence Transformers Embedding function LanceDB registers the Sentence Transformers embeddings function in the registry as `sentence-transformers`. You can pass any supported model name to the `create`. By default it uses `"sentence-transformers/paraphrase-MiniLM-L6-v2"`. 
-```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry +=== "Python" + ```python + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import get_registry -db = lancedb.connect("/tmp/db") -model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") + db = lancedb.connect("/tmp/db") + model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") -class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() + class Words(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() -table = db.create_table("words", schema=Words) -table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] -) + table = db.create_table("words", schema=Words) + table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` + query = "greetings" + actual = table.search(query).limit(1).to_pydantic(Words)[0] + print(actual.text) + ``` + +=== "TypeScript" + + Coming Soon! + +=== "Rust" + + Coming Soon! ### Jina Embeddings + LanceDB registers the JinaAI embeddings function in the registry as `jina`. You can pass any supported model name to the `create`. By default it uses `"jina-clip-v1"`. `jina-clip-v1` can handle both text and images and other models only support `text`. 
@@ -104,4 +131,4 @@ table.add( query = "greetings" actual = table.search(query).limit(1).to_pydantic(Words)[0] print(actual.text) -``` \ No newline at end of file +``` diff --git a/docs/src/guides/storage.md b/docs/src/guides/storage.md index b165a536..6cdc52bd 100644 --- a/docs/src/guides/storage.md +++ b/docs/src/guides/storage.md @@ -32,28 +32,54 @@ LanceDB OSS supports object stores such as AWS S3 (and compatible stores), Azure db = lancedb.connect("az://bucket/path") ``` -=== "JavaScript" +=== "TypeScript" - AWS S3: + === "@lancedb/lancedb" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect("s3://bucket/path"); - ``` + AWS S3: - Google Cloud Storage: + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect("s3://bucket/path"); + ``` - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect("gs://bucket/path"); - ``` + Google Cloud Storage: - Azure Blob Storage: + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect("gs://bucket/path"); + ``` - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect("az://bucket/path"); - ``` + Azure Blob Storage: + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect("az://bucket/path"); + ``` + + + === "vectordb (deprecated)" + + AWS S3: + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect("s3://bucket/path"); + ``` + + Google Cloud Storage: + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect("gs://bucket/path"); + ``` + + Azure Blob Storage: + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect("az://bucket/path"); + ``` In most cases, when running in the respective cloud and permissions are set up correctly, no additional configuration is required. 
When running outside of the respective cloud, authentication credentials must be provided. Credentials and other configuration options can be set in two ways: first, by setting environment variables. And second, by passing a `storage_options` object to the `connect` function. For example, to increase the request timeout to 60 seconds, you can set the `TIMEOUT` environment variable to `60s`: @@ -78,13 +104,26 @@ If you only want this to apply to one particular connection, you can pass the `s ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect("s3://bucket/path", - {storageOptions: {timeout: "60s"}}); - ``` + === "@lancedb/lancedb" + + ```ts + import * as lancedb from "@lancedb/lancedb"; + + const db = await lancedb.connect("s3://bucket/path", { + storageOptions: {timeout: "60s"} + }); + ``` + + === "vectordb (deprecated)" + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect("s3://bucket/path", { + storageOptions: {timeout: "60s"} + }); + ``` Getting even more specific, you can set the `timeout` for only a particular table: @@ -101,18 +140,33 @@ Getting even more specific, you can set the `timeout` for only a particular tabl ) ``` -=== "JavaScript" +=== "TypeScript" - - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect("s3://bucket/path"); - const table = db.createTable( - "table", - [{ a: 1, b: 2}], - {storageOptions: {timeout: "60s"}} - ); - ``` + === "@lancedb/lancedb" + + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect("s3://bucket/path"); + const table = db.createTable( + "table", + [{ a: 1, b: 2}], + {storageOptions: {timeout: "60s"}} + ); + ``` + + === "vectordb (deprecated)" + + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect("s3://bucket/path"); + const table = db.createTable( + "table", + [{ a: 1, b: 2}], + {storageOptions: {timeout: 
"60s"}} + ); + ``` !!! info "Storage option casing" @@ -135,7 +189,6 @@ There are several options that can be set for all object stores, mostly related | `proxy_ca_certificate` | PEM-formatted CA certificate for proxy connections. | | `proxy_excludes` | List of hosts that bypass the proxy. This is a comma-separated list of domains and IP masks. Any subdomain of the provided domain will be bypassed. For example, `example.com, 192.168.1.0/24` would bypass `https://api.example.com`, `https://www.example.com`, and any IP in the range `192.168.1.0/24`. | - ### AWS S3 To configure credentials for AWS S3, you can use the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` keys. Region can also be set, but it is not mandatory when using AWS. @@ -155,21 +208,39 @@ These can be set as environment variables or passed in the `storage_options` par ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "s3://bucket/path", - { - storageOptions: { - awsAccessKeyId: "my-access-key", - awsSecretAccessKey: "my-secret-key", - awsSessionToken: "my-session-token", + === "@lancedb/lancedb" + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect( + "s3://bucket/path", + { + storageOptions: { + awsAccessKeyId: "my-access-key", + awsSecretAccessKey: "my-secret-key", + awsSessionToken: "my-session-token", + } } - } - ); - ``` + ); + ``` + + === "vectordb (deprecated)" + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect( + "s3://bucket/path", + { + storageOptions: { + awsAccessKeyId: "my-access-key", + awsSecretAccessKey: "my-secret-key", + awsSessionToken: "my-session-token", + } + } + ); + ``` Alternatively, if you are using AWS SSO, you can use the `AWS_PROFILE` and `AWS_DEFAULT_REGION` environment variables. 
@@ -188,7 +259,6 @@ The following keys can be used as both environment variables or keys in the `sto | `aws_sse_kms_key_id` | The KMS key ID to use for server-side encryption. If set, `aws_server_side_encryption` must be `"aws:kms"` or `"aws:kms:dsse"`. | | `aws_sse_bucket_key_enabled` | Whether to use bucket keys for server-side encryption. | - !!! tip "Automatic cleanup for failed writes" LanceDB uses [multi-part uploads](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html) when writing data to S3 in order to maximize write speed. LanceDB will abort these uploads when it shuts down gracefully, such as when cancelled by keyboard interrupt. However, in the rare case that LanceDB crashes, it is possible that some data will be left lingering in your account. To cleanup this data, we recommend (as AWS themselves do) that you setup a lifecycle rule to delete in-progress uploads after 7 days. See the AWS guide: @@ -384,20 +454,37 @@ LanceDB can also connect to S3-compatible stores, such as MinIO. To do so, you m ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "s3://bucket/path", - { - storageOptions: { - region: "us-east-1", - endpoint: "http://minio:9000", + === "@lancedb/lancedb" + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect( + "s3://bucket/path", + { + storageOptions: { + region: "us-east-1", + endpoint: "http://minio:9000", + } } - } - ); - ``` + ); + ``` + + === "vectordb (deprecated)" + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect( + "s3://bucket/path", + { + storageOptions: { + region: "us-east-1", + endpoint: "http://minio:9000", + } + } + ); + ``` This can also be done with the ``AWS_ENDPOINT`` and ``AWS_DEFAULT_REGION`` environment variables. 
@@ -428,21 +515,37 @@ To configure LanceDB to use an S3 Express endpoint, you must set the storage opt ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "s3://my-bucket--use1-az4--x-s3/path", - { - storageOptions: { - region: "us-east-1", - s3Express: "true", + === "@lancedb/lancedb" + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect( + "s3://my-bucket--use1-az4--x-s3/path", + { + storageOptions: { + region: "us-east-1", + s3Express: "true", + } } - } - ); - ``` + ); + ``` + === "vectordb (deprecated)" + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect( + "s3://my-bucket--use1-az4--x-s3/path", + { + storageOptions: { + region: "us-east-1", + s3Express: "true", + } + } + ); + ``` ### Google Cloud Storage @@ -461,26 +564,40 @@ GCS credentials are configured by setting the `GOOGLE_SERVICE_ACCOUNT` environme ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "gs://my-bucket/my-database", - { - storageOptions: { - serviceAccount: "path/to/service-account.json", + === "@lancedb/lancedb" + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect( + "gs://my-bucket/my-database", + { + storageOptions: { + serviceAccount: "path/to/service-account.json", + } } - } - ); - ``` + ); + ``` + === "vectordb (deprecated)" + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect( + "gs://my-bucket/my-database", + { + storageOptions: { + serviceAccount: "path/to/service-account.json", + } + } + ); + ``` !!! info "HTTP/2 support" By default, GCS uses HTTP/1 for communication, as opposed to HTTP/2. This improves maximum throughput significantly. However, if you wish to use HTTP/2 for some reason, you can set the environment variable `HTTP1_ONLY` to `false`. 
- The following keys can be used as both environment variables or keys in the `storage_options` parameter: @@ -490,7 +607,6 @@ The following keys can be used as both environment variables or keys in the `sto | ``google_service_account_key`` | The serialized service account key. | | ``google_application_credentials`` | Path to the application credentials. | - ### Azure Blob Storage Azure Blob Storage credentials can be configured by setting the `AZURE_STORAGE_ACCOUNT_NAME`and `AZURE_STORAGE_ACCOUNT_KEY` environment variables. Alternatively, you can pass the account name and key in the `storage_options` parameter: @@ -509,20 +625,37 @@ Azure Blob Storage credentials can be configured by setting the `AZURE_STORAGE_A ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - const lancedb = require("lancedb"); - const db = await lancedb.connect( - "az://my-container/my-database", - { - storageOptions: { - accountName: "some-account", - accountKey: "some-key", + === "@lancedb/lancedb" + + ```ts + import * as lancedb from "@lancedb/lancedb"; + const db = await lancedb.connect( + "az://my-container/my-database", + { + storageOptions: { + accountName: "some-account", + accountKey: "some-key", + } } - } - ); - ``` + ); + ``` + + === "vectordb (deprecated)" + + ```ts + const lancedb = require("lancedb"); + const db = await lancedb.connect( + "az://my-container/my-database", + { + storageOptions: { + accountName: "some-account", + accountKey: "some-key", + } + } + ); + ``` These keys can be used as both environment variables or keys in the `storage_options` parameter: @@ -547,4 +680,4 @@ These keys can be used as both environment variables or keys in the `storage_opt | ``azure_use_azure_cli`` | Use azure cli for acquiring access token. | | ``azure_disable_tagging`` | Disables tagging objects. This can be desirable if not supported by the backing store. 
| - \ No newline at end of file + diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md index df53fa12..cf58079b 100644 --- a/docs/src/guides/tables.md +++ b/docs/src/guides/tables.md @@ -3,32 +3,45 @@ A Table is a collection of Records in a LanceDB Database. Tables in Lance have a schema that defines the columns and their types. These schemas can include nested columns and can evolve over time. -This guide will show how to create tables, insert data into them, and update the data. +This guide will show how to create tables, insert data into them, and update the data. ## Creating a LanceDB Table +Initialize a LanceDB connection and create a table + === "Python" - Initialize a LanceDB connection and create a table using one of the many methods listed below. ```python import lancedb db = lancedb.connect("./.lancedb") ``` -=== "Javascript" - - Initialize a VectorDB connection and create a table using one of the many methods listed below. - - ```javascript - const lancedb = require("vectordb"); - - const uri = "data/sample-lancedb"; - const db = await lancedb.connect(uri); - ``` - LanceDB allows ingesting data from various sources - `dict`, `list[dict]`, `pd.DataFrame`, `pa.Table` or a `Iterator[pa.RecordBatch]`. Let's take a look at some of the these. +=== "Typescript[^1]" + + === "@lancedb/lancedb" + + ```typescript + import * as lancedb from "@lancedb/lancedb"; + import * as arrow from "apache-arrow"; + + const uri = "data/sample-lancedb"; + const db = await lancedb.connect(uri); + ``` + + === "vectordb (deprecated)" + + ```typescript + const lancedb = require("vectordb"); + + const uri = "data/sample-lancedb"; + const db = await lancedb.connect(uri); + ``` + + + ### From list of tuples or dictionaries === "Python" @@ -45,74 +58,104 @@ This guide will show how to create tables, insert data into them, and update the db["my_table"].head() ``` + !!! info "Note" - If the table already exists, LanceDB will raise an error by default. 
+ If the table already exists, LanceDB will raise an error by default. `create_table` supports an optional `exist_ok` parameter. When set to True and the table exists, then it simply opens the existing table. The data you passed in will NOT be appended to the table in that case. - ```python - db.create_table("name", data, exist_ok=True) - ``` - - Sometimes you want to make sure that you start fresh. If you want to - overwrite the table, you can pass in mode="overwrite" to the createTable function. - - ```python - db.create_table("name", data, mode="overwrite") - ``` - -=== "Javascript" - You can create a LanceDB table in JavaScript using an array of JSON records as follows. - - ```javascript - const tb = await db.createTable("my_table", [{ - "vector": [3.1, 4.1], - "item": "foo", - "price": 10.0 - }, { - "vector": [5.9, 26.5], - "item": "bar", - "price": 20.0 - }]); - ``` - !!! info "Note" - If the table already exists, LanceDB will raise an error by default. If you want to overwrite the table, you need to specify the `WriteMode` in the createTable function. - - ```javascript - const table = await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite }) + ```python + db.create_table("name", data, exist_ok=True) ``` - ### From a Pandas DataFrame + Sometimes you want to make sure that you start fresh. If you want to + overwrite the table, you can pass in `mode="overwrite"` to the `create_table` function. ```python - import pandas as pd - - data = pd.DataFrame({ - "vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]], - "lat": [45.5, 40.1], - "long": [-122.7, -74.1] - }) - - db.create_table("my_table", data) - - db["my_table"].head() + db.create_table("name", data, mode="overwrite") ``` - !!! info "Note" + +=== "Typescript[^1]" + You can create a LanceDB table in JavaScript using an array of records as follows. + + === "@lancedb/lancedb" + + + ```ts + --8<-- "nodejs/examples/basic.ts:create_table" + ``` + + This will infer the schema from the provided data.
If you want to explicitly provide a schema, you can use `apache-arrow` to declare a schema + + ```ts + --8<-- "nodejs/examples/basic.ts:create_table_with_schema" + ``` + + !!! info "Note" + `createTable` supports an optional `existsOk` parameter. When set to true + and the table exists, then it simply opens the existing table. The data you + passed in will NOT be appended to the table in that case. + + + ```ts + --8<-- "nodejs/examples/basic.ts:create_table_exists_ok" + ``` + + Sometimes you want to make sure that you start fresh. If you want to + overwrite the table, you can pass in mode: "overwrite" to the createTable function. + + ```ts + --8<-- "nodejs/examples/basic.ts:create_table_overwrite" + ``` + + === "vectordb (deprecated)" + + ```ts + --8<-- "docs/src/basic_legacy.ts:create_table" + ``` + + !!! warning + `existsOk` option is not supported in `vectordb` + + Sometimes you want to make sure that you start fresh. If you want to + overwrite the table, you can pass in mode: "overwrite" to the createTable function. + + ```ts + const table = await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite }) + ``` + +### From a Pandas DataFrame + +```python +import pandas as pd + +data = pd.DataFrame({ + "vector": [[1.1, 1.2, 1.3, 1.4], [0.2, 1.8, 0.4, 3.6]], + "lat": [45.5, 40.1], + "long": [-122.7, -74.1] +}) + +db.create_table("my_table", data) + +db["my_table"].head() +``` + +!!! info "Note" Data is converted to Arrow before being written to disk. For maximum control over how data is saved, either provide the PyArrow schema to convert to or else provide a PyArrow Table directly. - The **`vector`** column needs to be a [Vector](../python/pydantic.md#vector-field) (defined as [pyarrow.FixedSizeList](https://arrow.apache.org/docs/python/generated/pyarrow.list_.html)) type. 
+The **`vector`** column needs to be a [Vector](../python/pydantic.md#vector-field) (defined as [pyarrow.FixedSizeList](https://arrow.apache.org/docs/python/generated/pyarrow.list_.html)) type. - ```python - custom_schema = pa.schema([ - pa.field("vector", pa.list_(pa.float32(), 4)), - pa.field("lat", pa.float32()), - pa.field("long", pa.float32()) - ]) +```python +custom_schema = pa.schema([ +pa.field("vector", pa.list_(pa.float32(), 4)), +pa.field("lat", pa.float32()), +pa.field("long", pa.float32()) +]) - table = db.create_table("my_table", data, schema=custom_schema) - ``` +table = db.create_table("my_table", data, schema=custom_schema) +``` ### From a Polars DataFrame @@ -133,14 +176,15 @@ table = db.create_table("pl_table", data=data) ``` ### From an Arrow Table +You can also create LanceDB tables directly from Arrow tables. +LanceDB supports float16 data type! + === "Python" - You can also create LanceDB tables directly from Arrow tables. - LanceDB supports float16 data type! ```python import pyarrows as pa import numpy as np - + dim = 16 total = 2 schema = pa.schema( @@ -160,13 +204,19 @@ table = db.create_table("pl_table", data=data) tbl = db.create_table("f16_tbl", data, schema=schema) ``` -=== "Javascript" - You can also create LanceDB tables directly from Arrow tables. - LanceDB supports Float16 data type! 
+=== "Typescript[^1]" - ```javascript - --8<-- "docs/src/basic_legacy.ts:create_f16_table" - ``` + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/basic.ts:create_f16_table" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:create_f16_table" + ``` ### From Pydantic Models @@ -225,7 +275,7 @@ class NestedSchema(LanceModel): tbl = db.create_table("nested_table", schema=NestedSchema, mode="overwrite") ``` -This creates a struct column called "document" that has two subfields +This creates a struct column called "document" that has two subfields called "content" and "source": ``` @@ -236,7 +286,7 @@ vector: fixed_size_list[1536] not null child 0, item: float document: struct not null child 0, content: string not null - child 1, source: string not null + child 1, source: string not null ``` #### Validators @@ -261,7 +311,7 @@ class TestModel(LanceModel): @classmethod def tz_must_match(cls, dt: datetime) -> datetime: assert dt.tzinfo == tz - return dt + return dt ok = TestModel(dt_with_tz=datetime.now(tz)) @@ -329,23 +379,24 @@ You can also use iterators of other types like Pandas DataFrame or Pylists direc tbl = db.open_table("my_table") ``` -=== "JavaScript" +=== "Typescript[^1]" + If you forget the name of your table, you can always get a listing of all table names. - ```javascript + ```typescript console.log(await db.tableNames()); ``` Then, you can open any existing tables. - ```javascript + ```typescript const tbl = await db.openTable("my_table"); ``` ## Creating empty table +You can create an empty table for scenarios where you want to add data to the table later. An example would be when you want to collect data from a stream/external file and then add it to a table in batches. === "Python" - In Python, you can create an empty table for scenarios where you want to add data to the table later. 
An example would be when you want to collect data from a stream/external file and then add it to a table in batches. ```python @@ -364,8 +415,8 @@ You can also use iterators of other types like Pandas DataFrame or Pylists direc tbl = db.create_table("empty_table_add", schema=schema) ``` - Alternatively, you can also use Pydantic to specify the schema for the empty table. Note that we do not - directly import `pydantic` but instead use `lancedb.pydantic` which is a subclass of `pydantic.BaseModel` + Alternatively, you can also use Pydantic to specify the schema for the empty table. Note that we do not + directly import `pydantic` but instead use `lancedb.pydantic` which is a subclass of `pydantic.BaseModel` that has been extended to support LanceDB specific types like `Vector`. ```python @@ -382,9 +433,23 @@ You can also use iterators of other types like Pandas DataFrame or Pylists direc Once the empty table has been created, you can add data to it via the various methods listed in the [Adding to a table](#adding-to-a-table) section. +=== "Typescript[^1]" + + === "@lancedb/lancedb" + + ```typescript + --8<-- "nodejs/examples/basic.ts:create_empty_table" + ``` + + === "vectordb (deprecated)" + + ```typescript + --8<-- "docs/src/basic_legacy.ts:create_empty_table" + ``` + ## Adding to a table -After a table has been created, you can always add more data to it using the various methods available. +After a table has been created, you can always add more data to it using the `add` method. === "Python" You can add any of the valid data structures accepted by LanceDB table, i.e, `dict`, `list[dict]`, `pd.DataFrame`, or `Iterator[pa.RecordBatch]`. Below are some examples. @@ -472,9 +537,7 @@ After a table has been created, you can always add more data to it using the var tbl.add(models) ``` - - -=== "JavaScript" +=== "Typescript[^1]" ```javascript await tbl.add( @@ -530,15 +593,15 @@ Use the `delete()` method on tables to delete rows from a table.
To choose which # 0 3 [5.0, 6.0] ``` -=== "JavaScript" +=== "Typescript[^1]" - ```javascript + ```ts await tbl.delete('item = "fizz"') ``` ### Deleting row with specific column value - ```javascript + ```ts const con = await lancedb.connect("./.lancedb") const data = [ {id: 1, vector: [1, 2]}, @@ -552,7 +615,7 @@ Use the `delete()` method on tables to delete rows from a table. To choose which ### Delete from a list of values - ```javascript + ```ts const to_remove = [1, 5]; await tbl.delete(`id IN (${to_remove.join(",")})`) await tbl.countRows() // Returns 1 @@ -609,26 +672,49 @@ This can be used to update zero to all rows depending on how many rows match the 2 2 [10.0, 10.0] ``` -=== "JavaScript/Typescript" +=== "Typescript[^1]" - API Reference: [vectordb.Table.update](../javascript/interfaces/Table.md/#update) + === "@lancedb/lancedb" - ```javascript - const lancedb = require("vectordb"); + API Reference: [lancedb.Table.update](../js/classes/Table.md/#update) - const db = await lancedb.connect("./.lancedb"); + ```ts + import * as lancedb from "@lancedb/lancedb"; - const data = [ - {x: 1, vector: [1, 2]}, - {x: 2, vector: [3, 4]}, - {x: 3, vector: [5, 6]}, - ]; - const tbl = await db.createTable("my_table", data) + const db = await lancedb.connect("./.lancedb"); - await tbl.update({ where: "x = 2", values: {vector: [10, 10]} }) - ``` + const data = [ + {x: 1, vector: [1, 2]}, + {x: 2, vector: [3, 4]}, + {x: 3, vector: [5, 6]}, + ]; + const tbl = await db.createTable("my_table", data) -The `values` parameter is used to provide the new values for the columns as literal values. You can also use the `values_sql` / `valuesSql` parameter to provide SQL expressions for the new values. For example, you can use `values_sql="x + 1"` to increment the value of the `x` column by 1. 
+ await tbl.update({vector: [10, 10]}, { where: "x = 2"}) + ``` + + === "vectordb (deprecated)" + + API Reference: [vectordb.Table.update](../javascript/interfaces/Table.md/#update) + + ```ts + const lancedb = require("vectordb"); + + const db = await lancedb.connect("./.lancedb"); + + const data = [ + {x: 1, vector: [1, 2]}, + {x: 2, vector: [3, 4]}, + {x: 3, vector: [5, 6]}, + ]; + const tbl = await db.createTable("my_table", data) + + await tbl.update({ where: "x = 2", values: {vector: [10, 10]} }) + ``` + +#### Updating using a SQL query + + The `values` parameter is used to provide the new values for the columns as literal values. You can also use the `values_sql` / `valuesSql` parameter to provide SQL expressions for the new values. For example, you can use `values_sql="x + 1"` to increment the value of the `x` column by 1. === "Python" @@ -647,11 +733,17 @@ The `values` parameter is used to provide the new values for the columns as lite 2 3 [10.0, 10.0] ``` -=== "JavaScript/Typescript" +=== "Typescript[^1]" - ```javascript - await tbl.update({ valuesSql: { x: "x + 1" } }) - ``` + === "@lancedb/lancedb" + + Coming Soon! + + === "vectordb (deprecated)" + + ```ts + await tbl.update({ valuesSql: { x: "x + 1" } }) + ``` !!! info "Note" @@ -672,7 +764,7 @@ Use the `drop_table()` method on the database to remove a table. By default, if the table does not exist an exception is raised. To suppress this, you can pass in `ignore_missing=True`. -=== "Javascript/Typescript" +=== "TypeScript" ```typescript --8<-- "docs/src/basic_legacy.ts:drop_table" @@ -697,7 +789,7 @@ There are three possible settings for `read_consistency_interval`: This is only tune-able in LanceDB OSS. In LanceDB Cloud, readers are always eventually consistent.
=== "Python" - + To set strong consistency, use `timedelta(0)`: ```python @@ -719,33 +811,35 @@ There are three possible settings for `read_consistency_interval`: ```python db = lancedb.connect("./.lancedb") table = db.open_table("my_table") - + # (Other writes happen to my_table from another process) # Check for updates table.checkout_latest() ``` -=== "JavaScript/Typescript" +=== "Typescript[^1]" To set strong consistency, use `0`: - ```javascript + ```ts const db = await lancedb.connect({ uri: "./.lancedb", readConsistencyInterval: 0 }); const table = await db.openTable("my_table"); ``` For eventual consistency, specify the update interval as seconds: - ```javascript + ```ts const db = await lancedb.connect({ uri: "./.lancedb", readConsistencyInterval: 5 }); const table = await db.openTable("my_table"); ``` - ## What's next? -Learn the best practices on creating an ANN index and getting the most out of it. \ No newline at end of file +Learn the best practices on creating an ANN index and getting the most out of it. + +[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](migration.md) for more information. 
diff --git a/docs/src/search.md b/docs/src/search.md index bcf0b27a..62f46a63 100644 --- a/docs/src/search.md +++ b/docs/src/search.md @@ -53,13 +53,24 @@ db.create_table("my_vectors", data=data) .to_list() ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - --8<-- "docs/src/search_legacy.ts:import" + === "@lancedb/lancedb" - --8<-- "docs/src/search_legacy.ts:search1" - ``` + ```ts + --8<-- "nodejs/examples/search.ts:import" + + --8<-- "nodejs/examples/search.ts:search1" + ``` + + + === "vectordb (deprecated)" + + ```ts + --8<-- "docs/src/search_legacy.ts:import" + + --8<-- "docs/src/search_legacy.ts:search1" + ``` By default, `l2` will be used as metric type. You can specify the metric type as `cosine` or `dot` if required. @@ -73,11 +84,19 @@ By default, `l2` will be used as metric type. You can specify the metric type as .to_list() ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - --8<-- "docs/src/search_legacy.ts:search2" - ``` + === "@lancedb/lancedb" + + ```ts + --8<-- "nodejs/examples/search.ts:search2" + ``` + + === "vectordb (deprecated)" + + ```javascript + --8<-- "docs/src/search_legacy.ts:search2" + ``` ## Approximate nearest neighbor (ANN) search diff --git a/docs/src/sql.md b/docs/src/sql.md index c5e2d815..b1d10d97 100644 --- a/docs/src/sql.md +++ b/docs/src/sql.md @@ -44,11 +44,19 @@ const tbl = await db.createTable('myVectors', data) ) ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - --8<-- "docs/src/sql_legacy.ts:search" - ``` + === "@lancedb/lancedb" + + ```ts + --8<-- "nodejs/examples/filtering.ts:search" + ``` + + === "vectordb (deprecated)" + + ```ts + --8<-- "docs/src/sql_legacy.ts:search" + ``` ## SQL filters @@ -78,11 +86,19 @@ For example, the following filter string is acceptable: .to_arrow() ``` -=== "Javascript" +=== "TypeScript" - ```javascript - --8<-- "docs/src/sql_legacy.ts:vec_search" - ``` + === "@lancedb/lancedb" + + ```ts + --8<-- "nodejs/examples/filtering.ts:vec_search" + ``` + + === "vectordb 
(deprecated)" + + ```ts + --8<-- "docs/src/sql_legacy.ts:vec_search" + ``` If your column name contains special characters or is a [SQL Keyword](https://docs.rs/sqlparser/latest/sqlparser/keywords/index.html), you can use backtick (`` ` ``) to escape it. For nested fields, each segment of the @@ -148,10 +164,18 @@ You can also filter your data without search. tbl.search().where("id = 10").limit(10).to_arrow() ``` -=== "JavaScript" +=== "TypeScript" - ```javascript - --8<---- "docs/src/sql_legacy.ts:sql_search" - ``` + === "@lancedb/lancedb" + + ```ts + --8<-- "nodejs/examples/filtering.ts:sql_search" + ``` + + === "vectordb (deprecated)" + + ```ts + --8<---- "docs/src/sql_legacy.ts:sql_search" + ``` !!!warning "If your table is large, this could potentially return a very large amount of data. Please be sure to use a `limit` clause unless you're sure you want to return the whole result set." diff --git a/nodejs/__test__/tsconfig.json b/nodejs/__test__/tsconfig.json index 840e7ef0..1f50bcfe 100644 --- a/nodejs/__test__/tsconfig.json +++ b/nodejs/__test__/tsconfig.json @@ -6,5 +6,5 @@ "target": "es2022", "types": ["jest", "node"] }, - "include": ["**/*"] + "include": ["**/*", "../examples/ann_indexes.ts"] } diff --git a/nodejs/biome.json b/nodejs/biome.json index 1bd8738b..584976b9 100644 --- a/nodejs/biome.json +++ b/nodejs/biome.json @@ -94,7 +94,13 @@ "useValidTypeof": "error" } }, - "ignore": ["**/dist/**/*", "**/native.js", "**/native.d.ts"] + "ignore": [ + "**/dist/**/*", + "**/native.js", + "**/native.d.ts", + "__test__/docs/**/*", + "examples/**/*" + ] }, "javascript": { "globals": [] diff --git a/nodejs/examples/.gitignore b/nodejs/examples/.gitignore new file mode 100644 index 00000000..8fce6030 --- /dev/null +++ b/nodejs/examples/.gitignore @@ -0,0 +1 @@ +data/ diff --git a/nodejs/examples/ann_indexes.ts b/nodejs/examples/ann_indexes.ts new file mode 100644 index 00000000..0c14da4a --- /dev/null +++ b/nodejs/examples/ann_indexes.ts @@ -0,0 +1,49 @@ +// 
--8<-- [start:import] +import * as lancedb from "@lancedb/lancedb"; +// --8<-- [end:import] + +// --8<-- [start:ingest] +const db = await lancedb.connect("/tmp/lancedb/"); + +const data = Array.from({ length: 10_000 }, (_, i) => ({ + vector: Array(1536).fill(i), + id: `${i}`, + content: "", + longId: `${i}`, +})); + +const table = await db.createTable("my_vectors", data, { mode: "overwrite" }); +await table.createIndex("vector", { + config: lancedb.Index.ivfPq({ + numPartitions: 16, + numSubVectors: 48, + }), +}); +// --8<-- [end:ingest] + +// --8<-- [start:search1] +const _results1 = await table + .search(Array(1536).fill(1.2)) + .limit(2) + .nprobes(20) + .refineFactor(10) + .toArray(); +// --8<-- [end:search1] + +// --8<-- [start:search2] +const _results2 = await table + .search(Array(1536).fill(1.2)) + .where("id != '1141'") + .limit(2) + .toArray(); +// --8<-- [end:search2] + +// --8<-- [start:search3] +const _results3 = await table + .search(Array(1536).fill(1.2)) + .select(["id"]) + .limit(2) + .toArray(); +// --8<-- [end:search3] + +console.log("Ann indexes: done"); diff --git a/nodejs/examples/basic.ts b/nodejs/examples/basic.ts new file mode 100644 index 00000000..f632d047 --- /dev/null +++ b/nodejs/examples/basic.ts @@ -0,0 +1,149 @@ +// --8<-- [start:imports] +import * as lancedb from "@lancedb/lancedb"; +import * as arrow from "apache-arrow"; +import { Field, FixedSizeList, Float16, Int32, Schema } from "apache-arrow"; + +// --8<-- [end:imports] + +// --8<-- [start:connect] +const uri = "/tmp/lancedb/"; +const db = await lancedb.connect(uri); +// --8<-- [end:connect] +{ + // --8<-- [start:create_table] + const data = [ + { vector: [3.1, 4.1], item: "foo", price: 10.0 }, + { vector: [5.9, 26.5], item: "bar", price: 20.0 }, + ]; + const _tbl = await db.createTable("myTable", data); + // --8<-- [end:create_table] + { + // --8<-- [start:create_table_exists_ok] + const _tbl = await db.createTable("myTable", data, { + existsOk: true, + }); + // --8<-- 
[end:create_table_exists_ok] + } + { + // --8<-- [start:create_table_overwrite] + const _tbl = await db.createTable("myTable", data, { + mode: "overwrite", + }); + // --8<-- [end:create_table_overwrite] + } +} + +{ + // --8<-- [start:create_table_with_schema] + const schema = new arrow.Schema([ + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 2, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + new arrow.Field("item", new arrow.Utf8(), true), + new arrow.Field("price", new arrow.Float32(), true), + ]); + const data = [ + { vector: [3.1, 4.1], item: "foo", price: 10.0 }, + { vector: [5.9, 26.5], item: "bar", price: 20.0 }, + ]; + const _tbl = await db.createTable("myTable", data, { + schema, + }); + // --8<-- [end:create_table_with_schema] +} + +{ + // --8<-- [start:create_empty_table] + const schema = new arrow.Schema([ + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 2, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + ]); + const _tbl = await db.createEmptyTable("empty_table", schema); + // --8<-- [end:create_empty_table] +} +{ + // --8<-- [start:open_table] + const _tbl = await db.openTable("myTable"); + // --8<-- [end:open_table] +} + +{ + // --8<-- [start:table_names] + const tableNames = await db.tableNames(); + console.log(tableNames); + // --8<-- [end:table_names] +} + +const tbl = await db.openTable("myTable"); +{ + // --8<-- [start:add_data] + const data = [ + { vector: [1.3, 1.4], item: "fizz", price: 100.0 }, + { vector: [9.5, 56.2], item: "buzz", price: 200.0 }, + ]; + await tbl.add(data); + // --8<-- [end:add_data] +} +{ + // --8<-- [start:vector_search] + const _res = tbl.search([100, 100]).limit(2).toArray(); + // --8<-- [end:vector_search] +} +{ + const data = Array.from({ length: 1000 }) + .fill(null) + .map(() => ({ + vector: [Math.random(), Math.random()], + item: "autogen", + price: Math.round(Math.random() * 100), + })); + + await tbl.add(data); +} + +// --8<-- [start:create_index] +await 
tbl.createIndex("vector"); +// --8<-- [end:create_index] + +// --8<-- [start:delete_rows] +await tbl.delete('item = "fizz"'); +// --8<-- [end:delete_rows] + +// --8<-- [start:drop_table] +await db.dropTable("myTable"); +// --8<-- [end:drop_table] +await db.dropTable("empty_table"); + +{ + // --8<-- [start:create_f16_table] + const db = await lancedb.connect("/tmp/lancedb"); + const dim = 16; + const total = 10; + const f16Schema = new Schema([ + new Field("id", new Int32()), + new Field( + "vector", + new FixedSizeList(dim, new Field("item", new Float16(), true)), + false, + ), + ]); + const data = lancedb.makeArrowTable( + Array.from(Array(total), (_, i) => ({ + id: i, + vector: Array.from(Array(dim), Math.random), + })), + { schema: f16Schema }, + ); + const _table = await db.createTable("f16_tbl", data); + // --8<-- [end:create_f16_table] + await db.dropTable("f16_tbl"); +} diff --git a/nodejs/examples/embedding.ts b/nodejs/examples/embedding.ts new file mode 100644 index 00000000..dbf7a6e6 --- /dev/null +++ b/nodejs/examples/embedding.ts @@ -0,0 +1,83 @@ +// --8<-- [start:imports] +import * as lancedb from "@lancedb/lancedb"; +import { LanceSchema, getRegistry, register } from "@lancedb/lancedb/embedding"; +import { EmbeddingFunction } from "@lancedb/lancedb/embedding"; +import { type Float, Float32, Utf8 } from "apache-arrow"; +// --8<-- [end:imports] + +{ + // --8<-- [start:openai_embeddings] + + const db = await lancedb.connect("/tmp/db"); + const func = getRegistry() + .get("openai") + ?.create({ model: "text-embedding-ada-002" }) as EmbeddingFunction; + + const wordsSchema = LanceSchema({ + text: func.sourceField(new Utf8()), + vector: func.vectorField(), + }); + const tbl = await db.createEmptyTable("words", wordsSchema, { + mode: "overwrite", + }); + await tbl.add([{ text: "hello world" }, { text: "goodbye world" }]); + + const query = "greetings"; + const actual = (await (await tbl.search(query)).limit(1).toArray())[0]; + + // --8<-- 
[end:openai_embeddings] + console.log("result = ", actual.text); +} + +{ + // --8<-- [start:embedding_function] + const db = await lancedb.connect("/tmp/db"); + + @register("my_embedding") + class MyEmbeddingFunction extends EmbeddingFunction { + toJSON(): object { + return {}; + } + ndims() { + return 3; + } + embeddingDataType(): Float { + return new Float32(); + } + async computeQueryEmbeddings(_data: string) { + // This is a placeholder for a real embedding function + return [1, 2, 3]; + } + async computeSourceEmbeddings(data: string[]) { + // This is a placeholder for a real embedding function + return Array.from({ length: data.length }).fill([1, 2, 3]) as number[][]; + } + } + + const func = new MyEmbeddingFunction(); + + const data = [{ text: "pepperoni" }, { text: "pineapple" }]; + + // Option 1: manually specify the embedding function + const table = await db.createTable("vectors", data, { + embeddingFunction: { + function: func, + sourceColumn: "text", + vectorColumn: "vector", + }, + mode: "overwrite", + }); + + // Option 2: provide the embedding function through a schema + + const schema = LanceSchema({ + text: func.sourceField(new Utf8()), + vector: func.vectorField(), + }); + + const table2 = await db.createTable("vectors2", data, { + schema, + mode: "overwrite", + }); + // --8<-- [end:embedding_function] +} diff --git a/nodejs/examples/filtering.ts b/nodejs/examples/filtering.ts new file mode 100644 index 00000000..a999581b --- /dev/null +++ b/nodejs/examples/filtering.ts @@ -0,0 +1,34 @@ +import * as lancedb from "@lancedb/lancedb"; + +const db = await lancedb.connect("data/sample-lancedb"); + +const data = Array.from({ length: 10_000 }, (_, i) => ({ + vector: Array(1536).fill(i), + id: i, + item: `item ${i}`, + strId: `${i}`, +})); + +const tbl = await db.createTable("myVectors", data, { mode: "overwrite" }); + +// --8<-- [start:search] +const _result = await tbl + .search(Array(1536).fill(0.5)) + .limit(1) + .where("id = 10") + .toArray(); +// 
--8<-- [end:search] + +// --8<-- [start:vec_search] +await tbl + .search(Array(1536).fill(0)) + .where("(item IN ('item 0', 'item 2')) AND (id > 10)") + .postfilter() + .toArray(); +// --8<-- [end:vec_search] + +// --8<-- [start:sql_search] +await tbl.query().where("id = 10").limit(10).toArray(); +// --8<-- [end:sql_search] + +console.log("SQL search: done"); diff --git a/nodejs/examples/jsconfig.json b/nodejs/examples/jsconfig.json new file mode 100644 index 00000000..238655f2 --- /dev/null +++ b/nodejs/examples/jsconfig.json @@ -0,0 +1,27 @@ +{ + "compilerOptions": { + // Enable latest features + "lib": ["ESNext", "DOM"], + "target": "ESNext", + "module": "ESNext", + "moduleDetection": "force", + "jsx": "react-jsx", + "allowJs": true, + + // Bundler mode + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "noEmit": true, + + // Best practices + "strict": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + + // Some stricter flags (disabled by default) + "noUnusedLocals": false, + "noUnusedParameters": false, + "noPropertyAccessFromIndexSignature": false + } +} diff --git a/nodejs/examples/package-lock.json b/nodejs/examples/package-lock.json new file mode 100644 index 00000000..14b2bce9 --- /dev/null +++ b/nodejs/examples/package-lock.json @@ -0,0 +1,79 @@ +{ + "name": "examples", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "examples", + "version": "1.0.0", + "license": "Apache-2.0", + "dependencies": { + "@lancedb/lancedb": "file:../" + }, + "peerDependencies": { + "typescript": "^5.0.0" + } + }, + "..": { + "name": "@lancedb/lancedb", + "version": "0.6.0", + "cpu": [ + "x64", + "arm64" + ], + "license": "Apache 2.0", + "os": [ + "darwin", + "linux", + "win32" + ], + "dependencies": { + "apache-arrow": "^15.0.0", + "axios": "^1.7.2", + "openai": "^4.29.2", + "reflect-metadata": "^0.2.2" + }, + "devDependencies": { + 
"@aws-sdk/client-kms": "^3.33.0", + "@aws-sdk/client-s3": "^3.33.0", + "@biomejs/biome": "^1.7.3", + "@jest/globals": "^29.7.0", + "@napi-rs/cli": "^2.18.0", + "@types/axios": "^0.14.0", + "@types/jest": "^29.1.2", + "@types/tmp": "^0.2.6", + "apache-arrow-old": "npm:apache-arrow@13.0.0", + "eslint": "^8.57.0", + "jest": "^29.7.0", + "shx": "^0.3.4", + "tmp": "^0.2.3", + "ts-jest": "^29.1.2", + "typedoc": "^0.25.7", + "typedoc-plugin-markdown": "^3.17.1", + "typescript": "^5.3.3", + "typescript-eslint": "^7.1.0" + }, + "engines": { + "node": ">= 18" + } + }, + "node_modules/@lancedb/lancedb": { + "resolved": "..", + "link": true + }, + "node_modules/typescript": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz", + "integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==", + "peer": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + } + } +} diff --git a/nodejs/examples/package.json b/nodejs/examples/package.json new file mode 100644 index 00000000..8333e4af --- /dev/null +++ b/nodejs/examples/package.json @@ -0,0 +1,18 @@ +{ + "name": "examples", + "version": "1.0.0", + "description": "Examples for LanceDB", + "main": "index.js", + "type": "module", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "Lance Devs", + "license": "Apache-2.0", + "dependencies": { + "@lancedb/lancedb": "file:../" + }, + "peerDependencies": { + "typescript": "^5.0.0" + } +} diff --git a/nodejs/examples/search.ts b/nodejs/examples/search.ts new file mode 100644 index 00000000..07f4323a --- /dev/null +++ b/nodejs/examples/search.ts @@ -0,0 +1,37 @@ +// --8<-- [end:import] +import * as fs from "node:fs"; +// --8<-- [start:import] +import * as lancedb from "@lancedb/lancedb"; + +async function setup() { + fs.rmSync("data/sample-lancedb", { recursive: true, force: true }); + const db = await 
lancedb.connect("data/sample-lancedb"); + + const data = Array.from({ length: 10_000 }, (_, i) => ({ + vector: Array(1536).fill(i), + id: `${i}`, + content: "", + longId: `${i}`, + })); + + await db.createTable("my_vectors", data); +} + +await setup(); + +// --8<-- [start:search1] +const db = await lancedb.connect("data/sample-lancedb"); +const tbl = await db.openTable("my_vectors"); + +const _results1 = await tbl.search(Array(1536).fill(1.2)).limit(10).toArray(); +// --8<-- [end:search1] + +// --8<-- [start:search2] +const _results2 = await tbl + .search(Array(1536).fill(1.2)) + .distanceType("cosine") + .limit(10) + .toArray(); +// --8<-- [end:search2] + +console.log("search: done"); diff --git a/nodejs/native.d.ts b/nodejs/native.d.ts new file mode 100644 index 00000000..057797ae --- /dev/null +++ b/nodejs/native.d.ts @@ -0,0 +1,208 @@ +/* tslint:disable */ +/* eslint-disable */ + +/* auto-generated by NAPI-RS */ + +/** A description of an index currently configured on a column */ +export interface IndexConfig { + /** The name of the index */ + name: string + /** The type of the index */ + indexType: string + /** + * The columns in the index + * + * Currently this is always an array of size 1. In the future there may + * be more columns to represent composite indices. + */ + columns: Array +} +/** Statistics about a compaction operation. 
*/ +export interface CompactionStats { + /** The number of fragments removed */ + fragmentsRemoved: number + /** The number of new, compacted fragments added */ + fragmentsAdded: number + /** The number of data files removed */ + filesRemoved: number + /** The number of new, compacted data files added */ + filesAdded: number +} +/** Statistics about a cleanup operation */ +export interface RemovalStats { + /** The number of bytes removed */ + bytesRemoved: number + /** The number of old versions removed */ + oldVersionsRemoved: number +} +/** Statistics about an optimize operation */ +export interface OptimizeStats { + /** Statistics about the compaction operation */ + compaction: CompactionStats + /** Statistics about the removal operation */ + prune: RemovalStats +} +/** + * A definition of a column alteration. The alteration changes the column at + * `path` to have the new name `rename`, if given, and to be nullable if + * `nullable` is true; no data-type change is exposed here. At least one of + * `rename` or `nullable` must be provided. + */ +export interface ColumnAlteration { + /** + * The path to the column to alter. This is a dot-separated path to the column. + * If it is a top-level column then it is just the name of the column. If it is + * a nested column then it is the path to the column, e.g. "a.b.c" for a column + * `c` nested inside a column `b` nested inside a column `a`. + */ + path: string + /** + * The new name of the column. If not provided then the name will not be changed. + * This must be distinct from the names of all other columns in the table. + */ + rename?: string + /** Set the new nullability. Note that a nullable column cannot be made non-nullable. */ + nullable?: boolean +} +/** A definition of a new column to add to a table. */ +export interface AddColumnsSql { + /** The name of the new column. */ + name: string + /** + * The values to populate the new column with, as a SQL expression. + * The expression can reference other columns in the table.
+ */ + valueSql: string +} +export interface IndexStatistics { + /** The number of rows indexed by the index */ + numIndexedRows: number + /** The number of rows not indexed */ + numUnindexedRows: number + /** The type of the index */ + indexType?: string + /** The metadata for each index */ + indices: Array +} +export interface IndexMetadata { + metricType?: string + indexType?: string +} +export interface ConnectionOptions { + /** + * (For LanceDB OSS only): The interval, in seconds, at which to check for + * updates to the table from other processes. If None, then consistency is not + * checked. For performance reasons, this is the default. For strong + * consistency, set this to zero seconds. Then every read will check for + * updates from other processes. As a compromise, you can set this to a + * non-zero value for eventual consistency. If more than that interval + * has passed since the last check, then the table will be checked for updates. + * Note: this consistency only applies to read operations. Write operations are + * always consistent. + */ + readConsistencyInterval?: number + /** + * (For LanceDB OSS only): configuration for object storage. + * + * The available options are described at https://lancedb.github.io/lancedb/guides/storage/ + */ + storageOptions?: Record +} +/** Write mode for writing a table. */ +export const enum WriteMode { + Create = 'Create', + Append = 'Append', + Overwrite = 'Overwrite' +} +/** Write options when creating a Table. */ +export interface WriteOptions { + /** Write mode for writing to a table. */ + mode?: WriteMode +} +export interface OpenTableOptions { + storageOptions?: Record +} +export class Connection { + /** Create a new Connection instance from the given URI. */ + static new(uri: string, options: ConnectionOptions): Promise + display(): string + isOpen(): boolean + close(): void + /** List all tables in the dataset. 
*/ + tableNames(startAfter?: string | undefined | null, limit?: number | undefined | null): Promise> + /** + * Create table from an Apache Arrow IPC (file) buffer. + * + * Parameters: + * - name: The name of the table. + * - buf: The buffer containing the IPC file. + * + */ + createTable(name: string, buf: Buffer, mode: string, storageOptions?: Record | undefined | null, useLegacyFormat?: boolean | undefined | null): Promise + createEmptyTable(name: string, schemaBuf: Buffer, mode: string, storageOptions?: Record | undefined | null, useLegacyFormat?: boolean | undefined | null): Promise
+ openTable(name: string, storageOptions?: Record | undefined | null, indexCacheSize?: number | undefined | null): Promise
+ /** Drop table with the name. Or raise an error if the table does not exist. */ + dropTable(name: string): Promise +} +export class Index { + static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index + static btree(): Index +} +/** Typescript-style Async Iterator over RecordBatches */ +export class RecordBatchIterator { + next(): Promise +} +/** A builder used to create and run a merge insert operation */ +export class NativeMergeInsertBuilder { + whenMatchedUpdateAll(condition?: string | undefined | null): NativeMergeInsertBuilder + whenNotMatchedInsertAll(): NativeMergeInsertBuilder + whenNotMatchedBySourceDelete(filter?: string | undefined | null): NativeMergeInsertBuilder + execute(buf: Buffer): Promise +} +export class Query { + onlyIf(predicate: string): void + select(columns: Array<[string, string]>): void + limit(limit: number): void + nearestTo(vector: Float32Array): VectorQuery + execute(maxBatchLength?: number | undefined | null): Promise + explainPlan(verbose: boolean): Promise +} +export class VectorQuery { + column(column: string): void + distanceType(distanceType: string): void + postfilter(): void + refineFactor(refineFactor: number): void + nprobes(nprobe: number): void + bypassVectorIndex(): void + onlyIf(predicate: string): void + select(columns: Array<[string, string]>): void + limit(limit: number): void + execute(maxBatchLength?: number | undefined | null): Promise + explainPlan(verbose: boolean): Promise +} +export class Table { + name: string + display(): string + isOpen(): boolean + close(): void + /** Return Schema as empty Arrow IPC file. 
*/ + schema(): Promise + add(buf: Buffer, mode: string): Promise + countRows(filter?: string | undefined | null): Promise + delete(predicate: string): Promise + createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise + update(onlyIf: string | undefined | null, columns: Array<[string, string]>): Promise + query(): Query + vectorSearch(vector: Float32Array): VectorQuery + addColumns(transforms: Array): Promise + alterColumns(alterations: Array): Promise + dropColumns(columns: Array): Promise + version(): Promise + checkout(version: number): Promise + checkoutLatest(): Promise + restore(): Promise + optimize(olderThanMs?: number | undefined | null): Promise + listIndices(): Promise> + indexStats(indexName: string): Promise + mergeInsert(on: Array): NativeMergeInsertBuilder +} diff --git a/rust/lancedb/examples/openai.rs b/rust/lancedb/examples/openai.rs index ce5f811f..3c1244f3 100644 --- a/rust/lancedb/examples/openai.rs +++ b/rust/lancedb/examples/openai.rs @@ -1,3 +1,5 @@ +// --8<-- [start:imports] + use std::{iter::once, sync::Arc}; use arrow_array::{Float64Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray}; @@ -11,6 +13,9 @@ use lancedb::{ Result, }; +// --8<-- [end:imports] + +// --8<-- [start:openai_embeddings] #[tokio::main] async fn main() -> Result<()> { let tempdir = tempfile::tempdir().unwrap(); @@ -35,7 +40,6 @@ async fn main() -> Result<()> { .execute() .await?; - // there is no equivalent to '.search()' yet let query = Arc::new(StringArray::from_iter_values(once("something warm"))); let query_vector = embedding.compute_query_embeddings(query)?; let mut results = table @@ -53,9 +57,9 @@ async fn main() -> Result<()> { .unwrap(); let text = out.iter().next().unwrap().unwrap(); println!("Closest match: {}", text); - Ok(()) } +// --8<-- [end:openai_embeddings] fn make_data() -> impl IntoArrow { let schema = Schema::new(vec![