Compare commits

...

18 Commits

Author SHA1 Message Date
albertlockett
b7fed59278 linter and clippy 2024-11-21 07:11:35 -05:00
albertlockett
60ad82b6ad add tests for rust 2024-11-21 06:58:51 -05:00
albertlockett
134258308c it passes version for all read calls 2024-11-20 11:46:04 -05:00
albertlockett
d36334d565 fixed for describe 2024-11-20 10:14:39 -05:00
albertlockett
131c01d702 feat: support for checkout and checkout_latest in remote rust and python sdks 2024-11-19 17:24:28 -05:00
BubbleCal
b2f88f0b29 feat: support specifying the ef search param (#1844)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2024-11-19 23:12:25 +08:00
fzowl
f2e3989831 docs: voyageai embedding in the index (#1813)
The code to support VoyageAI embedding and rerank models was added in
https://github.com/lancedb/lancedb/pull/1799.
Some documentation changes were also made there; this PR adds the
VoyageAI embedding doc link to the index page.

These are my first PRs in lancedb, and while I checked the
documentation/code structure, I might have missed something important. Please
let me know if any changes are required!
2024-11-18 14:34:16 -08:00
Emmanuel Ferdman
83ae52938a docs: update migration reference (#1837)
# PR Summary
PR fixes the `migration.md` reference in `docs/src/guides/tables.md`. On
the way, it also fixes some typos found in that document.

Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
2024-11-18 14:33:32 -08:00
Lei Xu
267aa83bf8 feat(python): check vector query is not None (#1847)
Fix the type hints of `nearest_to` method, and raise `ValueError` when
the input is None
2024-11-18 14:15:22 -08:00
Will Jones
cc72050206 chore: update package locks (#1845)
Also ran `npm audit`.
2024-11-18 13:44:06 -08:00
Will Jones
72543c8b9d test(python): test with_row_id in sync query (#1835)
Also remove weird `MockTable` fixture.
2024-11-18 11:32:52 -08:00
Will Jones
97d6210c33 ci: remove invalid references (#1834)
Fix release job
2024-11-18 11:32:44 -08:00
Ho Kim
a3d0c27b0a feat: add support for rustls (#1842)
Hello, this is a simple PR that supports `rustls-tls` feature.

`reqwest`'s default TLS backend (`default-tls`) stays enabled by default, to
avoid side effects.

The user can use `rustls-tls` like this:

```toml
lancedb = { version = "*", default-features = false, features = ["rustls-tls"] }
```
2024-11-18 10:36:20 -08:00
BubbleCal
b23d8abcdd docs: introduce incremental indexing for FTS (#1789)
Don't merge this before https://github.com/lancedb/lancedb/pull/1769 is
merged.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2024-11-18 20:21:28 +08:00
Rob Meng
e3ea5cf9b9 chore: bump lance to 0.19.3 (#1839) 2024-11-16 14:57:52 -05:00
Lance Release
4f8b086175 Updating package-lock.json 2024-11-15 20:18:16 +00:00
Lance Release
72330fb759 Bump version: 0.13.0-beta.3 → 0.13.0 2024-11-15 20:17:59 +00:00
Lance Release
e3b2c5f438 Bump version: 0.13.0-beta.2 → 0.13.0-beta.3 2024-11-15 20:17:55 +00:00
38 changed files with 678 additions and 135 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.13.0-beta.2"
current_version = "0.13.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -460,7 +460,7 @@ jobs:
release:
name: vectordb NPM Publish
needs: [node, node-macos, node-linux, node-windows, node-windows-arm64]
needs: [node, node-macos, node-linux, node-windows]
runs-on: ubuntu-latest
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
@@ -500,7 +500,7 @@ jobs:
release-nodejs:
name: lancedb NPM Publish
needs: [nodejs-macos, nodejs-linux, nodejs-windows, nodejs-windows-arm64]
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
runs-on: ubuntu-latest
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')

View File

@@ -18,18 +18,18 @@ repository = "https://github.com/lancedb/lancedb"
description = "Serverless, low-latency vector database for AI applications"
keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
[workspace.dependencies]
lance = { "version" = "=0.19.2", "features" = [
lance = { "version" = "=0.19.3", "features" = [
"dynamodb",
]}
lance-index = "=0.19.2"
lance-linalg = "=0.19.2"
lance-table = "=0.19.2"
lance-testing = "=0.19.2"
lance-datafusion = "=0.19.2"
lance-encoding = "=0.19.2"
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-index = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-linalg = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-table = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-testing = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-datafusion = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-encoding = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"

21
docs/package-lock.json generated
View File

@@ -19,7 +19,7 @@
},
"../node": {
"name": "vectordb",
"version": "0.4.6",
"version": "0.12.0",
"cpu": [
"x64",
"arm64"
@@ -31,9 +31,7 @@
"win32"
],
"dependencies": {
"@apache-arrow/ts": "^14.0.2",
"@neon-rs/load": "^0.0.74",
"apache-arrow": "^14.0.2",
"axios": "^1.4.0"
},
"devDependencies": {
@@ -46,6 +44,7 @@
"@types/temp": "^0.9.1",
"@types/uuid": "^9.0.3",
"@typescript-eslint/eslint-plugin": "^5.59.1",
"apache-arrow-old": "npm:apache-arrow@13.0.0",
"cargo-cp-artifact": "^0.1",
"chai": "^4.3.7",
"chai-as-promised": "^7.1.1",
@@ -62,15 +61,19 @@
"ts-node-dev": "^2.0.0",
"typedoc": "^0.24.7",
"typedoc-plugin-markdown": "^3.15.3",
"typescript": "*",
"typescript": "^5.1.0",
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.6",
"@lancedb/vectordb-darwin-x64": "0.4.6",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.6",
"@lancedb/vectordb-linux-x64-gnu": "0.4.6",
"@lancedb/vectordb-win32-x64-msvc": "0.4.6"
"@lancedb/vectordb-darwin-arm64": "0.12.0",
"@lancedb/vectordb-darwin-x64": "0.12.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
"@lancedb/vectordb-linux-x64-gnu": "0.12.0",
"@lancedb/vectordb-win32-x64-msvc": "0.12.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
"apache-arrow": "^14.0.2"
}
},
"../node/node_modules/apache-arrow": {

View File

@@ -20,7 +20,7 @@ Supported parameters (to be passed in `create` method) are:
| Parameter | Type | Default Value | Description |
|---|---|--------|---------|
| `name` | `str` | `"voyage-3"` | The model ID of the model to use. Supported base models for Text Embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 |
| `name` | `str` | `None` | The model ID of the model to use. Supported base models for Text Embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 |
| `input_type` | `str` | `None` | Type of the input text. Default to None. Other options: query, document. |
| `truncation` | `bool` | `True` | Whether to truncate the input texts to fit within the context length. |

View File

@@ -53,6 +53,7 @@ These functions are registered by default to handle text embeddings.
| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |
| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/aws_bedrock.png" alt="AWS Bedrock Icon" width="120" height="35">](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |
| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/watsonx.png" alt="Watsonx Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
| [**VoyageAI Embeddings**](available_embedding_models/text_embedding_functions/voyageai_embedding.md "voyageai") | 🌕 Voyage AI provides cutting-edge embedding and rerankers. This will help you get started with **VoyageAI** embedding models using LanceDB. Using voyageai API requires voyageai package. Install it via `pip`. | [<img src="https://www.voyageai.com/logo.svg" alt="VoyageAI Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/voyageai_embedding.md) |
@@ -66,6 +67,7 @@ These functions are registered by default to handle text embeddings.
[jina-key]: "jina"
[aws-key]: "bedrock-text"
[watsonx-key]: "watsonx"
[voyageai-key]: "voyageai"
## Multi-modal Embedding Functions🖼

View File

@@ -160,3 +160,32 @@ To search for a phrase, the index must be created with `with_position=True`:
table.create_fts_index("text", use_tantivy=False, with_position=True)
```
This will allow you to search for phrases, but it will also significantly increase the index size and indexing time.
## Incremental indexing
LanceDB supports incremental indexing, which means you can add new records to the table without reindexing the entire table.
This can make queries more efficient, especially when the table is large and the number of new records is relatively small.
=== "Python"
```python
table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
table.optimize()
```
=== "TypeScript"
```typescript
await tbl.add([{ vector: [3.1, 4.1], text: "Frodo was a happy puppy" }]);
await tbl.optimize();
```
=== "Rust"
```rust
let more_data: Box<dyn RecordBatchReader + Send> = create_some_records()?;
tbl.add(more_data).execute().await?;
tbl.optimize(OptimizeAction::All).execute().await?;
```

View File

@@ -274,7 +274,7 @@ table = db.create_table(table_name, schema=Content)
Sometimes your data model may contain nested objects.
For example, you may want to store the document string
and the document soure name as a nested Document object:
and the document source name as a nested Document object:
```python
class Document(BaseModel):
@@ -466,7 +466,7 @@ You can create an empty table for scenarios where you want to add data to the ta
## Adding to a table
After a table has been created, you can always add more data to it usind the `add` method
After a table has been created, you can always add more data to it using the `add` method
=== "Python"
You can add any of the valid data structures accepted by LanceDB table, i.e, `dict`, `list[dict]`, `pd.DataFrame`, or `Iterator[pa.RecordBatch]`. Below are some examples.
@@ -535,7 +535,7 @@ After a table has been created, you can always add more data to it usind the `ad
```
??? "Ingesting Pydantic models with LanceDB embedding API"
When using LanceDB's embedding API, you can add Pydantic models directly to the table. LanceDB will automatically convert the `vector` field to a vector before adding it to the table. You need to specify the default value of `vector` feild as None to allow LanceDB to automatically vectorize the data.
When using LanceDB's embedding API, you can add Pydantic models directly to the table. LanceDB will automatically convert the `vector` field to a vector before adding it to the table. You need to specify the default value of `vector` field as None to allow LanceDB to automatically vectorize the data.
```python
import lancedb
@@ -880,4 +880,4 @@ There are three possible settings for `read_consistency_interval`:
Learn the best practices on creating an ANN index and getting the most out of it.
[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](migration.md) for more information.
[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](../migration.md) for more information.

View File

@@ -9,6 +9,7 @@ LanceDB comes with some built-in rerankers. Some of the rerankers that are avail
| `CrossEncoderReranker` | Uses a cross-encoder model to rerank search results | Vector, FTS, Hybrid |
| `ColbertReranker` | Uses a colbert model to rerank search results | Vector, FTS, Hybrid |
| `OpenaiReranker`(Experimental) | Uses OpenAI's chat model to rerank search results | Vector, FTS, Hybrid |
| `VoyageAIReranker` | Uses voyageai Reranker API to rerank results | Vector, FTS, Hybrid |
## Using a Reranker
@@ -73,6 +74,7 @@ LanceDB comes with some built-in rerankers. Here are some of the rerankers that
- [Jina Reranker](./jina.md)
- [AnswerDotAI Rerankers](./answerdotai.md)
- [Reciprocal Rank Fusion Reranker](./rrf.md)
- [VoyageAI Reranker](./voyageai.md)
## Creating Custom Rerankers

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.0-beta.2</version>
<version>0.13.0-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.0-beta.2</version>
<version>0.13.0-final.0</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

82
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"cpu": [
"x64",
"arm64"
@@ -52,12 +52,12 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2"
"@lancedb/vectordb-darwin-arm64": "0.13.0",
"@lancedb/vectordb-darwin-x64": "0.13.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -327,6 +327,66 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0.tgz",
"integrity": "sha512-8hdcjkRmgrdQYf1jN+DyZae40LIv8UUfnWy70Uid5qy63sSvRW/+MvIdqIPFr9QlLUXmpyyQuX0y3bZhUR99cQ==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0.tgz",
"integrity": "sha512-fWzAY4l5SQtNfMYh80v+M66ugZHhdxbkpk5mNEv6Zsug3DL6kRj3Uv31/i0wgzY6F5G3LUlbjZerN+eTnDLwOw==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0.tgz",
"integrity": "sha512-ltwAT9baOSuR5YiGykQXPC8/HGYF13vpI47qxhP9yfgiz9pA8EUn8p8YrBRzq7J4DIZ4b8JSVDXQnMIqEtB4Kg==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0.tgz",
"integrity": "sha512-MiT/RBlMPGGRh7BX+MXwRuNiiUnKmuDcHH8nm88IH28T7TQxXIbA9w6UpSg5m9f3DgKQI2K8oLi29oKIB8ZwDQ==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0.tgz",
"integrity": "sha512-SovP/hwWYLJIy65DKbVuXlBPTb/nwvVpTO6dh9zRch+L5ek6JmVAkwsfeTS2p5bMa8VPujsCXYUAVuCDEJU8wg==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
@@ -1441,9 +1501,9 @@
"dev": true
},
"node_modules/cross-spawn": {
"version": "7.0.3",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
"integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
"integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"dev": true,
"dependencies": {
"path-key": "^3.1.0",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -89,11 +89,11 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
"@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2"
"@lancedb/vectordb-darwin-arm64": "0.13.0",
"@lancedb/vectordb-darwin-x64": "0.13.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.13.0-beta.2"
version = "0.13.0"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -477,6 +477,54 @@ describe("When creating an index", () => {
expect(rst.numRows).toBe(1);
});
it("should create and search IVF_HNSW indices", async () => {
await tbl.createIndex("vec", {
config: Index.hnswSq(),
});
// check index directory
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
expect(fs.readdirSync(indexDir)).toHaveLength(1);
const indices = await tbl.listIndices();
expect(indices.length).toBe(1);
expect(indices[0]).toEqual({
name: "vec_idx",
indexType: "IvfHnswSq",
columns: ["vec"],
});
// Search without specifying the column
let rst = await tbl
.query()
.limit(2)
.nearestTo(queryVec)
.distanceType("dot")
.toArrow();
expect(rst.numRows).toBe(2);
// Search using `vectorSearch`
rst = await tbl.vectorSearch(queryVec).limit(2).toArrow();
expect(rst.numRows).toBe(2);
// Search with specifying the column
const rst2 = await tbl
.query()
.limit(2)
.nearestTo(queryVec)
.column("vec")
.toArrow();
expect(rst2.numRows).toBe(2);
expect(rst.toString()).toEqual(rst2.toString());
// test offset
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
expect(rst.numRows).toBe(1);
// test ef
rst = await tbl.query().limit(2).nearestTo(queryVec).ef(100).toArrow();
expect(rst.numRows).toBe(2);
});
it("should be able to query unindexed data", async () => {
await tbl.createIndex("vec");
await tbl.add([

View File

@@ -385,6 +385,20 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
return this;
}
/**
* Set the number of candidates (`ef`) to consider during the search.
*
* This argument is only used when the vector column has an HNSW index.
* If there is no index then this value is ignored.
*
* Increasing this value will increase the recall of your query but will
* also increase the latency of your query. The default value is 1.5*limit.
*
* @param ef - The number of candidates to consider; it is forwarded
* unchanged to the inner query's `ef` method.
* @returns This `VectorQuery`, so that further calls can be chained.
*/
ef(ef: number): VectorQuery {
super.doCall((inner) => inner.ef(ef));
return this;
}
/**
* Set the vector column to query
*

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.13.0-beta.2",
"version": "0.13.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.13.0-beta.1",
"version": "0.13.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.13.0-beta.1",
"version": "0.13.0",
"cpu": [
"x64",
"arm64"
@@ -6052,9 +6052,9 @@
}
},
"node_modules/cross-spawn": {
"version": "7.0.3",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
"integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
"integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
"devOptional": true,
"dependencies": {
"path-key": "^3.1.0",

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.13.0-beta.2",
"version": "0.13.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -167,6 +167,11 @@ impl VectorQuery {
self.inner = self.inner.clone().nprobes(nprobe as usize);
}
/// Sets the `ef` search parameter on the wrapped query.
///
/// The `u32` argument is cast to `usize` and passed to the inner query's
/// `ef` method; the value it returns replaces `self.inner`.
#[napi]
pub fn ef(&mut self, ef: u32) {
// NOTE(review): the clone suggests the inner `ef` takes the query by value — confirm.
self.inner = self.inner.clone().ef(ef as usize);
}
#[napi]
pub fn bypass_vector_index(&mut self) {
self.inner = self.inner.clone().bypass_vector_index()

View File

@@ -15,7 +15,7 @@ crate-type = ["cdylib"]
[dependencies]
arrow = { version = "52.1", features = ["pyarrow"] }
lancedb = { path = "../rust/lancedb" }
lancedb = { path = "../rust/lancedb", default-features = false }
env_logger.workspace = true
pyo3 = { version = "0.21", features = ["extension-module", "abi3-py38", "gil-refs"] }
# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
@@ -33,6 +33,11 @@ pyo3-build-config = { version = "0.20.3", features = [
] }
[features]
default = ["remote"]
default = ["default-tls", "remote"]
fp16kernels = ["lancedb/fp16kernels"]
remote = ["lancedb/remote"]
# TLS
default-tls = ["lancedb/default-tls"]
native-tls = ["lancedb/native-tls"]
rustls-tls = ["lancedb/rustls-tls"]

View File

@@ -4,7 +4,7 @@ name = "lancedb"
dependencies = [
"deprecation",
"nest-asyncio~=1.0",
"pylance==0.19.2",
"pylance==0.19.3b1",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",

View File

@@ -131,6 +131,8 @@ class Query(pydantic.BaseModel):
fast_search: bool = False
ef: Optional[int] = None
class LanceQueryBuilder(ABC):
"""An abstract query builder. Subclasses are defined for vector search,
@@ -257,6 +259,7 @@ class LanceQueryBuilder(ABC):
self._with_row_id = False
self._vector = None
self._text = None
self._ef = None
@deprecation.deprecated(
deprecated_in="0.3.1",
@@ -638,6 +641,28 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._nprobes = nprobes
return self
def ef(self, ef: int) -> LanceVectorQueryBuilder:
"""Set the number of candidates to consider during search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
This only applies to the HNSW-related index.
The default value is 1.5 * limit.
Parameters
----------
ef: int
The number of candidates to consider during search.
Returns
-------
LanceVectorQueryBuilder
The LanceQueryBuilder object.
"""
self._ef = ef
return self
def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
"""Set the refine factor to use, increasing the number of vectors sampled.
@@ -700,6 +725,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
with_row_id=self._with_row_id,
offset=self._offset,
fast_search=self._fast_search,
ef=self._ef,
)
result_set = self._table._execute_query(query, batch_size)
if self._reranker is not None:
@@ -1071,6 +1097,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._vector_query.nprobes(self._nprobes)
if self._refine_factor:
self._vector_query.refine_factor(self._refine_factor)
if self._ef:
self._vector_query.ef(self._ef)
with ThreadPoolExecutor() as executor:
fts_future = executor.submit(self._fts_query.with_row_id(True).to_arrow)
@@ -1197,6 +1225,29 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._nprobes = nprobes
return self
def ef(self, ef: int) -> LanceHybridQueryBuilder:
"""
Set the number of candidates to consider during search.
Higher values will yield better recall (more likely to find vectors if
they exist) at the expense of latency.
This only applies to the HNSW-related index.
The default value is 1.5 * limit.
Parameters
----------
ef: int
The number of candidates to consider during search.
Returns
-------
LanceHybridQueryBuilder
The LanceHybridQueryBuilder object.
"""
self._ef = ef
return self
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
"""Set the distance metric to use.
@@ -1495,7 +1546,8 @@ class AsyncQuery(AsyncQueryBase):
return pa.array(vec)
def nearest_to(
self, query_vector: Optional[Union[VEC, Tuple, List[VEC]]] = None
self,
query_vector: Union[VEC, Tuple, List[VEC]],
) -> AsyncVectorQuery:
"""
Find the nearest vectors to the given query vector.
@@ -1542,6 +1594,9 @@ class AsyncQuery(AsyncQueryBase):
will be added to the results. This column will contain the index of the
query vector that the result is nearest to.
"""
if query_vector is None:
raise ValueError("query_vector can not be None")
if (
isinstance(query_vector, list)
and len(query_vector) > 0
@@ -1618,7 +1673,7 @@ class AsyncVectorQuery(AsyncQueryBase):
"""
Set the number of partitions to search (probe)
This argument is only used when the vector column has an IVF PQ index.
This argument is only used when the vector column has an IVF-based index.
If there is no index then this value is ignored.
The IVF stage of IVF PQ divides the input into partitions (clusters) of
@@ -1640,6 +1695,21 @@ class AsyncVectorQuery(AsyncQueryBase):
self._inner.nprobes(nprobes)
return self
def ef(self, ef: int) -> AsyncVectorQuery:
"""
Set the number of candidates to consider during search
This argument is only used when the vector column has an HNSW index.
If there is no index then this value is ignored.
Increasing this value will increase the recall of your query but will also
increase the latency of your query. The default value is 1.5 * limit. This
default is good for many cases but the best value to use will depend on your
data and the recall that you need to achieve.
"""
self._inner.ef(ef)
return self
def refine_factor(self, refine_factor: int) -> AsyncVectorQuery:
"""
A multiplier to control how many additional rows are taken during the refine

View File

@@ -86,6 +86,12 @@ class RemoteTable(Table):
"""to_pandas() is not yet supported on LanceDB cloud."""
return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
def checkout(self, version):
"""Check out `version` of the table.

Calls `checkout(version)` on the wrapped table, runs the resulting
awaitable to completion on this table's event loop, and returns its result.
"""
return self._loop.run_until_complete(self._table.checkout(version))
def checkout_latest(self):
"""Check out the latest version of the table.

Calls `checkout_latest()` on the wrapped table, runs the resulting
awaitable to completion on this table's event loop, and returns its result.
"""
return self._loop.run_until_complete(self._table.checkout_latest())
def list_indices(self):
"""List all the indices on the table"""
return self._loop.run_until_complete(self._table.list_indices())

View File

@@ -1012,6 +1012,18 @@ class Table(ABC):
The names of the columns to drop.
"""
@abstractmethod
def checkout(self, version):
    """
    Check out a specific version of the table.

    Implementations receive the version to check out; the signature
    matches `RemoteTable.checkout(self, version)`.

    Parameters
    ----------
    version
        The table version to check out.
        NOTE(review): presumably an integer version number — confirm
        against the concrete implementations.
    """

@abstractmethod
def checkout_latest(self):
    """
    Check out the latest version of the table.

    NOTE(review): presumably this undoes a previous `checkout(version)`
    so the table tracks the newest version again — confirm against the
    concrete implementations.
    """
@cached_property
def _dataset_uri(self) -> str:
return _table_uri(self._conn.uri, self.name)
@@ -1959,6 +1971,7 @@ class LanceTable(Table):
"metric": query.metric,
"nprobes": query.nprobes,
"refine_factor": query.refine_factor,
"ef": query.ef,
}
return ds.scanner(
columns=query.columns,
@@ -2697,7 +2710,7 @@ class AsyncTable:
def vector_search(
self,
query_vector: Optional[Union[VEC, Tuple]] = None,
query_vector: Union[VEC, Tuple],
) -> AsyncVectorQuery:
"""
Search the table with a given query vector.
@@ -2736,6 +2749,8 @@ class AsyncTable:
async_query = async_query.refine_factor(query.refine_factor)
if query.vector_column:
async_query = async_query.column(query.vector_column)
if query.ef:
async_query = async_query.ef(query.ef)
if not query.prefilter:
async_query = async_query.postfilter()

View File

@@ -1,21 +1,9 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
import unittest.mock as mock
from datetime import timedelta
from typing import Optional
import lance
import lancedb
from lancedb.index import IvfPq
import numpy as np
@@ -23,41 +11,15 @@ import pandas.testing as tm
import pyarrow as pa
import pytest
import pytest_asyncio
from lancedb.db import LanceDBConnection
from lancedb.pydantic import LanceModel, Vector
from lancedb.query import AsyncQueryBase, LanceVectorQueryBuilder, Query
from lancedb.table import AsyncTable, LanceTable
class MockTable:
def __init__(self, tmp_path):
self.uri = tmp_path
self._conn = LanceDBConnection(self.uri)
def to_lance(self):
return lance.dataset(self.uri)
def _execute_query(self, query, batch_size: Optional[int] = None):
ds = self.to_lance()
return ds.scanner(
columns=query.columns,
filter=query.filter,
prefilter=query.prefilter,
nearest={
"column": query.vector_column,
"q": query.vector,
"k": query.k,
"metric": query.metric,
"nprobes": query.nprobes,
"refine_factor": query.refine_factor,
},
batch_size=batch_size,
offset=query.offset,
).to_reader()
@pytest.fixture
def table(tmp_path) -> MockTable:
@pytest.fixture(scope="module")
def table(tmpdir_factory) -> lancedb.table.Table:
tmp_path = str(tmpdir_factory.mktemp("data"))
db = lancedb.connect(tmp_path)
df = pa.table(
{
"vector": pa.array(
@@ -68,8 +30,7 @@ def table(tmp_path) -> MockTable:
"float_field": pa.array([1.0, 2.0]),
}
)
lance.write_dataset(df, tmp_path)
return MockTable(tmp_path)
return db.create_table("test", df)
@pytest_asyncio.fixture
@@ -126,6 +87,12 @@ def test_query_builder(table):
assert all(np.array(rs[0]["vector"]) == [1, 2])
def test_with_row_id(table: lancedb.table.Table):
    """Check that ``with_row_id(True)`` adds a ``_rowid`` column to the result.

    The Arrow result must expose ``_rowid`` and its values must be ``[0, 1]``.
    """
    result = table.search().with_row_id(True).to_arrow()
    # Check the column is present before indexing into it.
    assert "_rowid" in result.column_names
    row_ids = result["_rowid"].to_pylist()
    assert row_ids == [0, 1]
def test_vector_query_with_no_limit(table):
with pytest.raises(ValueError):
LanceVectorQueryBuilder(table, [0, 0], "vector").limit(0).select(
@@ -365,6 +332,12 @@ async def test_query_to_pandas_async(table_async: AsyncTable):
assert df.shape == (0, 4)
@pytest.mark.asyncio
async def test_none_query(table_async: AsyncTable):
    """Check that a ``None`` query vector is rejected with ``ValueError``."""
    # The whole chain stays inside the context manager so that a ValueError
    # raised at any step of building or running the query satisfies the test.
    with pytest.raises(ValueError):
        builder = table_async.query().nearest_to(None)
        await builder.to_arrow()
@pytest.mark.asyncio
async def test_fast_search_async(tmp_path):
db = await lancedb.connect_async(tmp_path)

View File

@@ -185,6 +185,7 @@ def test_query_sync_minimal():
"k": 10,
"prefilter": False,
"refine_factor": None,
"ef": None,
"vector": [1.0, 2.0, 3.0],
"nprobes": 20,
}
@@ -223,6 +224,7 @@ def test_query_sync_maximal():
"refine_factor": 10,
"vector": [1.0, 2.0, 3.0],
"nprobes": 5,
"ef": None,
"filter": "id > 0",
"columns": ["id", "name"],
"vector_column": "vector2",
@@ -318,6 +320,7 @@ def test_query_sync_hybrid():
"refine_factor": None,
"vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
"nprobes": 20,
"ef": None,
"with_row_id": True,
}
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})

View File

@@ -195,6 +195,10 @@ impl VectorQuery {
self.inner = self.inner.clone().nprobes(nprobe as usize);
}
/// Applies the `ef` search parameter to the wrapped query.
///
/// The value is cast from `u32` to `usize` and passed to the inner query's
/// `ef` method; the result replaces `self.inner`.
pub fn ef(&mut self, ef: u32) {
    let ef = ef as usize;
    let updated = self.inner.clone().ef(ef);
    self.inner = updated;
}
/// Replaces the wrapped query with a copy on which `bypass_vector_index`
/// has been applied.
pub fn bypass_vector_index(&mut self) {
    let updated = self.inner.clone().bypass_vector_index();
    self.inner = updated;
}

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.13.0-beta.2"
version = "0.13.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.13.0-beta.2"
version = "0.13.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -48,9 +48,16 @@ async-openai = { version = "0.20.0", optional = true }
serde_with = { version = "3.8.1" }
aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
# For remote feature
reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
rand = { version = "0.8.3", features = ["small_rng"], optional = true}
http = { version = "1", optional = true } # Matching what is in reqwest
reqwest = { version = "0.12.0", default-features = false, features = [
"charset",
"gzip",
"http2",
"json",
"macos-system-configuration",
"stream",
], optional = true }
rand = { version = "0.8.3", features = ["small_rng"], optional = true }
http = { version = "1", optional = true } # Matching what is in reqwest
uuid = { version = "1.7.0", features = ["v4"], optional = true }
polars-arrow = { version = ">=0.37,<0.40.0", optional = true }
polars = { version = ">=0.37,<0.40.0", optional = true }
@@ -75,7 +82,7 @@ http-body = "1" # Matching reqwest
[features]
default = []
default = ["default-tls"]
remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
fp16kernels = ["lance-linalg/fp16kernels"]
s3-test = []
@@ -90,6 +97,11 @@ sentence-transformers = [
"dep:tokenizers"
]
# TLS
default-tls = ["reqwest?/default-tls"]
native-tls = ["reqwest?/native-tls"]
rustls-tls = ["reqwest?/rustls-tls"]
[[example]]
name = "openai"
required-features = ["openai"]

View File

@@ -704,6 +704,9 @@ pub struct VectorQuery {
// IVF PQ - ANN search.
pub(crate) query_vector: Vec<Arc<dyn Array>>,
pub(crate) nprobes: usize,
// The number of candidates to return during the refine step for HNSW,
// defaults to 1.5 * limit.
pub(crate) ef: Option<usize>,
pub(crate) refine_factor: Option<u32>,
pub(crate) distance_type: Option<DistanceType>,
/// Default is true. Set to false to enforce a brute force search.
@@ -717,6 +720,7 @@ impl VectorQuery {
column: None,
query_vector: Vec::new(),
nprobes: 20,
ef: None,
refine_factor: None,
distance_type: None,
use_index: true,
@@ -776,6 +780,18 @@ impl VectorQuery {
self
}
/// Set `ef`, the number of candidates to return during the refine step for HNSW
///
/// Only queries against a vector column with an HNSW index use this value;
/// when there is no index it is ignored.
///
/// Larger values increase the recall of the query but also increase its
/// latency. The default value is 1.5*limit.
pub fn ef(mut self, ef: usize) -> Self {
    self.ef = Option::from(ef);
    self
}
/// A multiplier to control how many additional rows are taken during the refine step
///
/// This argument is only used when the vector column has an IVF PQ index.

View File

@@ -22,6 +22,7 @@ use lance::dataset::scanner::DatasetRecordBatchStream;
use lance::dataset::{ColumnAlteration, NewColumnTransform};
use lance_datafusion::exec::OneShotExec;
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
use crate::{
connection::NoData,
@@ -43,17 +44,32 @@ pub struct RemoteTable<S: HttpSend = Sender> {
#[allow(dead_code)]
client: RestfulLanceDbClient<S>,
name: String,
version: RwLock<Option<u64>>,
}
impl<S: HttpSend> RemoteTable<S> {
pub fn new(client: RestfulLanceDbClient<S>, name: String) -> Self {
Self { client, name }
Self {
client,
name,
version: RwLock::new(None),
}
}
async fn describe(&self) -> Result<TableDescription> {
let request = self
let version = self.current_version().await;
self.describe_version(version).await
}
async fn describe_version(&self, version: Option<u64>) -> Result<TableDescription> {
let mut request = self
.client
.post(&format!("/v1/table/{}/describe/", self.name));
let body = serde_json::json!({ "version": version });
request = request.json(&body);
let (request_id, response) = self.client.send(request, true).await?;
let response = self.check_table_response(&request_id, response).await?;
@@ -196,6 +212,7 @@ impl<S: HttpSend> RemoteTable<S> {
body["prefilter"] = query.base.prefilter.into();
body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
body["nprobes"] = query.nprobes.into();
body["ef"] = query.ef.into();
body["refine_factor"] = query.refine_factor.into();
if let Some(vector_column) = query.column.as_ref() {
body["vector_column"] = serde_json::Value::String(vector_column.clone());
@@ -250,6 +267,24 @@ impl<S: HttpSend> RemoteTable<S> {
}
}
}
async fn check_mutable(&self) -> Result<()> {
let read_guard = self.version.read().await;
match *read_guard {
None => Ok(()),
Some(version) => Err(Error::NotSupported {
message: format!(
"Cannot mutate table reference fixed at version {}. Call checkout_latest() to get a mutable table reference.",
version
)
})
}
}
/// Returns the version stored on this table reference, or `None` when no
/// version is set.
async fn current_version(&self) -> Option<u64> {
    *self.version.read().await
}
}
#[derive(Deserialize)]
@@ -277,7 +312,11 @@ mod test_utils {
T: Into<reqwest::Body>,
{
let client = client_with_handler(handler);
Self { client, name }
Self {
client,
name,
version: RwLock::new(None),
}
}
}
}
@@ -296,17 +335,30 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
async fn version(&self) -> Result<u64> {
self.describe().await.map(|desc| desc.version)
}
async fn checkout(&self, _version: u64) -> Result<()> {
Err(Error::NotSupported {
message: "checkout is not supported on LanceDB cloud.".into(),
})
async fn checkout(&self, version: u64) -> Result<()> {
// check that the version exists
self.describe_version(Some(version))
.await
.map_err(|e| match e {
// try to map the error to a more user-friendly error telling them
// specifically that the version does not exist
Error::TableNotFound { name } => Error::TableNotFound {
name: format!("{} (version: {})", name, version),
},
e => e,
})?;
let mut write_guard = self.version.write().await;
*write_guard = Some(version);
Ok(())
}
async fn checkout_latest(&self) -> Result<()> {
Err(Error::NotSupported {
message: "checkout is not supported on LanceDB cloud.".into(),
})
let mut write_guard = self.version.write().await;
*write_guard = None;
Ok(())
}
async fn restore(&self) -> Result<()> {
self.check_mutable().await?;
Err(Error::NotSupported {
message: "restore is not supported on LanceDB cloud.".into(),
})
@@ -320,10 +372,13 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
.client
.post(&format!("/v1/table/{}/count_rows/", self.name));
let version = self.current_version().await;
if let Some(filter) = filter {
request = request.json(&serde_json::json!({ "predicate": filter }));
request = request.json(&serde_json::json!({ "predicate": filter, "version": version }));
} else {
request = request.json(&serde_json::json!({}));
let body = serde_json::json!({ "version": version });
request = request.json(&body);
}
let (request_id, response) = self.client.send(request, true).await?;
@@ -343,6 +398,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
add: AddDataBuilder<NoData>,
data: Box<dyn RecordBatchReader + Send>,
) -> Result<()> {
self.check_mutable().await?;
let body = Self::reader_as_body(data)?;
let mut request = self
.client
@@ -371,7 +427,8 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
) -> Result<Arc<dyn ExecutionPlan>> {
let request = self.client.post(&format!("/v1/table/{}/query/", self.name));
let body = serde_json::Value::Object(Default::default());
let version = self.current_version().await;
let body = serde_json::json!({ "version": version });
let bodies = Self::apply_vector_query_params(body, query)?;
let mut futures = Vec::with_capacity(bodies.len());
@@ -406,7 +463,8 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
.post(&format!("/v1/table/{}/query/", self.name))
.header(CONTENT_TYPE, JSON_CONTENT_TYPE);
let mut body = serde_json::Value::Object(Default::default());
let version = self.current_version().await;
let mut body = serde_json::json!({ "version": version });
Self::apply_query_params(&mut body, query)?;
// Empty vector can be passed if no vector search is performed.
body["vector"] = serde_json::Value::Array(Vec::new());
@@ -420,6 +478,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
Ok(DatasetRecordBatchStream::new(stream))
}
async fn update(&self, update: UpdateBuilder) -> Result<u64> {
self.check_mutable().await?;
let request = self
.client
.post(&format!("/v1/table/{}/update/", self.name));
@@ -441,6 +500,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
Ok(0) // TODO: support returning number of modified rows once supported in SaaS.
}
async fn delete(&self, predicate: &str) -> Result<()> {
self.check_mutable().await?;
let body = serde_json::json!({ "predicate": predicate });
let request = self
.client
@@ -452,6 +512,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
}
async fn create_index(&self, mut index: IndexBuilder) -> Result<()> {
self.check_mutable().await?;
let request = self
.client
.post(&format!("/v1/table/{}/create_index/", self.name));
@@ -530,6 +591,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
params: MergeInsertBuilder,
new_data: Box<dyn RecordBatchReader + Send>,
) -> Result<()> {
self.check_mutable().await?;
let query = MergeInsertRequest::try_from(params)?;
let body = Self::reader_as_body(new_data)?;
let request = self
@@ -546,6 +608,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
Ok(())
}
async fn optimize(&self, _action: OptimizeAction) -> Result<OptimizeStats> {
self.check_mutable().await?;
Err(Error::NotSupported {
message: "optimize is not supported on LanceDB cloud.".into(),
})
@@ -555,16 +618,19 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
_transforms: NewColumnTransform,
_read_columns: Option<Vec<String>>,
) -> Result<()> {
self.check_mutable().await?;
Err(Error::NotSupported {
message: "add_columns is not yet supported.".into(),
})
}
async fn alter_columns(&self, _alterations: &[ColumnAlteration]) -> Result<()> {
self.check_mutable().await?;
Err(Error::NotSupported {
message: "alter_columns is not yet supported.".into(),
})
}
async fn drop_columns(&self, _columns: &[&str]) -> Result<()> {
self.check_mutable().await?;
Err(Error::NotSupported {
message: "drop_columns is not yet supported.".into(),
})
@@ -572,9 +638,13 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
// Make request to list the indices
let request = self
let mut request = self
.client
.post(&format!("/v1/table/{}/index/list/", self.name));
let version = self.current_version().await;
let body = serde_json::json!({ "version": version });
request = request.json(&body);
let (request_id, response) = self.client.send(request, true).await?;
let response = self.check_table_response(&request_id, response).await?;
@@ -624,10 +694,14 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
}
async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>> {
let request = self.client.post(&format!(
let mut request = self.client.post(&format!(
"/v1/table/{}/index/{}/stats/",
self.name, index_name
));
let version = self.current_version().await;
let body = serde_json::json!({ "version": version });
request = request.json(&body);
let (request_id, response) = self.client.send(request, true).await?;
if response.status() == StatusCode::NOT_FOUND {
@@ -805,7 +879,10 @@ mod tests {
request.headers().get("Content-Type").unwrap(),
JSON_CONTENT_TYPE
);
assert_eq!(request.body().unwrap().as_bytes().unwrap(), br#"{}"#);
assert_eq!(
request.body().unwrap().as_bytes().unwrap(),
br#"{"version":null}"#
);
http::Response::builder().status(200).body("42").unwrap()
});
@@ -822,7 +899,7 @@ mod tests {
);
assert_eq!(
request.body().unwrap().as_bytes().unwrap(),
br#"{"predicate":"a > 10"}"#
br#"{"predicate":"a > 10","version":null}"#
);
http::Response::builder().status(200).body("42").unwrap()
@@ -1121,7 +1198,9 @@ mod tests {
"prefilter": true,
"distance_type": "l2",
"nprobes": 20,
"ef": Option::<usize>::None,
"refine_factor": null,
"version": null,
});
// Pass vector separately to make sure it matches f32 precision.
expected_body["vector"] = vec![0.1f32, 0.2, 0.3].into();
@@ -1166,7 +1245,9 @@ mod tests {
"bypass_vector_index": true,
"columns": ["a", "b"],
"nprobes": 12,
"ef": Option::<usize>::None,
"refine_factor": 2,
"version": null,
});
// Pass vector separately to make sure it matches f32 precision.
expected_body["vector"] = vec![0.1f32, 0.2, 0.3].into();
@@ -1222,6 +1303,7 @@ mod tests {
"k": 10,
"vector": [],
"with_row_id": true,
"version": null
});
assert_eq!(body, expected_body);
@@ -1451,4 +1533,195 @@ mod tests {
let indices = table.index_stats("my_index").await.unwrap();
assert!(indices.is_none());
}
/// After `checkout(42)`, every read call made by this test must send
/// `"version": 42` in its JSON request body. The mock handler asserts this on
/// each request before producing an endpoint-specific response.
#[tokio::test]
async fn test_passes_version() {
    let table = Table::new_with_handler("my_table", |request| {
        // Every request, whatever the endpoint, must carry the checked-out version.
        let body = request.body().unwrap().as_bytes().unwrap();
        let body: serde_json::Value = serde_json::from_slice(body).unwrap();
        let version = body
            .as_object()
            .unwrap()
            .get("version")
            .unwrap()
            .as_u64()
            .unwrap();
        assert_eq!(version, 42);

        let response_body = match request.url().path() {
            "/v1/table/my_table/describe/" => {
                serde_json::json!({
                    "version": 42,
                    "schema": { "fields": [] }
                })
            }
            "/v1/table/my_table/index/list/" => {
                serde_json::json!({
                    "indexes": []
                })
            }
            "/v1/table/my_table/index/my_idx/stats/" => {
                serde_json::json!({
                    "num_indexed_rows": 100000,
                    "num_unindexed_rows": 0,
                    "index_type": "IVF_PQ",
                    "distance_type": "l2"
                })
            }
            "/v1/table/my_table/count_rows/" => {
                serde_json::json!(1000)
            }
            "/v1/table/my_table/query/" => {
                // The query endpoint answers with an Arrow IPC payload instead
                // of JSON, so it returns early with its own content type.
                let expected_data = RecordBatch::try_new(
                    Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
                    vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
                )
                .unwrap();
                let response_body = write_ipc_file(&expected_data);
                return http::Response::builder()
                    .status(200)
                    .header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
                    .body(response_body)
                    .unwrap();
            }
            path => panic!("Unexpected path: {}", path),
        };

        http::Response::builder()
            .status(200)
            // `into_bytes` reuses the String's buffer instead of copying it.
            .body(serde_json::to_string(&response_body).unwrap().into_bytes())
            .unwrap()
    });

    table.checkout(42).await.unwrap();

    // ensure that version is passed to the /describe endpoint
    let version = table.version().await.unwrap();
    assert_eq!(version, 42);

    // ensure it's passed to other read API calls
    table.list_indices().await.unwrap();
    table.index_stats("my_idx").await.unwrap();
    table.count_rows(None).await.unwrap();
    table
        .query()
        .nearest_to(vec![0.1, 0.2, 0.3])
        .unwrap()
        .execute()
        .await
        .unwrap();
}
#[tokio::test]
async fn test_fails_if_checkout_version_doesnt_exist() {
let table = Table::new_with_handler("my_table", |request| {
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
let version = body
.as_object()
.unwrap()
.get("version")
.unwrap()
.as_u64()
.unwrap();
if version != 42 {
return http::Response::builder()
.status(404)
.body(format!("Table my_table (version: {}) not found", version))
.unwrap();
}
let response_body = match request.url().path() {
"/v1/table/my_table/describe/" => {
serde_json::json!({
"version": 42,
"schema": { "fields": [] }
})
}
_ => panic!("Unexpected path"),
};
http::Response::builder()
.status(200)
.body(serde_json::to_string(&response_body).unwrap())
.unwrap()
});
let res = table.checkout(43).await;
println!("{:?}", res);
assert!(
matches!(res, Err(Error::TableNotFound { name }) if name == "my_table (version: 43)")
);
}
#[tokio::test]
async fn test_timetravel_immutable() {
let table = Table::new_with_handler::<String>("my_table", |request| {
let response_body = match request.url().path() {
"/v1/table/my_table/describe/" => {
serde_json::json!({
"version": 42,
"schema": { "fields": [] }
})
}
_ => panic!("Should not have made a request: {:?}", request),
};
http::Response::builder()
.status(200)
.body(serde_json::to_string(&response_body).unwrap())
.unwrap()
});
table.checkout(42).await.unwrap();
// Ensure that all mutable operations fail.
let res = table
.update()
.column("a", "a + 1")
.column("b", "b - 1")
.only_if("b > 10")
.execute()
.await;
assert!(matches!(res, Err(Error::NotSupported { .. })));
let batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)
.unwrap();
let data = Box::new(RecordBatchIterator::new(
[Ok(batch.clone())],
batch.schema(),
));
let res = table.merge_insert(&["some_col"]).execute(data).await;
assert!(matches!(res, Err(Error::NotSupported { .. })));
let res = table.delete("id in (1, 2, 3)").await;
assert!(matches!(res, Err(Error::NotSupported { .. })));
let data = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)
.unwrap();
let res = table
.add(RecordBatchIterator::new([Ok(data.clone())], data.schema()))
.execute()
.await;
assert!(matches!(res, Err(Error::NotSupported { .. })));
let res = table
.create_index(&["a"], Index::IvfPq(Default::default()))
.execute()
.await;
assert!(matches!(res, Err(Error::NotSupported { .. })));
}
}

View File

@@ -1904,6 +1904,9 @@ impl TableInternal for NativeTable {
query.base.offset.map(|offset| offset as i64),
)?;
scanner.nprobs(query.nprobes);
if let Some(ef) = query.ef {
scanner.ef(ef);
}
scanner.use_index(query.use_index);
scanner.prefilter(query.base.prefilter);
match query.base.select {