Compare commits

..

18 Commits

Author SHA1 Message Date
rmeng
60f6dc6a64 chore: switch over to requests for remote client 2024-01-09 22:35:15 -05:00
Chang She
99ba5331f0 feat(python): support new style optional syntax (#793) 2024-01-09 07:03:29 -08:00
Chang She
121687231c chore(python): document phrase queries in fts (#788)
closes #769 

Add unit test and documentation on using quotes to perform a phrase
query
2024-01-08 21:49:31 -08:00
Chang She
ac40d4b235 feat(node): support table.schema for LocalTable (#789)
Close #773 

we pass an empty table over IPC so we don't need to manually deal with
serde. Then we just return the schema attribute from the empty table.

---------

Co-authored-by: albertlockett <albert.lockett@gmail.com>
2024-01-08 21:12:48 -08:00
Lei Xu
c5a52565ac chore: bump lance to 0.9.5 (#790) 2024-01-07 19:27:47 -08:00
Chang She
b0a88a7286 feat(python): Set heap size to get faster fts indexing performance (#762)
By default tantivy-py uses 128MB heapsize. We change the default to 1GB
and we allow the user to customize this

locally this makes `test_fts.py` run 10x faster
2024-01-07 15:15:13 -08:00
lucasiscovici
d41d849e0e raise exception if fts index does not exist (#776)
raise exception if fts index does not exist

---------

Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
2024-01-07 14:34:04 -08:00
sudhir
bf5202f196 Make examples work with current version of Openai api's (#779)
These examples don't work because of changes in openai api from version
1+
2024-01-07 14:27:56 -08:00
Chris
8be2861061 Minor Fixes to Ingest Embedding Functions Docs (#777)
Addressed minor typos and grammatical issues to improve readability

---------

Co-authored-by: Christopher Correa <chris.correa@gmail.com>
2024-01-07 14:27:40 -08:00
Vladimir Varankin
0560e3a0e5 Minor corrections for docs of embedding_functions (#780)
In addition to #777, this pull request fixes more typos in the
documentation for "Ingest Embedding Functions".
2024-01-07 14:26:35 -08:00
QianZhu
b83fbfc344 small bug fix for example code in SaaS JS doc (#770) 2024-01-04 14:30:34 -08:00
Chang She
60b22d84bf chore(python): handle NaN input in fts ingestion (#763)
If the input text is None, Tantivy raises an error
complaining it cannot add a NoneType. We handle this
upstream so None's are not added to the document.
If all of the indexed fields are None then we skip
this document.
2024-01-04 11:45:12 -08:00
Bengsoon Chuah
7d55a94efd Add relevant imports for each step (#764)
I found that it was quite incoherent to have to read through the
documentation and having to search which submodule that each class
should be imported from.

For example, it is cumbersome to have to navigate to another
documentation page to find out that `EmbeddingFunctionRegistry` is from
`lancedb.embeddings`
2024-01-04 11:15:42 -08:00
QianZhu
4d8e401d34 SaaS JS API sdk doc (#740)
Co-authored-by: Aidan <64613310+aidangomar@users.noreply.github.com>
2024-01-03 16:24:21 -08:00
Chang She
684eb8b087 feat(js): support list of string input (#755)
Add support for adding lists of string input (e.g., list of categorical
labels)

Follow-up items: #757 #758
2024-01-02 20:55:33 -08:00
Lance Release
4e3b82feaa Updating package-lock.json 2023-12-30 03:16:41 +00:00
Lance Release
8e248a9d67 Updating package-lock.json 2023-12-30 00:53:51 +00:00
Lance Release
065ffde443 Bump version: 0.4.1 → 0.4.2 2023-12-30 00:53:30 +00:00
37 changed files with 1190 additions and 344 deletions

View File

@@ -1,5 +1,5 @@
[bumpversion] [bumpversion]
current_version = 0.4.1 current_version = 0.4.2
commit = True commit = True
message = Bump version: {current_version} → {new_version} message = Bump version: {current_version} → {new_version}
tag = True tag = True

View File

@@ -88,6 +88,9 @@ jobs:
cd docs/test cd docs/test
node md_testing.js node md_testing.js
- name: Test - name: Test
env:
LANCEDB_URI: ${{ secrets.LANCEDB_URI }}
LANCEDB_DEV_API_KEY: ${{ secrets.LANCEDB_DEV_API_KEY }}
run: | run: |
cd docs/test/node cd docs/test/node
for d in *; do cd "$d"; echo "$d".js; node "$d".js; cd ..; done for d in *; do cd "$d"; echo "$d".js; node "$d".js; cd ..; done

View File

@@ -5,10 +5,10 @@ exclude = ["python"]
resolver = "2" resolver = "2"
[workspace.dependencies] [workspace.dependencies]
lance = { "version" = "=0.9.2", "features" = ["dynamodb"] } lance = { "version" = "=0.9.5", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.9.2" } lance-index = { "version" = "=0.9.5" }
lance-linalg = { "version" = "=0.9.2" } lance-linalg = { "version" = "=0.9.5" }
lance-testing = { "version" = "=0.9.2" } lance-testing = { "version" = "=0.9.5" }
# Note that this one does not include pyarrow # Note that this one does not include pyarrow
arrow = { version = "49.0.0", optional = false } arrow = { version = "49.0.0", optional = false }
arrow-array = "49.0" arrow-array = "49.0"

View File

@@ -149,6 +149,7 @@ nav:
- OSS Python API: python/python.md - OSS Python API: python/python.md
- SaaS Python API: python/saas-python.md - SaaS Python API: python/saas-python.md
- Javascript API: javascript/modules.md - Javascript API: javascript/modules.md
- SaaS Javascript API: javascript/saas-modules.md
- LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms - LanceDB Cloud↗: https://noteforms.com/forms/lancedb-mailing-list-cloud-kty1o5?notionforms=1&utm_source=notionforms
extra_css: extra_css:

View File

@@ -164,6 +164,7 @@ You can further filter the elements returned by a search using a where clause.
const results_2 = await table const results_2 = await table
.search(Array(1536).fill(1.2)) .search(Array(1536).fill(1.2))
.where("id != '1141'") .where("id != '1141'")
.limit(2)
.execute() .execute()
``` ```
@@ -187,6 +188,7 @@ You can select the columns returned by the query using a select clause.
const results_3 = await table const results_3 = await table
.search(Array(1536).fill(1.2)) .search(Array(1536).fill(1.2))
.select(["id"]) .select(["id"])
.limit(2)
.execute() .execute()
``` ```

View File

@@ -1,13 +1,14 @@
Representing multi-modal data as vector embeddings is becoming a standard practice. Embedding functions themselves be thought of as a part of the processing pipeline that each request(input) has to be passed through. After initial setup these components are not expected to change for a particular project. Representing multi-modal data as vector embeddings is becoming a standard practice. Embedding functions themselves can be thought of as a part of the processing pipeline that each request(input) has to be passed through. After initial setup these components are not expected to change for a particular project.
This is main motivation behind our new embedding functions API, that allow you simply set it up once and the table remembers it, effectively making the **embedding functions disappear in the background** so you don't have to worry about modelling and simply focus on the DB aspects of VectorDB.
Our new embedding functions API allow you simply set it up once and the table remembers it, effectively making the **embedding functions disappear in the background** so you don't have to worry about modelling and can simply focus on the DB aspects of VectorDB.
You can simply follow these steps and forget about the details of your embedding functions as long as you don't intend to change it. You can simply follow these steps and forget about the details of your embedding functions as long as you don't intend to change it.
### Step 1 - Define the embedding function ### Step 1 - Define the embedding function
We have some pre-defined embedding functions in the global registry with more coming soon. Here's an implementation of CLIP as an example. We have some pre-defined embedding functions in the global registry with more coming soon. Here's an implementation of CLIP as an example.
``` ```
from lancedb.embeddings import EmbeddingFunctionRegistry
registry = EmbeddingFunctionRegistry.get_instance() registry = EmbeddingFunctionRegistry.get_instance()
clip = registry.get("open-clip").create() clip = registry.get("open-clip").create()
@@ -15,9 +16,11 @@ clip = registry.get("open-clip").create()
You can also define your own embedding function by implementing the `EmbeddingFunction` abstract base interface. It subclasses PyDantic Model which can be utilized to write complex schemas simply as we'll see next! You can also define your own embedding function by implementing the `EmbeddingFunction` abstract base interface. It subclasses PyDantic Model which can be utilized to write complex schemas simply as we'll see next!
### Step 2 - Define the Data Model or Schema ### Step 2 - Define the Data Model or Schema
Our embedding function from the previous section abstracts away all the details about the models and dimensions required to define the schema. You can simply set a feild as **source** or **vector** column. Here's how Our embedding function from the previous section abstracts away all the details about the models and dimensions required to define the schema. You can simply set a field as **source** or **vector** column. Here's how
```python ```python
from lancedb.pydantic import LanceModel, Vector
class Pets(LanceModel): class Pets(LanceModel):
vector: Vector(clip.ndims) = clip.VectorField() vector: Vector(clip.ndims) = clip.VectorField()
image_uri: str = clip.SourceField() image_uri: str = clip.SourceField()
@@ -30,11 +33,13 @@ class Pets(LanceModel):
Now that we have chosen/defined our embedding function and the schema, we can create the table Now that we have chosen/defined our embedding function and the schema, we can create the table
```python ```python
import lancedb
db = lancedb.connect("~/lancedb") db = lancedb.connect("~/lancedb")
table = db.create_table("pets", schema=Pets) table = db.create_table("pets", schema=Pets)
``` ```
That's it! We have ingested all the information needed to embed source and query inputs. We can now forget about the model and dimension details and start to build or VectorDB
That's it! We have ingested all the information needed to embed source and query inputs. We can now forget about the model and dimension details and start to build our VectorDB.
### Step 4 - Ingest lots of data and run vector search! ### Step 4 - Ingest lots of data and run vector search!
Now you can just add the data and it'll be vectorized automatically Now you can just add the data and it'll be vectorized automatically
@@ -52,29 +57,32 @@ result = table.search("dog")
Let's query an image Let's query an image
```python ```python
from pathlib import Path
p = Path("path/to/images/samoyed_100.jpg") p = Path("path/to/images/samoyed_100.jpg")
query_image = Image.open(p) query_image = Image.open(p)
table.search(query_image) table.search(query_image)
``` ```
### Rate limit Handling ### Rate limit Handling
`EmbeddingFunction` class wraps the calls for source and query embedding generation inside a rate limit handler that retries the requests with exponential backoff after successive failures. By default the maximum retries is set to 7. You can tune it by setting it to a different number or disable it by setting it to 0. `EmbeddingFunction` class wraps the calls for source and query embedding generation inside a rate limit handler that retries the requests with exponential backoff after successive failures. By default the maximum retries is set to 7. You can tune it by setting it to a different number or disable it by setting it to 0. Example:
Example
----
```python ```python
clip = registry.get("open-clip").create() # Defaults to 7 max retries clip = registry.get("open-clip").create() # Defaults to 7 max retries
clip = registry.get("open-clip").create(max_retries=10) # Increase max retries to 10 clip = registry.get("open-clip").create(max_retries=10) # Increase max retries to 10
clip = registry.get("open-clip").create(max_retries=0) # Retries disabled clip = registry.get("open-clip").create(max_retries=0) # Retries disabled
```` ```
NOTE: NOTE:
Embedding functions can also fail due to other errors that have nothing to do with rate limits. This is why the error is also logged. Embedding functions can also fail due to other errors that have nothing to do with rate limits. This is why the errors are also logged.
### A little fun with PyDantic ### A little fun with PyDantic
LanceDB is integrated with PyDantic. Infact we've used the integration in the above example to define the schema. It is also being used behing the scene by the embdding function API to ingest useful information as table metadata. LanceDB is integrated with PyDantic. In fact, we've used the integration in the above example to define the schema. It is also being used behind the scene by the embedding function API to ingest useful information as table metadata.
You can also use it for adding utility operations in the schema. For example, in our multi-modal example, you can search images using text or another image. Let us define a utility function to plot the image. You can also use it for adding utility operations in the schema. For example, in our multi-modal example, you can search images using text or another image. Let's define a utility function to plot the image.
```python ```python
from lancedb.pydantic import LanceModel, Vector
class Pets(LanceModel): class Pets(LanceModel):
vector: Vector(clip.ndims) = clip.VectorField() vector: Vector(clip.ndims) = clip.VectorField()
image_uri: str = clip.SourceField() image_uri: str = clip.SourceField()
@@ -83,7 +91,8 @@ class Pets(LanceModel):
def image(self): def image(self):
return Image.open(self.image_uri) return Image.open(self.image_uri)
``` ```
Now, you can convert your search results to pydantic model and use this property.
Now, you can convert your search results to a PyDantic model and use its property.
```python ```python
rs = table.search(query_image).limit(3).to_pydantic(Pets) rs = table.search(query_image).limit(3).to_pydantic(Pets)
@@ -92,4 +101,4 @@ rs[2].image
![](../assets/dog_clip_output.png) ![](../assets/dog_clip_output.png)
Now that you've the basic idea about LanceDB embedding function, let us now dive deeper into the API that you can use to implement your own embedding functions! Now that you have the basic idea about LanceDB embedding function, let us dive deeper into the API that you can use to implement your own embedding functions!

View File

@@ -75,6 +75,34 @@ applied on top of the full text search results. This can be invoked via the fami
table.search("puppy").limit(10).where("meta='foo'").to_list() table.search("puppy").limit(10).where("meta='foo'").to_list()
``` ```
## Syntax
For full-text search you can perform either a phrase query like "the old man and the sea",
or a structured search query like "(Old AND Man) AND Sea".
Double quotes are used to disambiguate.
For example:
If you intended "they could have been dogs OR cats" as a phrase query, this actually
raises a syntax error since `OR` is a recognized operator. If you make `or` lower case,
this avoids the syntax error. However, it is cumbersome to have to remember what will
conflict with the query syntax. Instead, if you search using
`table.search('"they could have been dogs OR cats"')`, then the syntax checker avoids
checking inside the quotes.
## Configurations
By default, LanceDB configures a 1GB heap size limit for creating the index. You can
reduce this if running on a smaller node, or increase this for faster performance while
indexing a larger corpus.
```python
# configure a 512MB heap size
heap = 1024 * 1024 * 512
table.create_fts_index(["text1", "text2"], writer_heap_size=heap, replace=True)
```
## Current limitations ## Current limitations
1. Currently we do not yet support incremental writes. 1. Currently we do not yet support incremental writes.

View File

@@ -0,0 +1,226 @@
[vectordb](../README.md) / [Exports](../saas-modules.md) / RemoteConnection
# Class: RemoteConnection
A connection to a remote LanceDB database. The class RemoteConnection implements interface Connection
## Implements
- [`Connection`](../interfaces/Connection.md)
## Table of contents
### Constructors
- [constructor](RemoteConnection.md#constructor)
### Methods
- [createTable](RemoteConnection.md#createtable)
- [tableNames](RemoteConnection.md#tablenames)
- [openTable](RemoteConnection.md#opentable)
- [dropTable](RemoteConnection.md#droptable)
## Constructors
### constructor
**new RemoteConnection**(`client`, `dbName`)
#### Parameters
| Name | Type |
| :------ | :------ |
| `client` | `HttpLancedbClient` |
| `dbName` | `string` |
#### Defined in
[remote/index.ts:37](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L37)
## Methods
### createTable
**createTable**(`name`, `data`, `mode?`): `Promise`<[`Table`](../interfaces/Table.md)<`number`[]\>\>
Creates a new Table and initializes it with new data.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `name` | `string` | The name of the table. |
| `data` | `Record`<`string`, `unknown`\>[] | Non-empty Array of Records to be inserted into the Table |
| `mode?` | [`WriteMode`](../enums/WriteMode.md) | The write mode to use when creating the table. |
#### Returns
`Promise`<[`Table`](../interfaces/Table.md)<`number`[]\>\>
#### Implementation of
[Connection](../interfaces/Connection.md).[createTable](../interfaces/Connection.md#createtable)
#### Defined in
[remote/index.ts:75](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L75)
**createTable**(`name`, `data`, `mode`): `Promise`<[`Table`](../interfaces/Table.md)<`number`[]\>\>
#### Parameters
| Name | Type |
| :------ | :------ |
| `name` | `string` |
| `data` | `Record`<`string`, `unknown`\>[] |
| `mode` | [`WriteMode`](../enums/WriteMode.md) |
| `embeddings` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)<`T`\> | An embedding function to use on this Table |
#### Returns
`Promise`<[`Table`](../interfaces/Table.md)<`number`[]\>\>
#### Implementation of
Connection.createTable
#### Defined in
[remote/index.ts:231](https://github.com/lancedb/lancedb/blob/b1eeb90/node/src/index.ts#L231)
___
### dropTable
**dropTable**(`name`): `Promise`<`void`\>
Drop an existing table.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `name` | `string` | The name of the table to drop. |
#### Returns
`Promise`<`void`\>
#### Implementation of
[Connection](../interfaces/Connection.md).[dropTable](../interfaces/Connection.md#droptable)
#### Defined in
[remote/index.ts:131](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L131)
___
### openTable
**openTable**(`name`): `Promise`<[`Table`](../interfaces/Table.md)<`number`[]\>\>
Open a table in the database.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `name` | `string` | The name of the table. |
#### Returns
`Promise`<[`Table`](../interfaces/Table.md)<`number`[]\>\>
#### Implementation of
[Connection](../interfaces/Connection.md).[openTable](../interfaces/Connection.md#opentable)
#### Defined in
[remote/index.ts:65](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L65)
**openTable**<`T`\>(`name`, `embeddings`): `Promise`<[`Table`](../interfaces/Table.md)<`T`\>\>
Open a table in the database.
#### Type parameters
| Name |
| :------ |
| `T` |
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `name` | `string` | The name of the table. |
| `embeddings` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)<`T`\> | An embedding function to use on this Table |
#### Returns
`Promise`<[`Table`](../interfaces/Table.md)<`T`\>\>
#### Implementation of
Connection.openTable
#### Defined in
[remote/index.ts:66](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L66)
**openTable**<`T`\>(`name`, `embeddings?`): `Promise`<[`Table`](../interfaces/Table.md)<`T`\>\>
#### Type parameters
| Name |
| :------ |
| `T` |
#### Parameters
| Name | Type |
| :------ | :------ |
| `name` | `string` |
| `embeddings?` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)<`T`\> |
#### Returns
`Promise`<[`Table`](../interfaces/Table.md)<`T`\>\>
#### Implementation of
Connection.openTable
#### Defined in
[remote/index.ts:67](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L67)
___
### tableNames
**tableNames**(`pageToken?`, `limit?`): `Promise`<`string`[]\>
Get the names of all tables in the database, with pagination.
#### Parameters
| Name | Type |
| :------ | :------ |
| `pageToken` | `string` |
| `limit` | `int` |
#### Returns
`Promise`<`string`[]\>
#### Implementation of
[Connection](../interfaces/Connection.md).[tableNames](../interfaces/Connection.md#tablenames)
#### Defined in
[remote/index.ts:60](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L60)

View File

@@ -0,0 +1,76 @@
[vectordb](../README.md) / [Exports](../saas-modules.md) / RemoteQuery
# Class: Query<T\>
A builder for nearest neighbor queries for LanceDB.
## Type parameters
| Name | Type |
| :------ | :------ |
| `T` | `number`[] |
## Table of contents
### Constructors
- [constructor](RemoteQuery.md#constructor)
### Properties
- [\_embeddings](RemoteQuery.md#_embeddings)
- [\_query](RemoteQuery.md#_query)
- [\_name](RemoteQuery.md#_name)
- [\_client](RemoteQuery.md#_client)
### Methods
- [execute](RemoteQuery.md#execute)
## Constructors
### constructor
**new Query**<`T`\>(`name`, `client`, `query`, `embeddings?`)
#### Type parameters
| Name | Type |
| :------ | :------ |
| `T` | `number`[] |
#### Parameters
| Name | Type |
| :------ | :------ |
| `name` | `string` |
| `client` | `HttpLancedbClient` |
| `query` | `T` |
| `embeddings?` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)<`T`\> |
#### Defined in
[remote/index.ts:137](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L137)
## Methods
### execute
**execute**<`T`\>(): `Promise`<`T`[]\>
Execute the query and return the results as an Array of Objects
#### Type parameters
| Name | Type |
| :------ | :------ |
| `T` | `Record`<`string`, `unknown`\> |
#### Returns
`Promise`<`T`[]\>
#### Defined in
[remote/index.ts:143](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L143)

View File

@@ -0,0 +1,355 @@
[vectordb](../README.md) / [Exports](../saas-modules.md) / RemoteTable
# Class: RemoteTable<T\>
A LanceDB Table is the collection of Records. Each Record has one or more vector fields.
## Type parameters
| Name | Type |
| :------ | :------ |
| `T` | `number`[] |
## Implements
- [`Table`](../interfaces/Table.md)<`T`\>
## Table of contents
### Constructors
- [constructor](RemoteTable.md#constructor)
### Properties
- [\_name](RemoteTable.md#_name)
- [\_client](RemoteTable.md#_client)
- [\_embeddings](RemoteTable.md#_embeddings)
### Accessors
- [name](RemoteTable.md#name)
### Methods
- [add](RemoteTable.md#add)
- [countRows](RemoteTable.md#countrows)
- [createIndex](RemoteTable.md#createindex)
- [delete](RemoteTable.md#delete)
- [listIndices](RemoteTable.md#listindices)
- [indexStats](RemoteTable.md#indexstats)
- [overwrite](RemoteTable.md#overwrite)
- [search](RemoteTable.md#search)
- [schema](RemoteTable.md#schema)
- [update](RemoteTable.md#update)
## Constructors
### constructor
**new RemoteTable**<`T`\>(`client`, `name`)
#### Type parameters
| Name | Type |
| :------ | :------ |
| `T` | `number`[] |
#### Parameters
| Name | Type |
| :------ | :------ |
| `client` | `HttpLancedbClient` |
| `name` | `string` |
#### Defined in
[remote/index.ts:186](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L186)
**new RemoteTable**<`T`\>(`client`, `name`, `embeddings`)
#### Type parameters
| Name | Type |
| :------ | :------ |
| `T` | `number`[] |
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `client` | `HttpLancedbClient` | |
| `name` | `string` | |
| `embeddings` | [`EmbeddingFunction`](../interfaces/EmbeddingFunction.md)<`T`\> | An embedding function to use when interacting with this table |
#### Defined in
[remote/index.ts:187](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L187)
## Accessors
### name
`get` **name**(): `string`
#### Returns
`string`
#### Implementation of
[Table](../interfaces/Table.md).[name](../interfaces/Table.md#name)
#### Defined in
[remote/index.ts:194](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L194)
## Methods
### add
**add**(`data`): `Promise`<`number`\>
Insert records into this Table.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `data` | `Record`<`string`, `unknown`\>[] | Records to be inserted into the Table |
#### Returns
`Promise`<`number`\>
The number of rows added to the table
#### Implementation of
[Table](../interfaces/Table.md).[add](../interfaces/Table.md#add)
#### Defined in
[remote/index.ts:293](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L293)
___
### countRows
**countRows**(): `Promise`<`number`\>
Returns the number of rows in this table.
#### Returns
`Promise`<`number`\>
#### Implementation of
[Table](../interfaces/Table.md).[countRows](../interfaces/Table.md#countrows)
#### Defined in
[remote/index.ts:290](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L290)
___
### createIndex
**createIndex**(`metric_type`, `column`, `index_cache_size`): `Promise`<`any`\>
Create an ANN index on this Table's vector column.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `metric_type` | `string` | distance metric type, L2 or cosine or dot |
| `column` | `string` | the name of the column to be indexed |
#### Returns
`Promise`<`any`\>
#### Implementation of
[Table](../interfaces/Table.md).[createIndex](../interfaces/Table.md#createindex)
#### Defined in
[remote/index.ts:249](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L249)
___
### delete
**delete**(`filter`): `Promise`<`void`\>
Delete rows from this table.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `filter` | `string` | A filter in the same format used by a sql WHERE clause. |
#### Returns
`Promise`<`void`\>
#### Implementation of
[Table](../interfaces/Table.md).[delete](../interfaces/Table.md#delete)
#### Defined in
[remote/index.ts:295](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L295)
___
### overwrite
**overwrite**(`data`): `Promise`<`number`\>
Insert records into this Table, replacing its contents.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `data` | `Record`<`string`, `unknown`\>[] | Records to be inserted into the Table |
#### Returns
`Promise`<`number`\>
The number of rows added to the table
#### Implementation of
[Table](../interfaces/Table.md).[overwrite](../interfaces/Table.md#overwrite)
#### Defined in
[remote/index.ts:231](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L231)
___
### search
**search**(`query`): [`Query`](Query.md)<`T`\>
Creates a search query to find the nearest neighbors of the given search term
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `query` | `T` | The query search term |
#### Returns
[`Query`](Query.md)<`T`\>
#### Implementation of
[Table](../interfaces/Table.md).[search](../interfaces/Table.md#search)
#### Defined in
[remote/index.ts:209](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L209)
___
### update
**update**(`args`): `Promise`<`void`\>
Update zero to all rows depending on how many rows match the where clause.
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `args` | `UpdateArgs` or `UpdateSqlArgs` | The query search arguments |
#### Returns
`Promise`<`any`\>
#### Implementation of
[Table](../interfaces/Table.md).[update](../interfaces/Table.md#update)
#### Defined in
[remote/index.ts:299](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L299)
___
### schema
**schema**(): `Promise`<`void`\>
Get the schema of the table
#### Returns
`Promise`<`any`\>
#### Implementation of
[Table](../interfaces/Table.md).[schema](../interfaces/Table.md#schema)
#### Defined in
[remote/index.ts:198](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L198)
___
### listIndices
**listIndices**(): `Promise`<`void`\>
List the indices of the table
#### Returns
`Promise`<`any`\>
#### Implementation of
[Table](../interfaces/Table.md).[listIndices](../interfaces/Table.md#listindices)
#### Defined in
[remote/index.ts:319](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L319)
___
### indexStats
**indexStats**(`indexUuid`): `Promise`<`void`\>
Get the number of indexed/unindexed rows in the table
#### Parameters
| Name | Type | Description |
| :------ | :------ | :------ |
| `indexUuid` | `string` | the uuid of the index |
#### Returns
`Promise`<`numIndexedRows`\>
`Promise`<`numUnindexedRows`\>
#### Implementation of
[Table](../interfaces/Table.md).[indexStats](../interfaces/Table.md#indexstats)
#### Defined in
[remote/index.ts:328](https://github.com/lancedb/lancedb/blob/main/node/src/remote/index.ts#L328)

View File

@@ -0,0 +1,92 @@
# Table of contents
## Installation
```bash
npm install vectordb
```
This will download the appropriate native library for your platform. We currently
support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
yet support Windows or musl-based Linux (such as Alpine Linux).
## Classes
- [RemoteConnection](classes/RemoteConnection.md)
- [RemoteTable](classes/RemoteTable.md)
- [RemoteQuery](classes/RemoteQuery.md)
## Methods
- [add](classes/RemoteTable.md#add)
- [countRows](classes/RemoteTable.md#countrows)
- [createIndex](classes/RemoteTable.md#createindex)
- [createTable](classes/RemoteConnection.md#createtable)
- [delete](classes/RemoteTable.md#delete)
- [dropTable](classes/RemoteConnection.md#droptable)
- [listIndices](classes/RemoteTable.md#listindices)
- [indexStats](classes/RemoteTable.md#indexstats)
- [openTable](classes/RemoteConnection.md#opentable)
- [overwrite](classes/RemoteTable.md#overwrite)
- [schema](classes/RemoteTable.md#schema)
- [search](classes/RemoteTable.md#search)
- [tableNames](classes/RemoteConnection.md#tablenames)
- [update](classes/RemoteTable.md#update)
## Example code
```javascript
const lancedb = require('vectordb');
const { Schema, Field, Int32, Float32, Utf8, FixedSizeList } = require ("apache-arrow/Arrow.node")
// connect to a remote DB
const devApiKey = process.env.LANCEDB_DEV_API_KEY
const dbURI = process.env.LANCEDB_URI
const db = await lancedb.connect({
uri: dbURI, // replace dbURI with your project, e.g. "db://your-project-name"
  apiKey: devApiKey, // replace devApiKey with your API key
region: "us-east-1-dev"
});
// create a new table
const tableName = "my_table_000"
const data = [
{ id: 1, vector: [0.1, 1.0], item: "foo", price: 10.0 },
{ id: 2, vector: [3.9, 0.5], item: "bar", price: 20.0 }
]
const schema = new Schema(
[
new Field('id', new Int32()),
new Field('vector', new FixedSizeList(2, new Field('float32', new Float32()))),
new Field('item', new Utf8()),
new Field('price', new Float32())
]
)
const table = await db.createTable({
name: tableName,
schema,
}, data)
// list the table
const tableNames_1 = await db.tableNames('')
// add some data and search should be okay
const newData = [
{ id: 3, vector: [10.3, 1.9], item: "test1", price: 30.0 },
{ id: 4, vector: [6.2, 9.2], item: "test2", price: 40.0 }
]
await table.add(newData)
// create the index for the table
await table.createIndex({
metric_type: "L2",
column: "vector"
})
let result = await table.search([2.8, 4.3]).select(["vector", "price"]).limit(1).execute()
// update the data
await table.update({
where: "id == 1",
values: { item: "foo1" }
})
//drop the table
await db.dropTable(tableName)
```

View File

@@ -44,15 +44,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import openai\n", "from openai import OpenAI\n",
"import os\n", "import os\n",
"\n", "\n",
"# Configuring the environment variable OPENAI_API_KEY\n", "# Configuring the environment variable OPENAI_API_KEY\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n", "if \"OPENAI_API_KEY\" not in os.environ:\n",
" # OR set the key here as a variable\n", " os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
" openai.api_key = \"sk-...\"\n", "client = OpenAI()\n",
" \n", "assert len(client.models.list().data) > 0"
"assert len(openai.Model.list()[\"data\"]) > 0"
] ]
}, },
{ {

View File

@@ -27,11 +27,11 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\n", "\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"\n", "\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
] ]
} }
], ],
@@ -206,15 +206,16 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import openai\n", "from openai import OpenAI\n",
"import os\n", "import os\n",
"\n", "\n",
"# Configuring the environment variable OPENAI_API_KEY\n", "# Configuring the environment variable OPENAI_API_KEY\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n", "if \"OPENAI_API_KEY\" not in os.environ:\n",
" # OR set the key here as a variable\n", " # OR set the key here as a variable\n",
" openai.api_key = \"sk-...\"\n", " os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
" \n", " \n",
"assert len(openai.Model.list()[\"data\"]) > 0" "client = OpenAI()\n",
"assert len(client.models.list().data) > 0"
] ]
}, },
{ {
@@ -234,8 +235,8 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"def embed_func(c): \n", "def embed_func(c): \n",
" rs = openai.Embedding.create(input=c, engine=\"text-embedding-ada-002\")\n", " rs = client.embeddings.create(input=c, model=\"text-embedding-ada-002\")\n",
" return [record[\"embedding\"] for record in rs[\"data\"]]" " return [rs.data[0].embedding]"
] ]
}, },
{ {
@@ -536,9 +537,8 @@
], ],
"source": [ "source": [
"def complete(prompt):\n", "def complete(prompt):\n",
" # query text-davinci-003\n", " res = client.completions.create(\n",
" res = openai.Completion.create(\n", " model='text-davinci-003',\n",
" engine='text-davinci-003',\n",
" prompt=prompt,\n", " prompt=prompt,\n",
" temperature=0,\n", " temperature=0,\n",
" max_tokens=400,\n", " max_tokens=400,\n",
@@ -547,7 +547,7 @@
" presence_penalty=0,\n", " presence_penalty=0,\n",
" stop=None\n", " stop=None\n",
" )\n", " )\n",
" return res['choices'][0]['text'].strip()\n", " return res.choices[0].text\n",
"\n", "\n",
"# check that it works\n", "# check that it works\n",
"query = \"who was the 12th person on the moon and when did they land?\"\n", "query = \"who was the 12th person on the moon and when did they land?\"\n",

74
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{ {
"name": "vectordb", "name": "vectordb",
"version": "0.4.1", "version": "0.4.2",
"lockfileVersion": 2, "lockfileVersion": 2,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "vectordb", "name": "vectordb",
"version": "0.4.1", "version": "0.4.2",
"cpu": [ "cpu": [
"x64", "x64",
"arm64" "arm64"
@@ -53,11 +53,11 @@
"uuid": "^9.0.0" "uuid": "^9.0.0"
}, },
"optionalDependencies": { "optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.1", "@lancedb/vectordb-darwin-arm64": "0.4.2",
"@lancedb/vectordb-darwin-x64": "0.4.1", "@lancedb/vectordb-darwin-x64": "0.4.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.1", "@lancedb/vectordb-linux-arm64-gnu": "0.4.2",
"@lancedb/vectordb-linux-x64-gnu": "0.4.1", "@lancedb/vectordb-linux-x64-gnu": "0.4.2",
"@lancedb/vectordb-win32-x64-msvc": "0.4.1" "@lancedb/vectordb-win32-x64-msvc": "0.4.2"
} }
}, },
"node_modules/@apache-arrow/ts": { "node_modules/@apache-arrow/ts": {
@@ -317,9 +317,9 @@
} }
}, },
"node_modules/@lancedb/vectordb-darwin-arm64": { "node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.2.tgz",
"integrity": "sha512-ul/Hvv5RX2RThpKSuiUjJRVrmXuBPvpU+HrLjcBmu4dzpuWN4+IeHIUM6xe79gLxOKlwkscVweTOuZnmMfsZeg==", "integrity": "sha512-Ec73W2IHnZK4VC8g/7JyLbgcwcpNb9YI20yEhfTjEEFjJKoElZhDD/ZgghC3QQSRnrXFTxDzPK1V9BDT5QB2Hg==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@@ -329,9 +329,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-darwin-x64": { "node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.2.tgz",
"integrity": "sha512-sJtF2Cv6T9RhUpdeHNkryiJwPuW9QPQ3aMs5fID1hMCJA2U3BX27t/WlkiPT2+kTLeUcwF1JvAOgsfvZkfvI8w==", "integrity": "sha512-tj0JJlOfOdeSAfmM7EZhrhFdCFjoq9Bmrjt4741BNjtF+Nv4Otl53lFtUQrexTr4oh/E1yY1qaydJ7K++8u3UA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@@ -341,9 +341,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-linux-arm64-gnu": { "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.2.tgz",
"integrity": "sha512-tNnziT0BRjPsznKI4GgWROFdCOsCGx0inFu0z+WV1UomwXKcMWGslpWBqKE8IUiCq14duPVx/ie7Wwcf51IeJQ==", "integrity": "sha512-OQ7ra5Q5RrLLwxIyI338KfQ2sSl8NJfqAHWvwiMtjCYFFYxIJGjX7U0I2MjSEPqJ5/ZoyjV4mjsvs0G1q20u+Q==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@@ -353,9 +353,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-linux-x64-gnu": { "node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.2.tgz",
"integrity": "sha512-PAcF2p1FUsC0AD+qkLfgE5+ZlQwlHe9eTP9dSsX43V/NGPDQ9+gBzaBTEDbvyHj1wl2Wft2NwOqB1HAFhilSDg==", "integrity": "sha512-9tgIFSOYqNJzonnYsQr7v2gGdJm8aZ62UsVX2SWAIVhypoP4A05tAlbzjBgKO3R5xy5gpcW8tt/Pt8IsYWON7Q==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@@ -365,9 +365,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-win32-x64-msvc": { "node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.2.tgz",
"integrity": "sha512-8mvThCppI/AfSPby6Y3t6xpCfbo8IY6JH5exO8fDGTwBFHOqgwR4Izb2K7FgXxkwUYcN4EfGSsk/6B1GpwMudg==", "integrity": "sha512-jhG3MqZ3r8BexXANLRNX57RAnCZT9psdSBORG3KTu5qe2xaunRlJNSA2kk8a79tf+gtUT/BAmMiXMzAi/dwq8w==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@@ -4869,33 +4869,33 @@
} }
}, },
"@lancedb/vectordb-darwin-arm64": { "@lancedb/vectordb-darwin-arm64": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.2.tgz",
"integrity": "sha512-ul/Hvv5RX2RThpKSuiUjJRVrmXuBPvpU+HrLjcBmu4dzpuWN4+IeHIUM6xe79gLxOKlwkscVweTOuZnmMfsZeg==", "integrity": "sha512-Ec73W2IHnZK4VC8g/7JyLbgcwcpNb9YI20yEhfTjEEFjJKoElZhDD/ZgghC3QQSRnrXFTxDzPK1V9BDT5QB2Hg==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-darwin-x64": { "@lancedb/vectordb-darwin-x64": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.2.tgz",
"integrity": "sha512-sJtF2Cv6T9RhUpdeHNkryiJwPuW9QPQ3aMs5fID1hMCJA2U3BX27t/WlkiPT2+kTLeUcwF1JvAOgsfvZkfvI8w==", "integrity": "sha512-tj0JJlOfOdeSAfmM7EZhrhFdCFjoq9Bmrjt4741BNjtF+Nv4Otl53lFtUQrexTr4oh/E1yY1qaydJ7K++8u3UA==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-linux-arm64-gnu": { "@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.2.tgz",
"integrity": "sha512-tNnziT0BRjPsznKI4GgWROFdCOsCGx0inFu0z+WV1UomwXKcMWGslpWBqKE8IUiCq14duPVx/ie7Wwcf51IeJQ==", "integrity": "sha512-OQ7ra5Q5RrLLwxIyI338KfQ2sSl8NJfqAHWvwiMtjCYFFYxIJGjX7U0I2MjSEPqJ5/ZoyjV4mjsvs0G1q20u+Q==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-linux-x64-gnu": { "@lancedb/vectordb-linux-x64-gnu": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.2.tgz",
"integrity": "sha512-PAcF2p1FUsC0AD+qkLfgE5+ZlQwlHe9eTP9dSsX43V/NGPDQ9+gBzaBTEDbvyHj1wl2Wft2NwOqB1HAFhilSDg==", "integrity": "sha512-9tgIFSOYqNJzonnYsQr7v2gGdJm8aZ62UsVX2SWAIVhypoP4A05tAlbzjBgKO3R5xy5gpcW8tt/Pt8IsYWON7Q==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-win32-x64-msvc": { "@lancedb/vectordb-win32-x64-msvc": {
"version": "0.4.1", "version": "0.4.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.1.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.2.tgz",
"integrity": "sha512-8mvThCppI/AfSPby6Y3t6xpCfbo8IY6JH5exO8fDGTwBFHOqgwR4Izb2K7FgXxkwUYcN4EfGSsk/6B1GpwMudg==", "integrity": "sha512-jhG3MqZ3r8BexXANLRNX57RAnCZT9psdSBORG3KTu5qe2xaunRlJNSA2kk8a79tf+gtUT/BAmMiXMzAi/dwq8w==",
"optional": true "optional": true
}, },
"@neon-rs/cli": { "@neon-rs/cli": {

View File

@@ -1,6 +1,6 @@
{ {
"name": "vectordb", "name": "vectordb",
"version": "0.4.1", "version": "0.4.2",
"description": " Serverless, low-latency vector database for AI applications", "description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",
@@ -81,10 +81,10 @@
} }
}, },
"optionalDependencies": { "optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.1", "@lancedb/vectordb-darwin-arm64": "0.4.2",
"@lancedb/vectordb-darwin-x64": "0.4.1", "@lancedb/vectordb-darwin-x64": "0.4.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.1", "@lancedb/vectordb-linux-arm64-gnu": "0.4.2",
"@lancedb/vectordb-linux-x64-gnu": "0.4.1", "@lancedb/vectordb-linux-x64-gnu": "0.4.2",
"@lancedb/vectordb-win32-x64-msvc": "0.4.1" "@lancedb/vectordb-win32-x64-msvc": "0.4.2"
} }
} }

View File

@@ -20,7 +20,7 @@ import {
Utf8, Utf8,
type Vector, type Vector,
FixedSizeList, FixedSizeList,
vectorFromArray, type Schema, Table as ArrowTable, RecordBatchStreamWriter vectorFromArray, type Schema, Table as ArrowTable, RecordBatchStreamWriter, List, Float64
} from 'apache-arrow' } from 'apache-arrow'
import { type EmbeddingFunction } from './index' import { type EmbeddingFunction } from './index'
@@ -59,6 +59,24 @@ export async function convertToTable<T> (data: Array<Record<string, unknown>>, e
if (typeof values[0] === 'string') { if (typeof values[0] === 'string') {
// `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column // `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
records[columnsKey] = vectorFromArray(values, new Utf8()) records[columnsKey] = vectorFromArray(values, new Utf8())
} else if (Array.isArray(values[0])) {
const elementType = getElementType(values[0])
let innerType
if (elementType === 'string') {
innerType = new Utf8()
} else if (elementType === 'number') {
innerType = new Float64()
} else {
// TODO: pass in schema if it exists, else keep going to the next element
throw new Error(`Unsupported array element type ${elementType}`)
}
const listBuilder = makeBuilder({
type: new List(new Field('item', innerType, true))
})
for (const value of values) {
listBuilder.append(value)
}
records[columnsKey] = listBuilder.finish().toVector()
} else { } else {
records[columnsKey] = vectorFromArray(values) records[columnsKey] = vectorFromArray(values)
} }
@@ -68,6 +86,14 @@ export async function convertToTable<T> (data: Array<Record<string, unknown>>, e
return new ArrowTable(records) return new ArrowTable(records)
} }
function getElementType (arr: any[]): string {
if (arr.length === 0) {
return 'undefined'
}
return typeof arr[0]
}
// Creates a new Arrow ListBuilder that stores a Vector column // Creates a new Arrow ListBuilder that stores a Vector column
function newVectorBuilder (dim: number): FixedSizeListBuilder<Float32> { function newVectorBuilder (dim: number): FixedSizeListBuilder<Float32> {
return makeBuilder({ return makeBuilder({

View File

@@ -14,7 +14,8 @@
import { import {
type Schema, type Schema,
Table as ArrowTable Table as ArrowTable,
tableFromIPC
} from 'apache-arrow' } from 'apache-arrow'
import { createEmptyTable, fromRecordsToBuffer, fromTableToBuffer } from './arrow' import { createEmptyTable, fromRecordsToBuffer, fromTableToBuffer } from './arrow'
import type { EmbeddingFunction } from './embedding/embedding_function' import type { EmbeddingFunction } from './embedding/embedding_function'
@@ -24,7 +25,7 @@ import { isEmbeddingFunction } from './embedding/embedding_function'
import { type Literal, toSQL } from './util' import { type Literal, toSQL } from './util'
// eslint-disable-next-line @typescript-eslint/no-var-requires // eslint-disable-next-line @typescript-eslint/no-var-requires
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateScalarIndex, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js') const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateScalarIndex, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats, tableSchema } = require('../native.js')
export { Query } export { Query }
export type { EmbeddingFunction } export type { EmbeddingFunction }
@@ -354,6 +355,8 @@ export interface Table<T = number[]> {
* Get statistics about an index. * Get statistics about an index.
*/ */
indexStats: (indexUuid: string) => Promise<IndexStats> indexStats: (indexUuid: string) => Promise<IndexStats>
schema: Promise<Schema>
} }
export interface UpdateArgs { export interface UpdateArgs {
@@ -508,6 +511,7 @@ export class LocalConnection implements Connection {
export class LocalTable<T = number[]> implements Table<T> { export class LocalTable<T = number[]> implements Table<T> {
private _tbl: any private _tbl: any
private readonly _name: string private readonly _name: string
private readonly _isElectron: boolean
private readonly _embeddings?: EmbeddingFunction<T> private readonly _embeddings?: EmbeddingFunction<T>
private readonly _options: () => ConnectionOptions private readonly _options: () => ConnectionOptions
@@ -524,6 +528,7 @@ export class LocalTable<T = number[]> implements Table<T> {
this._name = name this._name = name
this._embeddings = embeddings this._embeddings = embeddings
this._options = () => options this._options = () => options
this._isElectron = this.checkElectron()
} }
get name (): string { get name (): string {
@@ -682,6 +687,27 @@ export class LocalTable<T = number[]> implements Table<T> {
async indexStats (indexUuid: string): Promise<IndexStats> { async indexStats (indexUuid: string): Promise<IndexStats> {
return tableIndexStats.call(this._tbl, indexUuid) return tableIndexStats.call(this._tbl, indexUuid)
} }
get schema (): Promise<Schema> {
// empty table
return this.getSchema()
}
private async getSchema (): Promise<Schema> {
const buffer = await tableSchema.call(this._tbl, this._isElectron)
const table = tableFromIPC(buffer)
return table.schema
}
// See https://github.com/electron/electron/issues/2288
private checkElectron (): boolean {
try {
// eslint-disable-next-line no-prototype-builtins
return (process?.versions?.hasOwnProperty('electron') || navigator?.userAgent?.toLowerCase()?.includes(' electron'))
} catch (e) {
return false
}
}
} }
export interface CleanupStats { export interface CleanupStats {

View File

@@ -218,6 +218,25 @@ describe('LanceDB client', function () {
assert.equal(await table.countRows(), 2) assert.equal(await table.countRows(), 2)
}) })
it('creates a new table from javascript objects with variable sized list', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const data = [
{ id: 1, vector: [0.1, 0.2], list_of_str: ['a', 'b', 'c'], list_of_num: [1, 2, 3] },
{ id: 2, vector: [1.1, 1.2], list_of_str: ['x', 'y'], list_of_num: [4, 5, 6] }
]
const tableName = 'with_variable_sized_list'
const table = await con.createTable(tableName, data) as LocalTable
assert.equal(table.name, tableName)
assert.equal(await table.countRows(), 2)
const rs = await table.filter('id>1').execute()
assert.equal(rs.length, 1)
assert.deepEqual(rs[0].list_of_str, ['x', 'y'])
assert.isTrue(rs[0].list_of_num instanceof Float64Array)
})
it('fails to create a new table when the vector column is missing', async function () { it('fails to create a new table when the vector column is missing', async function () {
const dir = await track().mkdir('lancejs') const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir) const con = await lancedb.connect(dir)
@@ -479,6 +498,27 @@ describe('LanceDB client', function () {
assert.equal(results.length, 2) assert.equal(results.length, 2)
}) })
}) })
describe('when inspecting the schema', function () {
it('should return the schema', async function () {
const uri = await createTestDB()
const db = await lancedb.connect(uri)
// the fsl inner field must be named 'item' and be nullable
const expectedSchema = new Schema(
[
new Field('id', new Int32()),
new Field('vector', new FixedSizeList(128, new Field('item', new Float32(), true))),
new Field('s', new Utf8())
]
)
const table = await db.createTable({
name: 'some_table',
schema: expectedSchema
})
const schema = await table.schema
assert.deepEqual(expectedSchema, schema)
})
})
}) })
describe('Remote LanceDB client', function () { describe('Remote LanceDB client', function () {

View File

@@ -13,7 +13,7 @@
"""Full text search index using tantivy-py""" """Full text search index using tantivy-py"""
import os import os
from typing import List, Tuple from typing import List, Optional, Tuple
import pyarrow as pa import pyarrow as pa
@@ -56,7 +56,12 @@ def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
return index return index
def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int: def populate_index(
index: tantivy.Index,
table: LanceTable,
fields: List[str],
writer_heap_size: int = 1024 * 1024 * 1024,
) -> int:
""" """
Populate an index with data from a LanceTable Populate an index with data from a LanceTable
@@ -68,6 +73,8 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
The table to index The table to index
fields : List[str] fields : List[str]
List of fields to index List of fields to index
writer_heap_size : int
The writer heap size in bytes, defaults to 1GB
Returns Returns
------- -------
@@ -87,7 +94,7 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
raise TypeError(f"Field {name} is not a string type") raise TypeError(f"Field {name} is not a string type")
# create a tantivy writer # create a tantivy writer
writer = index.writer() writer = index.writer(heap_size=writer_heap_size)
# write data into index # write data into index
dataset = table.to_lance() dataset = table.to_lance()
row_id = 0 row_id = 0
@@ -103,10 +110,13 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
b = b.flatten() b = b.flatten()
for i in range(b.num_rows): for i in range(b.num_rows):
doc = tantivy.Document() doc = tantivy.Document()
doc.add_integer("doc_id", row_id)
for name in fields: for name in fields:
doc.add_text(name, b[name][i].as_py()) value = b[name][i].as_py()
writer.add_document(doc) if value is not None:
doc.add_text(name, value)
if not doc.is_empty:
doc.add_integer("doc_id", row_id)
writer.add_document(doc)
row_id += 1 row_id += 1
# commit changes # commit changes
writer.commit() writer.commit()

View File

@@ -192,6 +192,7 @@ else:
def _pydantic_to_arrow_type(field: pydantic.fields.FieldInfo) -> pa.DataType: def _pydantic_to_arrow_type(field: pydantic.fields.FieldInfo) -> pa.DataType:
"""Convert a Pydantic FieldInfo to Arrow DataType""" """Convert a Pydantic FieldInfo to Arrow DataType"""
if isinstance(field.annotation, _GenericAlias) or ( if isinstance(field.annotation, _GenericAlias) or (
sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias) sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias)
): ):
@@ -203,6 +204,13 @@ def _pydantic_to_arrow_type(field: pydantic.fields.FieldInfo) -> pa.DataType:
elif origin == Union: elif origin == Union:
if len(args) == 2 and args[1] == type(None): if len(args) == 2 and args[1] == type(None):
return _py_type_to_arrow_type(args[0], field) return _py_type_to_arrow_type(args[0], field)
elif sys.version_info >= (3, 10) and isinstance(field.annotation, types.UnionType):
args = field.annotation.__args__
if len(args) == 2:
for typ in args:
if typ == type(None):
continue
return _py_type_to_arrow_type(typ, field)
elif inspect.isclass(field.annotation): elif inspect.isclass(field.annotation):
if issubclass(field.annotation, pydantic.BaseModel): if issubclass(field.annotation, pydantic.BaseModel):
# Struct # Struct
@@ -221,6 +229,11 @@ def is_nullable(field: pydantic.fields.FieldInfo) -> bool:
if origin == Union: if origin == Union:
if len(args) == 2 and args[1] == type(None): if len(args) == 2 and args[1] == type(None):
return True return True
elif sys.version_info >= (3, 10) and isinstance(field.annotation, types.UnionType):
args = field.annotation.__args__
for typ in args:
if typ == type(None):
return True
return False return False

View File

@@ -14,6 +14,7 @@
from __future__ import annotations from __future__ import annotations
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING, List, Literal, Optional, Type, Union from typing import TYPE_CHECKING, List, Literal, Optional, Type, Union
import deprecation import deprecation
@@ -480,6 +481,12 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
# get the index path # get the index path
index_path = self._table._get_fts_index_path() index_path = self._table._get_fts_index_path()
# check if the index exist
if not Path(index_path).exists():
raise FileNotFoundError(
"Fts index does not exist."
f"Please first call table.create_fts_index(['<field_names>']) to create the fts index."
)
# open the index # open the index
index = tantivy.Index.open(index_path) index = tantivy.Index.open(index_path)
# get the scores and doc ids # get the scores and doc ids

View File

@@ -15,7 +15,8 @@
import functools import functools
from typing import Any, Callable, Dict, Iterable, Optional, Union from typing import Any, Callable, Dict, Iterable, Optional, Union
import aiohttp import requests
import urllib.parse
import attrs import attrs
import pyarrow as pa import pyarrow as pa
from pydantic import BaseModel from pydantic import BaseModel
@@ -37,8 +38,8 @@ def _check_not_closed(f):
return wrapped return wrapped
async def _read_ipc(resp: aiohttp.ClientResponse) -> pa.Table: def _read_ipc(resp: requests.Response) -> pa.Table:
resp_body = await resp.read() resp_body = resp.raw.read()
with pa.ipc.open_file(pa.BufferReader(resp_body)) as reader: with pa.ipc.open_file(pa.BufferReader(resp_body)) as reader:
return reader.read_all() return reader.read_all()
@@ -53,15 +54,24 @@ class RestfulLanceDBClient:
closed: bool = attrs.field(default=False, init=False) closed: bool = attrs.field(default=False, init=False)
@functools.cached_property @functools.cached_property
def session(self) -> aiohttp.ClientSession: def session(self) -> requests.Session:
url = ( session = requests.session()
session.stream = True
return session
@functools.cached_property
def url(self) -> str:
return (
self.host_override self.host_override
or f"https://{self.db_name}.{self.region}.api.lancedb.com" or f"https://{self.db_name}.{self.region}.api.lancedb.com"
) )
return aiohttp.ClientSession(url)
async def close(self): def _get_request_url(self, uri: str) -> str:
await self.session.close() return urllib.parse.urljoin(self.url, uri)
def close(self):
self.session.close()
self.closed = True self.closed = True
@functools.cached_property @functools.cached_property
@@ -75,39 +85,25 @@ class RestfulLanceDBClient:
headers["x-lancedb-database"] = self.db_name headers["x-lancedb-database"] = self.db_name
return headers return headers
@staticmethod
async def _check_status(resp: aiohttp.ClientResponse):
if resp.status == 404:
raise LanceDBClientError(f"Not found: {await resp.text()}")
elif 400 <= resp.status < 500:
raise LanceDBClientError(
f"Bad Request: {resp.status}, error: {await resp.text()}"
)
elif 500 <= resp.status < 600:
raise LanceDBClientError(
f"Internal Server Error: {resp.status}, error: {await resp.text()}"
)
elif resp.status != 200:
raise LanceDBClientError(
f"Unknown Error: {resp.status}, error: {await resp.text()}"
)
@_check_not_closed @_check_not_closed
async def get(self, uri: str, params: Union[Dict[str, Any], BaseModel] = None): def get(self, uri: str, params: Union[Dict[str, Any], BaseModel] = None):
"""Send a GET request and returns the deserialized response payload.""" """Send a GET request and returns the deserialized response payload."""
if isinstance(params, BaseModel): if isinstance(params, BaseModel):
params: Dict[str, Any] = params.dict(exclude_none=True) params: Dict[str, Any] = params.dict(exclude_none=True)
async with self.session.get(
uri, resp = self.session.get(
self._get_request_url(uri),
params=params, params=params,
headers=self.headers, headers=self.headers,
timeout=aiohttp.ClientTimeout(total=30), # 5s connect timeout, 30s read timeout
) as resp: timeout=(5.0, 30.0),
await self._check_status(resp) )
return await resp.json()
resp.raise_for_status()
return resp.json()
@_check_not_closed @_check_not_closed
async def post( def post(
self, self,
uri: str, uri: str,
data: Optional[Union[Dict[str, Any], BaseModel, bytes]] = None, data: Optional[Union[Dict[str, Any], BaseModel, bytes]] = None,
@@ -139,31 +135,31 @@ class RestfulLanceDBClient:
headers["content-type"] = content_type headers["content-type"] = content_type
if request_id is not None: if request_id is not None:
headers["x-request-id"] = request_id headers["x-request-id"] = request_id
async with self.session.post(
uri, resp = self.session.post(
headers=headers, self._get_request_url(uri),
params=params, params=params,
timeout=aiohttp.ClientTimeout(total=30), headers=self.headers,
# 5s connect timeout, 30s read timeout
timeout=(5.0, 30.0),
**req_kwargs, **req_kwargs,
) as resp: )
resp: aiohttp.ClientResponse = resp resp.raise_for_status()
await self._check_status(resp)
return await deserialize(resp) return deserialize(resp)
@_check_not_closed @_check_not_closed
async def list_tables( def list_tables(
self, limit: int, page_token: Optional[str] = None self, limit: int, page_token: Optional[str] = None
) -> Iterable[str]: ) -> Iterable[str]:
"""List all tables in the database.""" """List all tables in the database."""
if page_token is None: if page_token is None:
page_token = "" page_token = ""
json = await self.get("/v1/table/", {"limit": limit, "page_token": page_token}) json = self.get("/v1/table/", {"limit": limit, "page_token": page_token})
return json["tables"] return json["tables"]
@_check_not_closed @_check_not_closed
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult: def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
"""Query a table.""" """Query a table."""
tbl = await self.post( tbl = self.post(f"/v1/table/{table_name}/query/", query, deserialize=_read_ipc)
f"/v1/table/{table_name}/query/", query, deserialize=_read_ipc
)
return VectorQueryResult(tbl) return VectorQueryResult(tbl)

View File

@@ -50,10 +50,6 @@ class RemoteDBConnection(DBConnection):
self._client = RestfulLanceDBClient( self._client = RestfulLanceDBClient(
self.db_name, region, api_key, host_override self.db_name, region, api_key, host_override
) )
try:
self._loop = asyncio.get_running_loop()
except RuntimeError:
self._loop = asyncio.get_event_loop()
def __repr__(self) -> str: def __repr__(self) -> str:
return f"RemoteConnect(name={self.db_name})" return f"RemoteConnect(name={self.db_name})"
@@ -76,15 +72,13 @@ class RemoteDBConnection(DBConnection):
An iterator of table names. An iterator of table names.
""" """
while True: while True:
result = self._loop.run_until_complete( result = self._client.list_tables(limit, page_token)
self._client.list_tables(limit, page_token)
)
if len(result) > 0:
page_token = result[len(result) - 1]
else:
break
for item in result: for item in result:
yield item yield item
if len(result) < limit:
break
else:
page_token = result[len(result) - 1]
@override @override
def open_table(self, name: str) -> Table: def open_table(self, name: str) -> Table:
@@ -103,9 +97,7 @@ class RemoteDBConnection(DBConnection):
# check if table exists # check if table exists
try: try:
self._loop.run_until_complete( self._client.post(f"/v1/table/{name}/describe/")
self._client.post(f"/v1/table/{name}/describe/")
)
except LanceDBClientError as err: except LanceDBClientError as err:
if str(err).startswith("Not found"): if str(err).startswith("Not found"):
logging.error( logging.error(
@@ -248,13 +240,11 @@ class RemoteDBConnection(DBConnection):
data = to_ipc_binary(data) data = to_ipc_binary(data)
request_id = uuid.uuid4().hex request_id = uuid.uuid4().hex
self._loop.run_until_complete( self._client.post(
self._client.post( f"/v1/table/{name}/create/",
f"/v1/table/{name}/create/", data=data,
data=data, request_id=request_id,
request_id=request_id, content_type=ARROW_STREAM_CONTENT_TYPE,
content_type=ARROW_STREAM_CONTENT_TYPE,
)
) )
return RemoteTable(self, name) return RemoteTable(self, name)
@@ -267,13 +257,10 @@ class RemoteDBConnection(DBConnection):
name: str name: str
The name of the table. The name of the table.
""" """
self._loop.run_until_complete( self._client.post(
self._client.post( f"/v1/table/{name}/drop/",
f"/v1/table/{name}/drop/",
)
) )
async def close(self): async def close(self):
"""Close the connection to the database.""" """Close the connection to the database."""
self._loop.close() self._client.close()
await self._client.close()

View File

@@ -11,7 +11,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import asyncio
import uuid import uuid
from functools import cached_property from functools import cached_property
from typing import Dict, Optional, Union from typing import Dict, Optional, Union
@@ -43,18 +42,14 @@ class RemoteTable(Table):
of this Table of this Table
""" """
resp = self._conn._loop.run_until_complete( resp = self._conn._client.post(f"/v1/table/{self._name}/describe/")
self._conn._client.post(f"/v1/table/{self._name}/describe/")
)
schema = json_to_schema(resp["schema"]) schema = json_to_schema(resp["schema"])
return schema return schema
@property @property
def version(self) -> int: def version(self) -> int:
"""Get the current version of the table""" """Get the current version of the table"""
resp = self._conn._loop.run_until_complete( resp = self._conn._client.post(f"/v1/table/{self._name}/describe/")
self._conn._client.post(f"/v1/table/{self._name}/describe/")
)
return resp["version"] return resp["version"]
def to_arrow(self) -> pa.Table: def to_arrow(self) -> pa.Table:
@@ -116,8 +111,8 @@ class RemoteTable(Table):
"metric_type": metric, "metric_type": metric,
"index_cache_size": index_cache_size, "index_cache_size": index_cache_size,
} }
resp = self._conn._loop.run_until_complete( resp = self._conn._client.post(
self._conn._client.post(f"/v1/table/{self._name}/create_index/", data=data) f"/v1/table/{self._name}/create_index/", data=data
) )
return resp return resp
@@ -161,13 +156,11 @@ class RemoteTable(Table):
request_id = uuid.uuid4().hex request_id = uuid.uuid4().hex
self._conn._loop.run_until_complete( self._conn._client.post(
self._conn._client.post( f"/v1/table/{self._name}/insert/",
f"/v1/table/{self._name}/insert/", data=payload,
data=payload, params={"request_id": request_id, "mode": mode},
params={"request_id": request_id, "mode": mode}, content_type=ARROW_STREAM_CONTENT_TYPE,
content_type=ARROW_STREAM_CONTENT_TYPE,
)
) )
def search( def search(
@@ -233,19 +226,17 @@ class RemoteTable(Table):
and len(query.vector) > 0 and len(query.vector) > 0
and not isinstance(query.vector[0], float) and not isinstance(query.vector[0], float)
): ):
futures = [] result = []
for v in query.vector: for v in query.vector:
v = list(v) v = list(v)
q = query.copy() q = query.copy()
q.vector = v q.vector = v
futures.append(self._conn._client.query(self._name, q)) result.append(self._conn._client.query(self._name, q))
result = self._conn._loop.run_until_complete(asyncio.gather(*futures))
return pa.concat_tables( return pa.concat_tables(
[add_index(r.to_arrow(), i) for i, r in enumerate(result)] [add_index(r.to_arrow(), i) for i, r in enumerate(result)]
) )
else: else:
result = self._conn._client.query(self._name, query) return self._conn._client.query(self._name, query).to_arrow()
return self._conn._loop.run_until_complete(result).to_arrow()
def delete(self, predicate: str): def delete(self, predicate: str):
"""Delete rows from the table. """Delete rows from the table.
@@ -294,9 +285,7 @@ class RemoteTable(Table):
0 2 [3.0, 4.0] 85.0 # doctest: +SKIP 0 2 [3.0, 4.0] 85.0 # doctest: +SKIP
""" """
payload = {"predicate": predicate} payload = {"predicate": predicate}
self._conn._loop.run_until_complete( self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
self._conn._client.post(f"/v1/table/{self._name}/delete/", data=payload)
)
def update( def update(
self, self,
@@ -356,9 +345,7 @@ class RemoteTable(Table):
updates = [[k, v] for k, v in values_sql.items()] updates = [[k, v] for k, v in values_sql.items()]
payload = {"predicate": where, "updates": updates} payload = {"predicate": where, "updates": updates}
self._conn._loop.run_until_complete( self._conn._client.post(f"/v1/table/{self._name}/update/", data=payload)
self._conn._client.post(f"/v1/table/{self._name}/update/", data=payload)
)
def add_index(tbl: pa.Table, i: int) -> pa.Table: def add_index(tbl: pa.Table, i: int) -> pa.Table:

View File

@@ -709,7 +709,11 @@ class LanceTable(Table):
self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace) self._dataset.create_scalar_index(column, index_type="BTREE", replace=replace)
def create_fts_index( def create_fts_index(
self, field_names: Union[str, List[str]], *, replace: bool = False self,
field_names: Union[str, List[str]],
*,
replace: bool = False,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
): ):
"""Create a full-text search index on the table. """Create a full-text search index on the table.
@@ -724,6 +728,7 @@ class LanceTable(Table):
If True, replace the existing index if it exists. Note that this is If True, replace the existing index if it exists. Note that this is
not yet an atomic operation; the index will be temporarily not yet an atomic operation; the index will be temporarily
unavailable while the new index is being created. unavailable while the new index is being created.
writer_heap_size: int, default 1GB
""" """
from .fts import create_index, populate_index from .fts import create_index, populate_index
@@ -740,7 +745,7 @@ class LanceTable(Table):
fs.delete_dir(path) fs.delete_dir(path)
index = create_index(self._get_fts_index_path(), field_names) index = create_index(self._get_fts_index_path(), field_names)
populate_index(index, self, field_names) populate_index(index, self, field_names, writer_heap_size=writer_heap_size)
register_event("create_fts_index") register_event("create_fts_index")
def _get_fts_index_path(self): def _get_fts_index_path(self):

View File

@@ -3,11 +3,11 @@ name = "lancedb"
version = "0.4.3" version = "0.4.3"
dependencies = [ dependencies = [
"deprecation", "deprecation",
"pylance==0.9.2", "pylance==0.9.5",
"ratelimiter~=1.0", "ratelimiter~=1.0",
"retry>=0.9.2", "retry>=0.9.2",
"tqdm>=4.27.0", "tqdm>=4.27.0",
"aiohttp", "requests>=2.31,<3",
"pydantic>=1.10", "pydantic>=1.10",
"attrs>=21.3.0", "attrs>=21.3.0",
"semver>=3.0", "semver>=3.0",

View File

@@ -1,27 +0,0 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pytest
from lancedb import LanceDBConnection
# TODO: setup integ test mark and script
@pytest.mark.skip(reason="Need to set up a local server")
def test_against_local_server():
conn = LanceDBConnection("lancedb+http://localhost:10024")
table = conn.open_table("sift1m_ivf1024_pq16")
df = table.search(np.random.rand(128)).to_pandas()
assert len(df) == 10

View File

@@ -82,7 +82,7 @@ def test_search_index(tmp_path, table):
def test_create_index_from_table(tmp_path, table): def test_create_index_from_table(tmp_path, table):
table.create_fts_index("text") table.create_fts_index("text")
df = table.search("puppy").limit(10).select(["text"]).to_pandas() df = table.search("puppy").limit(10).select(["text"]).to_pandas()
assert len(df) == 10 assert len(df) <= 10
assert "text" in df.columns assert "text" in df.columns
# Check whether it can be updated # Check whether it can be updated
@@ -147,3 +147,35 @@ def test_search_index_with_filter(table):
assert r["id"] == 1 assert r["id"] == 1
assert rs == rs2 assert rs == rs2
def test_null_input(table):
table.add(
[
{
"vector": np.random.randn(128),
"id": 101,
"text": None,
"text2": None,
"nested": {"text": None},
}
]
)
table.create_fts_index("text")
def test_syntax(table):
# https://github.com/lancedb/lancedb/issues/769
table.create_fts_index("text")
with pytest.raises(ValueError, match="Syntax Error"):
table.search("they could have been dogs OR cats").limit(10).to_list()
# this should work
table.search('"they could have been dogs OR cats"').limit(10).to_list()
# this should work too
table.search('''"the cats OR dogs were not really 'pets' at all"''').limit(
10
).to_list()
with pytest.raises(ValueError, match="Syntax Error"):
table.search('''"the cats OR dogs were not really "pets" at all"''').limit(
10
).to_list()

View File

@@ -88,6 +88,28 @@ def test_pydantic_to_arrow():
assert schema == expect_schema assert schema == expect_schema
@pytest.mark.skipif(
sys.version_info < (3, 10),
reason="using | type syntax requires python3.10 or higher",
)
def test_optional_types_py310():
class TestModel(pydantic.BaseModel):
a: str | None
b: None | str
c: Optional[str]
schema = pydantic_to_schema(TestModel)
expect_schema = pa.schema(
[
pa.field("a", pa.utf8(), True),
pa.field("b", pa.utf8(), True),
pa.field("c", pa.utf8(), True),
]
)
assert schema == expect_schema
@pytest.mark.skipif( @pytest.mark.skipif(
sys.version_info > (3, 8), sys.version_info > (3, 8),
reason="using native type alias requires python3.9 or higher", reason="using native type alias requires python3.9 or higher",

View File

@@ -1,95 +0,0 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import attrs
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from aiohttp import web
from lancedb.remote.client import RestfulLanceDBClient, VectorQuery
@attrs.define
class MockLanceDBServer:
runner: web.AppRunner = attrs.field(init=False)
site: web.TCPSite = attrs.field(init=False)
async def query_handler(self, request: web.Request) -> web.Response:
table_name = request.match_info["table_name"]
assert table_name == "test_table"
await request.json()
# TODO: do some matching
vecs = pd.Series([np.random.rand(128) for x in range(10)], name="vector")
ids = pd.Series(range(10), name="id")
df = pd.DataFrame([vecs, ids]).T
batch = pa.RecordBatch.from_pandas(
df,
schema=pa.schema(
[
pa.field("vector", pa.list_(pa.float32(), 128)),
pa.field("id", pa.int64()),
]
),
)
sink = pa.BufferOutputStream()
with pa.ipc.new_file(sink, batch.schema) as writer:
writer.write_batch(batch)
return web.Response(body=sink.getvalue().to_pybytes())
async def setup(self):
app = web.Application()
app.add_routes([web.post("/table/{table_name}", self.query_handler)])
self.runner = web.AppRunner(app)
await self.runner.setup()
self.site = web.TCPSite(self.runner, "localhost", 8111)
async def start(self):
await self.site.start()
async def stop(self):
await self.runner.cleanup()
@pytest.mark.skip(reason="flaky somehow, fix later")
@pytest.mark.asyncio
async def test_e2e_with_mock_server():
mock_server = MockLanceDBServer()
await mock_server.setup()
await mock_server.start()
try:
client = RestfulLanceDBClient("lancedb+http://localhost:8111")
df = (
await client.query(
"test_table",
VectorQuery(
vector=np.random.rand(128).tolist(),
k=10,
_metric="L2",
columns=["id", "vector"],
),
)
).to_pandas()
assert "vector" in df.columns
assert "id" in df.columns
finally:
# make sure we don't leak resources
await mock_server.stop()

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "vectordb-node" name = "vectordb-node"
version = "0.4.1" version = "0.4.2"
description = "Serverless, low-latency vector database for AI applications" description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0" license = "Apache-2.0"
edition = "2018" edition = "2018"

View File

@@ -36,7 +36,7 @@ fn validate_vector_column(record_batch: &RecordBatch) -> Result<()> {
pub(crate) fn arrow_buffer_to_record_batch(slice: &[u8]) -> Result<(Vec<RecordBatch>, SchemaRef)> { pub(crate) fn arrow_buffer_to_record_batch(slice: &[u8]) -> Result<(Vec<RecordBatch>, SchemaRef)> {
let mut batches: Vec<RecordBatch> = Vec::new(); let mut batches: Vec<RecordBatch> = Vec::new();
let file_reader = FileReader::try_new(Cursor::new(slice), None)?; let file_reader = FileReader::try_new(Cursor::new(slice), None)?;
let schema = file_reader.schema().clone(); let schema = file_reader.schema();
for b in file_reader { for b in file_reader {
let record_batch = b?; let record_batch = b?;
validate_vector_column(&record_batch)?; validate_vector_column(&record_batch)?;

View File

@@ -13,6 +13,9 @@
// limitations under the License. // limitations under the License.
use neon::prelude::*; use neon::prelude::*;
use neon::types::buffer::TypedArray;
use crate::error::ResultExt;
pub(crate) fn vec_str_to_array<'a, C: Context<'a>>( pub(crate) fn vec_str_to_array<'a, C: Context<'a>>(
vec: &Vec<String>, vec: &Vec<String>,
@@ -34,3 +37,20 @@ pub(crate) fn js_array_to_vec(array: &JsArray, cx: &mut FunctionContext) -> Vec<
} }
query_vec query_vec
} }
// Creates a new JsBuffer from a rust buffer with a special logic for electron
pub(crate) fn new_js_buffer<'a>(
buffer: Vec<u8>,
cx: &mut TaskContext<'a>,
is_electron: bool,
) -> NeonResult<Handle<'a, JsBuffer>> {
if is_electron {
// Electron does not support `external`: https://github.com/neon-bindings/neon/pull/937
let mut js_buffer = JsBuffer::new(cx, buffer.len()).or_throw(cx)?;
let buffer_data = js_buffer.as_mut_slice(cx);
buffer_data.copy_from_slice(buffer.as_slice());
Ok(js_buffer)
} else {
Ok(JsBuffer::external(cx, buffer))
}
}

View File

@@ -250,5 +250,6 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
"tableCreateVectorIndex", "tableCreateVectorIndex",
index::vector::table_create_vector_index, index::vector::table_create_vector_index,
)?; )?;
cx.export_function("tableSchema", JsTable::js_schema)?;
Ok(()) Ok(())
} }

View File

@@ -7,7 +7,6 @@ use lance_linalg::distance::MetricType;
use neon::context::FunctionContext; use neon::context::FunctionContext;
use neon::handle::Handle; use neon::handle::Handle;
use neon::prelude::*; use neon::prelude::*;
use neon::types::buffer::TypedArray;
use crate::arrow::record_batch_to_buffer; use crate::arrow::record_batch_to_buffer;
use crate::error::ResultExt; use crate::error::ResultExt;
@@ -96,26 +95,9 @@ impl JsQuery {
deferred.settle_with(&channel, move |mut cx| { deferred.settle_with(&channel, move |mut cx| {
let results = results.or_throw(&mut cx)?; let results = results.or_throw(&mut cx)?;
let buffer = record_batch_to_buffer(results).or_throw(&mut cx)?; let buffer = record_batch_to_buffer(results).or_throw(&mut cx)?;
Self::new_js_buffer(buffer, &mut cx, is_electron) convert::new_js_buffer(buffer, &mut cx, is_electron)
}); });
}); });
Ok(promise) Ok(promise)
} }
// Creates a new JsBuffer from a rust buffer with a special logic for electron
fn new_js_buffer<'a>(
buffer: Vec<u8>,
cx: &mut TaskContext<'a>,
is_electron: bool,
) -> NeonResult<Handle<'a, JsBuffer>> {
if is_electron {
// Electron does not support `external`: https://github.com/neon-bindings/neon/pull/937
let mut js_buffer = JsBuffer::new(cx, buffer.len()).or_throw(cx)?;
let buffer_data = js_buffer.as_mut_slice(cx);
buffer_data.copy_from_slice(buffer.as_slice());
Ok(js_buffer)
} else {
Ok(JsBuffer::external(cx, buffer))
}
}
} }

View File

@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use arrow_array::RecordBatchIterator; use arrow_array::{RecordBatch, RecordBatchIterator};
use lance::dataset::optimize::CompactionOptions; use lance::dataset::optimize::CompactionOptions;
use lance::dataset::{WriteMode, WriteParams}; use lance::dataset::{WriteMode, WriteParams};
use lance::io::object_store::ObjectStoreParams; use lance::io::object_store::ObjectStoreParams;
use crate::arrow::arrow_buffer_to_record_batch; use crate::arrow::{arrow_buffer_to_record_batch, record_batch_to_buffer};
use neon::prelude::*; use neon::prelude::*;
use neon::types::buffer::TypedArray; use neon::types::buffer::TypedArray;
use vectordb::Table; use vectordb::Table;
use crate::error::ResultExt; use crate::error::ResultExt;
use crate::{get_aws_creds, get_aws_region, runtime, JsDatabase}; use crate::{convert, get_aws_creds, get_aws_region, runtime, JsDatabase};
pub(crate) struct JsTable { pub(crate) struct JsTable {
pub table: Table, pub table: Table,
@@ -426,4 +426,27 @@ impl JsTable {
Ok(promise) Ok(promise)
} }
pub(crate) fn js_schema(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let table = js_table.table.clone();
let is_electron = cx
.argument::<JsBoolean>(0)
.or_throw(&mut cx)?
.value(&mut cx);
rt.spawn(async move {
deferred.settle_with(&channel, move |mut cx| {
let schema = table.schema();
let batches = vec![RecordBatch::new_empty(schema)];
let buffer = record_batch_to_buffer(batches).or_throw(&mut cx)?;
convert::new_js_buffer(buffer, &mut cx, is_electron)
})
});
Ok(promise)
}
} }

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "vectordb" name = "vectordb"
version = "0.4.1" version = "0.4.2"
edition = "2021" edition = "2021"
description = "LanceDB: A serverless, low-latency vector database for AI applications" description = "LanceDB: A serverless, low-latency vector database for AI applications"
license = "Apache-2.0" license = "Apache-2.0"