mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
4 Commits
qian/saas-
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
600bfd7237 | ||
|
|
d087e7891d | ||
|
|
098e397cf0 | ||
|
|
63ee8fa6a1 |
@@ -5,10 +5,10 @@ exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.8.17", "features" = ["dynamodb"] }
|
||||
lance-index = { "version" = "=0.8.17" }
|
||||
lance-linalg = { "version" = "=0.8.17" }
|
||||
lance-testing = { "version" = "=0.8.17" }
|
||||
lance = { "version" = "=0.8.20", "features" = ["dynamodb"] }
|
||||
lance-index = { "version" = "=0.8.20" }
|
||||
lance-linalg = { "version" = "=0.8.20" }
|
||||
lance-testing = { "version" = "=0.8.20" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "47.0.0", optional = false }
|
||||
arrow-array = "47.0"
|
||||
|
||||
@@ -21,9 +21,10 @@ import type { EmbeddingFunction } from './embedding/embedding_function'
|
||||
import { RemoteConnection } from './remote'
|
||||
import { Query } from './query'
|
||||
import { isEmbeddingFunction } from './embedding/embedding_function'
|
||||
import { type Literal, toSQL } from './util'
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
||||
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
|
||||
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableUpdate, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
|
||||
|
||||
export { Query }
|
||||
export type { EmbeddingFunction }
|
||||
@@ -261,6 +262,39 @@ export interface Table<T = number[]> {
|
||||
*/
|
||||
delete: (filter: string) => Promise<void>
|
||||
|
||||
/**
|
||||
* Update rows in this table.
|
||||
*
|
||||
* This can be used to update a single row, many rows, all rows, or
|
||||
* sometimes no rows (if your predicate matches nothing).
|
||||
*
|
||||
* @param args see {@link UpdateArgs} and {@link UpdateSqlArgs} for more details
|
||||
*
|
||||
* @examples
|
||||
*
|
||||
* ```ts
|
||||
* const con = await lancedb.connect("./.lancedb")
|
||||
* const data = [
|
||||
* {id: 1, vector: [3, 3], name: 'Ye'},
|
||||
* {id: 2, vector: [4, 4], name: 'Mike'},
|
||||
* ];
|
||||
* const tbl = await con.createTable("my_table", data)
|
||||
*
|
||||
* await tbl.update({
|
||||
* filter: "id = 2",
|
||||
* updates: { vector: [2, 2], name: "Michael" },
|
||||
* })
|
||||
*
|
||||
* let results = await tbl.search([1, 1]).execute();
|
||||
* // Returns [
|
||||
* // {id: 2, vector: [2, 2], name: 'Michael'}
|
||||
* // {id: 1, vector: [3, 3], name: 'Ye'}
|
||||
* // ]
|
||||
* ```
|
||||
*
|
||||
*/
|
||||
update: (args: UpdateArgs | UpdateSqlArgs) => Promise<void>
|
||||
|
||||
/**
|
||||
* List the indicies on this table.
|
||||
*/
|
||||
@@ -272,6 +306,34 @@ export interface Table<T = number[]> {
|
||||
indexStats: (indexUuid: string) => Promise<IndexStats>
|
||||
}
|
||||
|
||||
export interface UpdateArgs {
|
||||
/**
|
||||
* A filter in the same format used by a sql WHERE clause. The filter may be empty,
|
||||
* in which case all rows will be updated.
|
||||
*/
|
||||
where?: string
|
||||
|
||||
/**
|
||||
* A key-value map of updates. The keys are the column names, and the values are the
|
||||
* new values to set
|
||||
*/
|
||||
values: Record<string, Literal>
|
||||
}
|
||||
|
||||
export interface UpdateSqlArgs {
|
||||
/**
|
||||
* A filter in the same format used by a sql WHERE clause. The filter may be empty,
|
||||
* in which case all rows will be updated.
|
||||
*/
|
||||
where?: string
|
||||
|
||||
/**
|
||||
* A key-value map of updates. The keys are the column names, and the values are the
|
||||
* new values to set as SQL expressions.
|
||||
*/
|
||||
valuesSql: Record<string, string>
|
||||
}
|
||||
|
||||
export interface VectorIndex {
|
||||
columns: string[]
|
||||
name: string
|
||||
@@ -426,6 +488,16 @@ export class LocalTable<T = number[]> implements Table<T> {
|
||||
return new Query(query, this._tbl, this._embeddings)
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a filter query to find all rows matching the specified criteria
|
||||
* @param value The filter criteria (like SQL where clause syntax)
|
||||
*/
|
||||
filter (value: string): Query<T> {
|
||||
return new Query(undefined, this._tbl, this._embeddings).filter(value)
|
||||
}
|
||||
|
||||
where = this.filter
|
||||
|
||||
/**
|
||||
* Insert records into this Table.
|
||||
*
|
||||
@@ -481,6 +553,31 @@ export class LocalTable<T = number[]> implements Table<T> {
|
||||
return tableDelete.call(this._tbl, filter).then((newTable: any) => { this._tbl = newTable })
|
||||
}
|
||||
|
||||
/**
|
||||
* Update rows in this table.
|
||||
*
|
||||
* @param args see {@link UpdateArgs} and {@link UpdateSqlArgs} for more details
|
||||
*
|
||||
* @returns
|
||||
*/
|
||||
async update (args: UpdateArgs | UpdateSqlArgs): Promise<void> {
|
||||
let filter: string | null
|
||||
let updates: Record<string, string>
|
||||
|
||||
if ('valuesSql' in args) {
|
||||
filter = args.where ?? null
|
||||
updates = args.valuesSql
|
||||
} else {
|
||||
filter = args.where ?? null
|
||||
updates = {}
|
||||
for (const [key, value] of Object.entries(args.values)) {
|
||||
updates[key] = toSQL(value)
|
||||
}
|
||||
}
|
||||
|
||||
return tableUpdate.call(this._tbl, filter, updates).then((newTable: any) => { this._tbl = newTable })
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up old versions of the table, freeing disk space.
|
||||
*
|
||||
|
||||
@@ -23,10 +23,10 @@ const { tableSearch } = require('../native.js')
|
||||
* A builder for nearest neighbor queries for LanceDB.
|
||||
*/
|
||||
export class Query<T = number[]> {
|
||||
private readonly _query: T
|
||||
private readonly _query?: T
|
||||
private readonly _tbl?: any
|
||||
private _queryVector?: number[]
|
||||
private _limit: number
|
||||
private _limit?: number
|
||||
private _refineFactor?: number
|
||||
private _nprobes: number
|
||||
private _select?: string[]
|
||||
@@ -35,10 +35,10 @@ export class Query<T = number[]> {
|
||||
private _prefilter: boolean
|
||||
protected readonly _embeddings?: EmbeddingFunction<T>
|
||||
|
||||
constructor (query: T, tbl?: any, embeddings?: EmbeddingFunction<T>) {
|
||||
constructor (query?: T, tbl?: any, embeddings?: EmbeddingFunction<T>) {
|
||||
this._tbl = tbl
|
||||
this._query = query
|
||||
this._limit = 10
|
||||
this._limit = undefined
|
||||
this._nprobes = 20
|
||||
this._refineFactor = undefined
|
||||
this._select = undefined
|
||||
@@ -113,10 +113,12 @@ export class Query<T = number[]> {
|
||||
* Execute the query and return the results as an Array of Objects
|
||||
*/
|
||||
async execute<T = Record<string, unknown>> (): Promise<T[]> {
|
||||
if (this._embeddings !== undefined) {
|
||||
this._queryVector = (await this._embeddings.embed([this._query]))[0]
|
||||
} else {
|
||||
this._queryVector = this._query as number[]
|
||||
if (this._query !== undefined) {
|
||||
if (this._embeddings !== undefined) {
|
||||
this._queryVector = (await this._embeddings.embed([this._query]))[0]
|
||||
} else {
|
||||
this._queryVector = this._query as number[]
|
||||
}
|
||||
}
|
||||
|
||||
const isElectron = this.isElectron()
|
||||
|
||||
@@ -16,7 +16,8 @@ import {
|
||||
type EmbeddingFunction, type Table, type VectorIndexParams, type Connection,
|
||||
type ConnectionOptions, type CreateTableOptions, type VectorIndex,
|
||||
type WriteOptions,
|
||||
type IndexStats
|
||||
type IndexStats,
|
||||
type UpdateArgs, type UpdateSqlArgs
|
||||
} from '../index'
|
||||
import { Query } from '../query'
|
||||
|
||||
@@ -246,6 +247,10 @@ export class RemoteTable<T = number[]> implements Table<T> {
|
||||
await this._client.post(`/v1/table/${this._name}/delete/`, { predicate: filter })
|
||||
}
|
||||
|
||||
async update (args: UpdateArgs | UpdateSqlArgs): Promise<void> {
|
||||
throw new Error('Not implemented')
|
||||
}
|
||||
|
||||
async listIndices (): Promise<VectorIndex[]> {
|
||||
const results = await this._client.post(`/v1/table/${this._name}/index/list/`)
|
||||
return results.data.indexes?.map((index: any) => ({
|
||||
|
||||
@@ -78,12 +78,31 @@ describe('LanceDB client', function () {
|
||||
})
|
||||
|
||||
it('limits # of results', async function () {
|
||||
const uri = await createTestDB()
|
||||
const uri = await createTestDB(2, 100)
|
||||
const con = await lancedb.connect(uri)
|
||||
const table = await con.openTable('vectors')
|
||||
const results = await table.search([0.1, 0.3]).limit(1).execute()
|
||||
let results = await table.search([0.1, 0.3]).limit(1).execute()
|
||||
assert.equal(results.length, 1)
|
||||
assert.equal(results[0].id, 1)
|
||||
|
||||
// there is a default limit if unspecified
|
||||
results = await table.search([0.1, 0.3]).execute()
|
||||
assert.equal(results.length, 10)
|
||||
})
|
||||
|
||||
it('uses a filter / where clause without vector search', async function () {
|
||||
// eslint-disable-next-line @typescript-eslint/explicit-function-return-type
|
||||
const assertResults = (results: Array<Record<string, unknown>>) => {
|
||||
assert.equal(results.length, 50)
|
||||
}
|
||||
|
||||
const uri = await createTestDB(2, 100)
|
||||
const con = await lancedb.connect(uri)
|
||||
const table = (await con.openTable('vectors')) as LocalTable
|
||||
let results = await table.filter('id % 2 = 0').execute()
|
||||
assertResults(results)
|
||||
results = await table.where('id % 2 = 0').execute()
|
||||
assertResults(results)
|
||||
})
|
||||
|
||||
it('uses a filter / where clause', async function () {
|
||||
@@ -260,6 +279,46 @@ describe('LanceDB client', function () {
|
||||
assert.equal(await table.countRows(), 2)
|
||||
})
|
||||
|
||||
it('can update records in the table', async function () {
|
||||
const uri = await createTestDB()
|
||||
const con = await lancedb.connect(uri)
|
||||
|
||||
const table = await con.openTable('vectors')
|
||||
assert.equal(await table.countRows(), 2)
|
||||
|
||||
await table.update({ where: 'price = 10', valuesSql: { price: '100' } })
|
||||
const results = await table.search([0.1, 0.2]).execute()
|
||||
assert.equal(results[0].price, 100)
|
||||
assert.equal(results[1].price, 11)
|
||||
})
|
||||
|
||||
it('can update the records using a literal value', async function () {
|
||||
const uri = await createTestDB()
|
||||
const con = await lancedb.connect(uri)
|
||||
|
||||
const table = await con.openTable('vectors')
|
||||
assert.equal(await table.countRows(), 2)
|
||||
|
||||
await table.update({ where: 'price = 10', values: { price: 100 } })
|
||||
const results = await table.search([0.1, 0.2]).execute()
|
||||
assert.equal(results[0].price, 100)
|
||||
assert.equal(results[1].price, 11)
|
||||
})
|
||||
|
||||
it('can update every record in the table', async function () {
|
||||
const uri = await createTestDB()
|
||||
const con = await lancedb.connect(uri)
|
||||
|
||||
const table = await con.openTable('vectors')
|
||||
assert.equal(await table.countRows(), 2)
|
||||
|
||||
await table.update({ valuesSql: { price: '100' } })
|
||||
const results = await table.search([0.1, 0.2]).execute()
|
||||
|
||||
assert.equal(results[0].price, 100)
|
||||
assert.equal(results[1].price, 100)
|
||||
})
|
||||
|
||||
it('can delete records from a table', async function () {
|
||||
const uri = await createTestDB()
|
||||
const con = await lancedb.connect(uri)
|
||||
@@ -542,7 +601,7 @@ describe('Compact and cleanup', function () {
|
||||
|
||||
// should have no effect, but this validates the arguments are parsed.
|
||||
await table.compactFiles({
|
||||
targetRowsPerFragment: 1024 * 10,
|
||||
targetRowsPerFragment: 102410,
|
||||
maxRowsPerGroup: 1024,
|
||||
materializeDeletions: true,
|
||||
materializeDeletionsThreshold: 0.5,
|
||||
|
||||
45
node/src/test/util.ts
Normal file
45
node/src/test/util.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
// Copyright 2023 LanceDB Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import { toSQL } from '../util'
|
||||
import * as chai from 'chai'
|
||||
|
||||
const expect = chai.expect
|
||||
|
||||
describe('toSQL', function () {
|
||||
it('should turn string to SQL expression', function () {
|
||||
expect(toSQL('foo')).to.equal("'foo'")
|
||||
})
|
||||
|
||||
it('should turn number to SQL expression', function () {
|
||||
expect(toSQL(123)).to.equal('123')
|
||||
})
|
||||
|
||||
it('should turn boolean to SQL expression', function () {
|
||||
expect(toSQL(true)).to.equal('TRUE')
|
||||
})
|
||||
|
||||
it('should turn null to SQL expression', function () {
|
||||
expect(toSQL(null)).to.equal('NULL')
|
||||
})
|
||||
|
||||
it('should turn Date to SQL expression', function () {
|
||||
const date = new Date('05 October 2011 14:48 UTC')
|
||||
expect(toSQL(date)).to.equal("'2011-10-05T14:48:00.000Z'")
|
||||
})
|
||||
|
||||
it('should turn array to SQL expression', function () {
|
||||
expect(toSQL(['foo', 'bar', true, 1])).to.equal("['foo', 'bar', TRUE, 1]")
|
||||
})
|
||||
})
|
||||
44
node/src/util.ts
Normal file
44
node/src/util.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
// Copyright 2023 LanceDB Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
export type Literal = string | number | boolean | null | Date | Literal[]
|
||||
|
||||
export function toSQL (value: Literal): string {
|
||||
if (typeof value === 'string') {
|
||||
return `'${value}'`
|
||||
}
|
||||
|
||||
if (typeof value === 'number') {
|
||||
return value.toString()
|
||||
}
|
||||
|
||||
if (typeof value === 'boolean') {
|
||||
return value ? 'TRUE' : 'FALSE'
|
||||
}
|
||||
|
||||
if (value === null) {
|
||||
return 'NULL'
|
||||
}
|
||||
|
||||
if (value instanceof Date) {
|
||||
return `'${value.toISOString()}'`
|
||||
}
|
||||
|
||||
if (Array.isArray(value)) {
|
||||
return `[${value.map(toSQL).join(', ')}]`
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/restrict-template-expressions
|
||||
throw new Error(`Unsupported value type: ${typeof value} value: (${value})`)
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.3.4
|
||||
current_version = 0.3.5
|
||||
commit = True
|
||||
message = [python] Bump version: {current_version} → {new_version}
|
||||
tag = True
|
||||
|
||||
@@ -17,7 +17,7 @@ import inspect
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
|
||||
|
||||
import lance
|
||||
import numpy as np
|
||||
@@ -30,7 +30,7 @@ from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||
from .pydantic import LanceModel
|
||||
from .query import LanceQueryBuilder, Query
|
||||
from .util import fs_from_uri, safe_import_pandas
|
||||
from .util import fs_from_uri, safe_import_pandas, value_to_sql
|
||||
from .utils.events import register_event
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -913,30 +913,35 @@ class LanceTable(Table):
|
||||
def delete(self, where: str):
|
||||
self._dataset.delete(where)
|
||||
|
||||
def update(self, where: str, values: dict):
|
||||
def update(
|
||||
self,
|
||||
where: Optional[str] = None,
|
||||
values: Optional[dict] = None,
|
||||
*,
|
||||
values_sql: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
EXPERIMENTAL: Update rows in the table (not threadsafe).
|
||||
|
||||
This can be used to update zero to all rows depending on how many
|
||||
rows match the where clause.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
where: str
|
||||
where: str, optional
|
||||
The SQL where clause to use when updating rows. For example, 'x = 2'
|
||||
or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error.
|
||||
values: dict
|
||||
values: dict, optional
|
||||
The values to update. The keys are the column names and the values
|
||||
are the values to set.
|
||||
values_sql: dict, optional
|
||||
The values to update, expressed as SQL expression strings. These can
|
||||
reference existing columns. For example, {"x": "x + 1"} will increment
|
||||
the x column by 1.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import lancedb
|
||||
>>> data = [
|
||||
... {"x": 1, "vector": [1, 2]},
|
||||
... {"x": 2, "vector": [3, 4]},
|
||||
... {"x": 3, "vector": [5, 6]}
|
||||
... ]
|
||||
>>> import pandas as pd
|
||||
>>> data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})
|
||||
>>> db = lancedb.connect("./.lancedb")
|
||||
>>> table = db.create_table("my_table", data)
|
||||
>>> table.to_pandas()
|
||||
@@ -952,18 +957,15 @@ class LanceTable(Table):
|
||||
2 2 [10.0, 10.0]
|
||||
|
||||
"""
|
||||
orig_data = self._dataset.to_table(filter=where).combine_chunks()
|
||||
if len(orig_data) == 0:
|
||||
return
|
||||
for col, val in values.items():
|
||||
i = orig_data.column_names.index(col)
|
||||
if i < 0:
|
||||
raise ValueError(f"Column {col} does not exist")
|
||||
orig_data = orig_data.set_column(
|
||||
i, col, pa.array([val] * len(orig_data), type=orig_data[col].type)
|
||||
)
|
||||
self.delete(where)
|
||||
self.add(orig_data, mode="append")
|
||||
if values is not None and values_sql is not None:
|
||||
raise ValueError("Only one of values or values_sql can be provided")
|
||||
if values is None and values_sql is None:
|
||||
raise ValueError("Either values or values_sql must be provided")
|
||||
|
||||
if values is not None:
|
||||
values_sql = {k: value_to_sql(v) for k, v in values.items()}
|
||||
|
||||
self.to_lance().update(values_sql, where)
|
||||
self._reset_dataset()
|
||||
register_event("update")
|
||||
|
||||
|
||||
@@ -12,9 +12,12 @@
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from datetime import date, datetime
|
||||
from functools import singledispatch
|
||||
from typing import Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import numpy as np
|
||||
import pyarrow.fs as pa_fs
|
||||
|
||||
|
||||
@@ -88,3 +91,53 @@ def safe_import_pandas():
|
||||
return pd
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
|
||||
@singledispatch
|
||||
def value_to_sql(value):
|
||||
raise NotImplementedError("SQL conversion is not implemented for this type")
|
||||
|
||||
|
||||
@value_to_sql.register(str)
|
||||
def _(value: str):
|
||||
return f"'{value}'"
|
||||
|
||||
|
||||
@value_to_sql.register(int)
|
||||
def _(value: int):
|
||||
return str(value)
|
||||
|
||||
|
||||
@value_to_sql.register(float)
|
||||
def _(value: float):
|
||||
return str(value)
|
||||
|
||||
|
||||
@value_to_sql.register(bool)
|
||||
def _(value: bool):
|
||||
return str(value).upper()
|
||||
|
||||
|
||||
@value_to_sql.register(type(None))
|
||||
def _(value: type(None)):
|
||||
return "NULL"
|
||||
|
||||
|
||||
@value_to_sql.register(datetime)
|
||||
def _(value: datetime):
|
||||
return f"'{value.isoformat()}'"
|
||||
|
||||
|
||||
@value_to_sql.register(date)
|
||||
def _(value: date):
|
||||
return f"'{value.isoformat()}'"
|
||||
|
||||
|
||||
@value_to_sql.register(list)
|
||||
def _(value: list):
|
||||
return "[" + ", ".join(map(value_to_sql, value)) + "]"
|
||||
|
||||
|
||||
@value_to_sql.register(np.ndarray)
|
||||
def _(value: np.ndarray):
|
||||
return value_to_sql(value.tolist())
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
[project]
|
||||
name = "lancedb"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.8.17",
|
||||
"pylance==0.8.21",
|
||||
"ratelimiter~=1.0",
|
||||
"retry>=0.9.2",
|
||||
"tqdm>=4.1.0",
|
||||
"tqdm>=4.27.0",
|
||||
"aiohttp",
|
||||
"pydantic>=1.10",
|
||||
"attrs>=21.3.0",
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
from datetime import timedelta
|
||||
from datetime import date, datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from unittest.mock import PropertyMock, patch
|
||||
@@ -348,14 +348,79 @@ def test_update(db):
|
||||
assert len(table) == 2
|
||||
assert len(table.list_versions()) == 2
|
||||
table.update(where="id=0", values={"vector": [1.1, 1.1]})
|
||||
assert len(table.list_versions()) == 4
|
||||
assert table.version == 4
|
||||
assert len(table.list_versions()) == 3
|
||||
assert table.version == 3
|
||||
assert len(table) == 2
|
||||
v = table.to_arrow()["vector"].combine_chunks()
|
||||
v = v.values.to_numpy().reshape(2, 2)
|
||||
assert np.allclose(v, np.array([[1.2, 1.9], [1.1, 1.1]]))
|
||||
|
||||
|
||||
def test_update_types(db):
|
||||
table = LanceTable.create(
|
||||
db,
|
||||
"my_table",
|
||||
data=[
|
||||
{
|
||||
"id": 0,
|
||||
"str": "foo",
|
||||
"float": 1.1,
|
||||
"timestamp": datetime(2021, 1, 1),
|
||||
"date": date(2021, 1, 1),
|
||||
"vector1": [1.0, 0.0],
|
||||
"vector2": [1.0, 1.0],
|
||||
}
|
||||
],
|
||||
)
|
||||
# Update with SQL
|
||||
table.update(
|
||||
values_sql=dict(
|
||||
id="1",
|
||||
str="'bar'",
|
||||
float="2.2",
|
||||
timestamp="TIMESTAMP '2021-01-02 00:00:00'",
|
||||
date="DATE '2021-01-02'",
|
||||
vector1="[2.0, 2.0]",
|
||||
vector2="[3.0, 3.0]",
|
||||
)
|
||||
)
|
||||
actual = table.to_arrow().to_pylist()[0]
|
||||
expected = dict(
|
||||
id=1,
|
||||
str="bar",
|
||||
float=2.2,
|
||||
timestamp=datetime(2021, 1, 2),
|
||||
date=date(2021, 1, 2),
|
||||
vector1=[2.0, 2.0],
|
||||
vector2=[3.0, 3.0],
|
||||
)
|
||||
assert actual == expected
|
||||
|
||||
# Update with values
|
||||
table.update(
|
||||
values=dict(
|
||||
id=2,
|
||||
str="baz",
|
||||
float=3.3,
|
||||
timestamp=datetime(2021, 1, 3),
|
||||
date=date(2021, 1, 3),
|
||||
vector1=[3.0, 3.0],
|
||||
vector2=np.array([4.0, 4.0]),
|
||||
)
|
||||
)
|
||||
actual = table.to_arrow().to_pylist()[0]
|
||||
expected = dict(
|
||||
id=2,
|
||||
str="baz",
|
||||
float=3.3,
|
||||
timestamp=datetime(2021, 1, 3),
|
||||
date=date(2021, 1, 3),
|
||||
vector1=[3.0, 3.0],
|
||||
vector2=[4.0, 4.0],
|
||||
)
|
||||
assert actual == expected
|
||||
|
||||
|
||||
def test_create_with_embedding_function(db):
|
||||
class MyTable(LanceModel):
|
||||
text: str
|
||||
|
||||
@@ -237,6 +237,7 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
|
||||
cx.export_function("tableAdd", JsTable::js_add)?;
|
||||
cx.export_function("tableCountRows", JsTable::js_count_rows)?;
|
||||
cx.export_function("tableDelete", JsTable::js_delete)?;
|
||||
cx.export_function("tableUpdate", JsTable::js_update)?;
|
||||
cx.export_function("tableCleanupOldVersions", JsTable::js_cleanup)?;
|
||||
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
|
||||
cx.export_function("tableListIndices", JsTable::js_list_indices)?;
|
||||
|
||||
@@ -23,8 +23,14 @@ impl JsQuery {
|
||||
let query_obj = cx.argument::<JsObject>(0)?;
|
||||
|
||||
let limit = query_obj
|
||||
.get::<JsNumber, _, _>(&mut cx, "_limit")?
|
||||
.value(&mut cx);
|
||||
.get_opt::<JsNumber, _, _>(&mut cx, "_limit")?
|
||||
.map(|value| {
|
||||
let limit = value.value(&mut cx) as u64;
|
||||
if limit <= 0 {
|
||||
panic!("Limit must be a positive integer");
|
||||
}
|
||||
limit
|
||||
});
|
||||
let select = query_obj
|
||||
.get_opt::<JsArray, _, _>(&mut cx, "_select")?
|
||||
.map(|arr| {
|
||||
@@ -48,7 +54,9 @@ impl JsQuery {
|
||||
.map(|s| s.value(&mut cx))
|
||||
.map(|s| MetricType::try_from(s.as_str()).unwrap());
|
||||
|
||||
let prefilter = query_obj.get::<JsBoolean, _, _>(&mut cx, "_prefilter")?.value(&mut cx);
|
||||
let prefilter = query_obj
|
||||
.get::<JsBoolean, _, _>(&mut cx, "_prefilter")?
|
||||
.value(&mut cx);
|
||||
|
||||
let is_electron = cx
|
||||
.argument::<JsBoolean>(1)
|
||||
@@ -59,20 +67,23 @@ impl JsQuery {
|
||||
|
||||
let (deferred, promise) = cx.promise();
|
||||
let channel = cx.channel();
|
||||
let query_vector = query_obj.get::<JsArray, _, _>(&mut cx, "_queryVector")?;
|
||||
let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);
|
||||
let query_vector = query_obj.get_opt::<JsArray, _, _>(&mut cx, "_queryVector")?;
|
||||
let table = js_table.table.clone();
|
||||
let query = query_vector.map(|q| convert::js_array_to_vec(q.deref(), &mut cx));
|
||||
|
||||
rt.spawn(async move {
|
||||
let builder = table
|
||||
.search(Float32Array::from(query))
|
||||
.limit(limit as usize)
|
||||
let mut builder = table
|
||||
.search(query.map(|q| Float32Array::from(q)))
|
||||
.refine_factor(refine_factor)
|
||||
.nprobes(nprobes)
|
||||
.filter(filter)
|
||||
.metric_type(metric_type)
|
||||
.select(select)
|
||||
.prefilter(prefilter);
|
||||
if let Some(limit) = limit {
|
||||
builder = builder.limit(limit as usize);
|
||||
};
|
||||
|
||||
let record_batch_stream = builder.execute();
|
||||
let results = record_batch_stream
|
||||
.and_then(|stream| {
|
||||
|
||||
@@ -165,6 +165,69 @@ impl JsTable {
|
||||
Ok(promise)
|
||||
}
|
||||
|
||||
pub(crate) fn js_update(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||
let mut table = js_table.table.clone();
|
||||
|
||||
let rt = runtime(&mut cx)?;
|
||||
let (deferred, promise) = cx.promise();
|
||||
let channel = cx.channel();
|
||||
|
||||
// create a vector of updates from the passed map
|
||||
let updates_arg = cx.argument::<JsObject>(1)?;
|
||||
let properties = updates_arg.get_own_property_names(&mut cx)?;
|
||||
let mut updates: Vec<(String, String)> =
|
||||
Vec::with_capacity(properties.len(&mut cx) as usize);
|
||||
|
||||
let len_properties = properties.len(&mut cx);
|
||||
for i in 0..len_properties {
|
||||
let property = properties
|
||||
.get_value(&mut cx, i)?
|
||||
.downcast_or_throw::<JsString, _>(&mut cx)?;
|
||||
|
||||
let value = updates_arg
|
||||
.get_value(&mut cx, property.clone())?
|
||||
.downcast_or_throw::<JsString, _>(&mut cx)?;
|
||||
|
||||
let property = property.value(&mut cx);
|
||||
let value = value.value(&mut cx);
|
||||
updates.push((property, value));
|
||||
}
|
||||
|
||||
// get the filter/predicate if the user passed one
|
||||
let predicate = cx.argument_opt(0);
|
||||
let predicate = predicate.unwrap().downcast::<JsString, _>(&mut cx);
|
||||
let predicate = match predicate {
|
||||
Ok(_) => {
|
||||
let val = predicate.map(|s| s.value(&mut cx)).unwrap();
|
||||
Some(val)
|
||||
}
|
||||
Err(_) => {
|
||||
// if the predicate is not string, check it's null otherwise an invalid
|
||||
// type was passed
|
||||
cx.argument::<JsNull>(0)?;
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
rt.spawn(async move {
|
||||
let updates_arg = updates
|
||||
.iter()
|
||||
.map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let predicate = predicate.as_ref().map(|s| s.as_str());
|
||||
|
||||
let update_result = table.update(predicate, updates_arg).await;
|
||||
deferred.settle_with(&channel, move |mut cx| {
|
||||
update_result.or_throw(&mut cx)?;
|
||||
Ok(cx.boxed(JsTable::from(table)))
|
||||
})
|
||||
});
|
||||
|
||||
Ok(promise)
|
||||
}
|
||||
|
||||
pub(crate) fn js_cleanup(mut cx: FunctionContext) -> JsResult<JsPromise> {
|
||||
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
|
||||
let rt = runtime(&mut cx)?;
|
||||
|
||||
@@ -359,7 +359,7 @@ mod test {
|
||||
assert_eq!(t.count_rows().await.unwrap(), 100);
|
||||
|
||||
let q = t
|
||||
.search(PrimitiveArray::from_iter_values(vec![0.1, 0.1, 0.1, 0.1]))
|
||||
.search(Some(PrimitiveArray::from_iter_values(vec![0.1, 0.1, 0.1, 0.1])))
|
||||
.limit(10)
|
||||
.execute()
|
||||
.await
|
||||
|
||||
@@ -24,8 +24,8 @@ use crate::error::Result;
|
||||
/// A builder for nearest neighbor queries for LanceDB.
|
||||
pub struct Query {
|
||||
pub dataset: Arc<Dataset>,
|
||||
pub query_vector: Float32Array,
|
||||
pub limit: usize,
|
||||
pub query_vector: Option<Float32Array>,
|
||||
pub limit: Option<usize>,
|
||||
pub filter: Option<String>,
|
||||
pub select: Option<Vec<String>>,
|
||||
pub nprobes: usize,
|
||||
@@ -46,11 +46,11 @@ impl Query {
|
||||
/// # Returns
|
||||
///
|
||||
/// * A [Query] object.
|
||||
pub(crate) fn new(dataset: Arc<Dataset>, vector: Float32Array) -> Self {
|
||||
pub(crate) fn new(dataset: Arc<Dataset>, vector: Option<Float32Array>) -> Self {
|
||||
Query {
|
||||
dataset,
|
||||
query_vector: vector,
|
||||
limit: 10,
|
||||
limit: None,
|
||||
nprobes: 20,
|
||||
refine_factor: None,
|
||||
metric_type: None,
|
||||
@@ -69,11 +69,13 @@ impl Query {
|
||||
pub async fn execute(&self) -> Result<DatasetRecordBatchStream> {
|
||||
let mut scanner: Scanner = self.dataset.scan();
|
||||
|
||||
scanner.nearest(
|
||||
crate::table::VECTOR_COLUMN_NAME,
|
||||
&self.query_vector,
|
||||
self.limit,
|
||||
)?;
|
||||
if let Some(query) = self.query_vector.as_ref() {
|
||||
// If there is a vector query, default to limit=10 if unspecified
|
||||
scanner.nearest(crate::table::VECTOR_COLUMN_NAME, query, self.limit.unwrap_or(10))?;
|
||||
} else {
|
||||
// If there is no vector query, it's ok to not have a limit
|
||||
scanner.limit(self.limit.map(|limit| limit as i64), None)?;
|
||||
}
|
||||
scanner.nprobs(self.nprobes);
|
||||
scanner.use_index(self.use_index);
|
||||
scanner.prefilter(self.prefilter);
|
||||
@@ -91,7 +93,7 @@ impl Query {
|
||||
///
|
||||
/// * `limit` - The maximum number of results to return.
|
||||
pub fn limit(mut self, limit: usize) -> Query {
|
||||
self.limit = limit;
|
||||
self.limit = Some(limit);
|
||||
self
|
||||
}
|
||||
|
||||
@@ -101,7 +103,7 @@ impl Query {
|
||||
///
|
||||
/// * `vector` - The vector that will be used for search.
|
||||
pub fn query_vector(mut self, query_vector: Float32Array) -> Query {
|
||||
self.query_vector = query_vector;
|
||||
self.query_vector = Some(query_vector);
|
||||
self
|
||||
}
|
||||
|
||||
@@ -174,7 +176,7 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::*;
|
||||
use arrow_array::{Float32Array, RecordBatch, RecordBatchIterator, RecordBatchReader};
|
||||
use arrow_array::{Float32Array, RecordBatch, RecordBatchIterator, RecordBatchReader, cast::AsArray, Int32Array};
|
||||
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
|
||||
use futures::StreamExt;
|
||||
use lance::dataset::Dataset;
|
||||
@@ -187,7 +189,7 @@ mod tests {
|
||||
let batches = make_test_batches();
|
||||
let ds = Dataset::write(batches, "memory://foo", None).await.unwrap();
|
||||
|
||||
let vector = Float32Array::from_iter_values([0.1, 0.2]);
|
||||
let vector = Some(Float32Array::from_iter_values([0.1, 0.2]));
|
||||
let query = Query::new(Arc::new(ds), vector.clone());
|
||||
assert_eq!(query.query_vector, vector);
|
||||
|
||||
@@ -201,8 +203,8 @@ mod tests {
|
||||
.metric_type(Some(MetricType::Cosine))
|
||||
.refine_factor(Some(999));
|
||||
|
||||
assert_eq!(query.query_vector, new_vector);
|
||||
assert_eq!(query.limit, 100);
|
||||
assert_eq!(query.query_vector.unwrap(), new_vector);
|
||||
assert_eq!(query.limit.unwrap(), 100);
|
||||
assert_eq!(query.nprobes, 1000);
|
||||
assert_eq!(query.use_index, true);
|
||||
assert_eq!(query.metric_type, Some(MetricType::Cosine));
|
||||
@@ -214,7 +216,7 @@ mod tests {
|
||||
let batches = make_non_empty_batches();
|
||||
let ds = Arc::new(Dataset::write(batches, "memory://foo", None).await.unwrap());
|
||||
|
||||
let vector = Float32Array::from_iter_values([0.1; 4]);
|
||||
let vector = Some(Float32Array::from_iter_values([0.1; 4]));
|
||||
|
||||
let query = Query::new(ds.clone(), vector.clone());
|
||||
let result = query
|
||||
@@ -244,6 +246,27 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_execute_no_vector() {
|
||||
// test that it's ok to not specify a query vector (just filter / limit)
|
||||
let batches = make_non_empty_batches();
|
||||
let ds = Arc::new(Dataset::write(batches, "memory://foo", None).await.unwrap());
|
||||
|
||||
let query = Query::new(ds.clone(), None);
|
||||
let result = query
|
||||
.filter(Some("id % 2 == 0".to_string()))
|
||||
.execute()
|
||||
.await;
|
||||
let mut stream = result.expect("should have result");
|
||||
// should only have one batch
|
||||
while let Some(batch) = stream.next().await {
|
||||
let b = batch.expect("should be Ok");
|
||||
// cast arr into Int32Array
|
||||
let arr: &Int32Array = b["id"].as_primitive();
|
||||
assert!(arr.iter().all(|x| x.unwrap() % 2 == 0));
|
||||
}
|
||||
}
|
||||
|
||||
fn make_non_empty_batches() -> impl RecordBatchReader + Send + 'static {
|
||||
let vec = Box::new(RandomVector::new().named("vector".to_string()));
|
||||
let id = Box::new(IncrementingInt32::new().named("id".to_string()));
|
||||
|
||||
@@ -23,7 +23,7 @@ use lance::dataset::cleanup::RemovalStats;
|
||||
use lance::dataset::optimize::{
|
||||
compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
|
||||
};
|
||||
use lance::dataset::{Dataset, WriteParams};
|
||||
use lance::dataset::{Dataset, UpdateBuilder, WriteParams};
|
||||
use lance::index::DatasetIndexExt;
|
||||
use lance::io::object_store::WrappingObjectStore;
|
||||
use std::path::Path;
|
||||
@@ -308,10 +308,14 @@ impl Table {
|
||||
/// # Returns
|
||||
///
|
||||
/// * A [Query] object.
|
||||
pub fn search(&self, query_vector: Float32Array) -> Query {
|
||||
pub fn search(&self, query_vector: Option<Float32Array>) -> Query {
|
||||
Query::new(self.dataset.clone(), query_vector)
|
||||
}
|
||||
|
||||
pub fn filter(&self, expr: String) -> Query {
|
||||
Query::new(self.dataset.clone(), None).filter(Some(expr))
|
||||
}
|
||||
|
||||
/// Returns the number of rows in this Table
|
||||
pub async fn count_rows(&self) -> Result<usize> {
|
||||
Ok(self.dataset.count_rows().await?)
|
||||
@@ -338,6 +342,27 @@ impl Table {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn update(
|
||||
&mut self,
|
||||
predicate: Option<&str>,
|
||||
updates: Vec<(&str, &str)>,
|
||||
) -> Result<()> {
|
||||
let mut builder = UpdateBuilder::new(self.dataset.clone());
|
||||
if let Some(predicate) = predicate {
|
||||
builder = builder.update_where(predicate)?;
|
||||
}
|
||||
|
||||
for (column, value) in updates {
|
||||
builder = builder.set(column, value)?;
|
||||
}
|
||||
|
||||
let operation = builder.build()?;
|
||||
let new_ds = operation.execute().await?;
|
||||
self.dataset = new_ds;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove old versions of the dataset from disk.
|
||||
///
|
||||
/// # Arguments
|
||||
@@ -413,11 +438,14 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::{
|
||||
Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchIterator,
|
||||
RecordBatchReader,
|
||||
Array, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array,
|
||||
Int32Array, Int64Array, LargeStringArray, RecordBatch, RecordBatchIterator,
|
||||
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
||||
UInt32Array,
|
||||
};
|
||||
use arrow_data::ArrayDataBuilder;
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use arrow_schema::{DataType, Field, Schema, TimeUnit};
|
||||
use futures::TryStreamExt;
|
||||
use lance::dataset::{Dataset, WriteMode};
|
||||
use lance::index::vector::pq::PQBuildParams;
|
||||
use lance::io::object_store::{ObjectStoreParams, WrappingObjectStore};
|
||||
@@ -540,6 +568,272 @@ mod tests {
|
||||
assert_eq!(table.name, "test");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_update_with_predicate() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let dataset_path = tmp_dir.path().join("test.lance");
|
||||
let uri = dataset_path.to_str().unwrap();
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let record_batch_iter = RecordBatchIterator::new(
|
||||
vec![RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..10)),
|
||||
Arc::new(StringArray::from_iter_values(vec![
|
||||
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
|
||||
])),
|
||||
],
|
||||
)
|
||||
.unwrap()]
|
||||
.into_iter()
|
||||
.map(Ok),
|
||||
schema.clone(),
|
||||
);
|
||||
|
||||
Dataset::write(record_batch_iter, uri, None).await.unwrap();
|
||||
let mut table = Table::open(uri).await.unwrap();
|
||||
|
||||
table
|
||||
.update(Some("id > 5"), vec![("name", "'foo'")])
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let ds_after = Dataset::open(uri).await.unwrap();
|
||||
let mut batches = ds_after
|
||||
.scan()
|
||||
.project(&["id", "name"])
|
||||
.unwrap()
|
||||
.try_into_stream()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(batch) = batches.pop() {
|
||||
let ids = batch
|
||||
.column(0)
|
||||
.as_any()
|
||||
.downcast_ref::<Int32Array>()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.collect::<Vec<_>>();
|
||||
let names = batch
|
||||
.column(1)
|
||||
.as_any()
|
||||
.downcast_ref::<StringArray>()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.collect::<Vec<_>>();
|
||||
for (i, name) in names.iter().enumerate() {
|
||||
let id = ids[i].unwrap();
|
||||
let name = name.unwrap();
|
||||
if id > 5 {
|
||||
assert_eq!(name, "foo");
|
||||
} else {
|
||||
assert_eq!(name, &format!("{}", (b'a' + id as u8) as char));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_update_all_types() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let dataset_path = tmp_dir.path().join("test.lance");
|
||||
let uri = dataset_path.to_str().unwrap();
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("int32", DataType::Int32, false),
|
||||
Field::new("int64", DataType::Int64, false),
|
||||
Field::new("uint32", DataType::UInt32, false),
|
||||
Field::new("string", DataType::Utf8, false),
|
||||
Field::new("large_string", DataType::LargeUtf8, false),
|
||||
Field::new("float32", DataType::Float32, false),
|
||||
Field::new("float64", DataType::Float64, false),
|
||||
Field::new("bool", DataType::Boolean, false),
|
||||
Field::new("date32", DataType::Date32, false),
|
||||
Field::new(
|
||||
"timestamp_ns",
|
||||
DataType::Timestamp(TimeUnit::Nanosecond, None),
|
||||
false,
|
||||
),
|
||||
Field::new(
|
||||
"timestamp_ms",
|
||||
DataType::Timestamp(TimeUnit::Millisecond, None),
|
||||
false,
|
||||
),
|
||||
Field::new(
|
||||
"vec_f32",
|
||||
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 2),
|
||||
false,
|
||||
),
|
||||
Field::new(
|
||||
"vec_f64",
|
||||
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float64, true)), 2),
|
||||
false,
|
||||
),
|
||||
]));
|
||||
|
||||
let record_batch_iter = RecordBatchIterator::new(
|
||||
vec![RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from_iter_values(0..10)),
|
||||
Arc::new(Int64Array::from_iter_values(0..10)),
|
||||
Arc::new(UInt32Array::from_iter_values(0..10)),
|
||||
Arc::new(StringArray::from_iter_values(vec![
|
||||
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
|
||||
])),
|
||||
Arc::new(LargeStringArray::from_iter_values(vec![
|
||||
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
|
||||
])),
|
||||
Arc::new(Float32Array::from_iter_values(
|
||||
(0..10).into_iter().map(|i| i as f32),
|
||||
)),
|
||||
Arc::new(Float64Array::from_iter_values(
|
||||
(0..10).into_iter().map(|i| i as f64),
|
||||
)),
|
||||
Arc::new(Into::<BooleanArray>::into(vec![
|
||||
true, false, true, false, true, false, true, false, true, false,
|
||||
])),
|
||||
Arc::new(Date32Array::from_iter_values(0..10)),
|
||||
Arc::new(TimestampNanosecondArray::from_iter_values(0..10)),
|
||||
Arc::new(TimestampMillisecondArray::from_iter_values(0..10)),
|
||||
Arc::new(
|
||||
create_fixed_size_list(
|
||||
Float32Array::from_iter_values((0..20).into_iter().map(|i| i as f32)),
|
||||
2,
|
||||
)
|
||||
.unwrap(),
|
||||
),
|
||||
Arc::new(
|
||||
create_fixed_size_list(
|
||||
Float64Array::from_iter_values((0..20).into_iter().map(|i| i as f64)),
|
||||
2,
|
||||
)
|
||||
.unwrap(),
|
||||
),
|
||||
],
|
||||
)
|
||||
.unwrap()]
|
||||
.into_iter()
|
||||
.map(Ok),
|
||||
schema.clone(),
|
||||
);
|
||||
|
||||
Dataset::write(record_batch_iter, uri, None).await.unwrap();
|
||||
let mut table = Table::open(uri).await.unwrap();
|
||||
|
||||
// check it can do update for each type
|
||||
let updates: Vec<(&str, &str)> = vec![
|
||||
("string", "'foo'"),
|
||||
("large_string", "'large_foo'"),
|
||||
("int32", "1"),
|
||||
("int64", "1"),
|
||||
("uint32", "1"),
|
||||
("float32", "1.0"),
|
||||
("float64", "1.0"),
|
||||
("bool", "true"),
|
||||
("date32", "1"),
|
||||
("timestamp_ns", "1"),
|
||||
("timestamp_ms", "1"),
|
||||
("vec_f32", "[1.0, 1.0]"),
|
||||
("vec_f64", "[1.0, 1.0]"),
|
||||
];
|
||||
|
||||
// for (column, value) in test_cases {
|
||||
table.update(None, updates).await.unwrap();
|
||||
|
||||
let ds_after = Dataset::open(uri).await.unwrap();
|
||||
let mut batches = ds_after
|
||||
.scan()
|
||||
.project(&[
|
||||
"string",
|
||||
"large_string",
|
||||
"int32",
|
||||
"int64",
|
||||
"uint32",
|
||||
"float32",
|
||||
"float64",
|
||||
"bool",
|
||||
"date32",
|
||||
"timestamp_ns",
|
||||
"timestamp_ms",
|
||||
"vec_f32",
|
||||
"vec_f64",
|
||||
])
|
||||
.unwrap()
|
||||
.try_into_stream()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
let batch = batches.pop().unwrap();
|
||||
|
||||
macro_rules! assert_column {
|
||||
($column:expr, $array_type:ty, $expected:expr) => {
|
||||
let array = $column
|
||||
.as_any()
|
||||
.downcast_ref::<$array_type>()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.collect::<Vec<_>>();
|
||||
for v in array {
|
||||
assert_eq!(v, Some($expected));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
assert_column!(batch.column(0), StringArray, "foo");
|
||||
assert_column!(batch.column(1), LargeStringArray, "large_foo");
|
||||
assert_column!(batch.column(2), Int32Array, 1);
|
||||
assert_column!(batch.column(3), Int64Array, 1);
|
||||
assert_column!(batch.column(4), UInt32Array, 1);
|
||||
assert_column!(batch.column(5), Float32Array, 1.0);
|
||||
assert_column!(batch.column(6), Float64Array, 1.0);
|
||||
assert_column!(batch.column(7), BooleanArray, true);
|
||||
assert_column!(batch.column(8), Date32Array, 1);
|
||||
assert_column!(batch.column(9), TimestampNanosecondArray, 1);
|
||||
assert_column!(batch.column(10), TimestampMillisecondArray, 1);
|
||||
|
||||
let array = batch
|
||||
.column(11)
|
||||
.as_any()
|
||||
.downcast_ref::<FixedSizeListArray>()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.collect::<Vec<_>>();
|
||||
for v in array {
|
||||
let v = v.unwrap();
|
||||
let f32array = v.as_any().downcast_ref::<Float32Array>().unwrap();
|
||||
for v in f32array {
|
||||
assert_eq!(v, Some(1.0));
|
||||
}
|
||||
}
|
||||
|
||||
let array = batch
|
||||
.column(12)
|
||||
.as_any()
|
||||
.downcast_ref::<FixedSizeListArray>()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.collect::<Vec<_>>();
|
||||
for v in array {
|
||||
let v = v.unwrap();
|
||||
let f64array = v.as_any().downcast_ref::<Float64Array>().unwrap();
|
||||
for v in f64array {
|
||||
assert_eq!(v, Some(1.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_search() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
@@ -554,8 +848,8 @@ mod tests {
|
||||
let table = Table::open(uri).await.unwrap();
|
||||
|
||||
let vector = Float32Array::from_iter_values([0.1, 0.2]);
|
||||
let query = table.search(vector.clone());
|
||||
assert_eq!(vector, query.query_vector);
|
||||
let query = table.search(Some(vector.clone()));
|
||||
assert_eq!(vector, query.query_vector.unwrap());
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
|
||||
Reference in New Issue
Block a user