Compare commits

..

20 Commits

Author SHA1 Message Date
Lance Release
27a638362d Bump version: 0.3.4 → 0.3.5 2023-10-26 21:47:44 +00:00
Bert
22a6695d7a fix conv version (#605) 2023-10-26 17:44:11 -04:00
Lance Release
57eff82ee7 Updating package-lock.json 2023-10-26 21:03:07 +00:00
Lance Release
7732f7d41c Bump version: 0.3.3 → 0.3.4 2023-10-26 21:02:52 +00:00
Bert
5ca98c326f feat: added dataset stats api to node (#604) 2023-10-26 17:00:48 -04:00
Bert
b55db397eb feat: added data stats apis (#596) 2023-10-26 13:10:17 -04:00
Rob Meng
c04d72ac8a expose remap index api (#603)
expose index remap options in `compact_files`
2023-10-25 22:10:37 -04:00
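
For context: compaction rewrites data files, so row addresses move and any existing vector index must be remapped to the new layout. A rough sketch of the knob this exposes, in Python-style pseudocode with hypothetical names (`table`, `options`, `remap_options`) mirroring the Rust signature changed further down:

```python
# Hypothetical call shape mirroring the new Rust signature
# `Table::compact_files(options, remap_options)`; the actual
# Node/Python bindings may expose this differently.
metrics = table.compact_files(
    options={"target_rows_per_fragment": 1_000_000},  # illustrative option
    remap_options=None,  # None keeps the default index remapping
)
```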
Rob Meng
28b02fb72a feat: expose optimize index api (#602)
expose `optimize_index` api.
2023-10-25 19:40:23 -04:00
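
Roughly speaking, optimizing an index folds rows written after the last index build into the existing ANN index so they become searchable without a full rebuild. A hedged sketch, assuming a Python-style binding of the Rust `optimize_indices` method added further down (`table` and `new_rows` are placeholders):

```python
# Hypothetical binding of the Rust `Table::optimize_indices`;
# the exposed API surface may differ per language binding.
table.add(new_rows)        # rows added after the last index build
table.optimize_indices()   # fold them into the existing vector index
```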
Lance Release
f3cf986777 [python] Bump version: 0.3.1 → 0.3.2 2023-10-24 19:06:38 +00:00
Bert
c73fcc8898 update lance to 0.8.7 (#598) 2023-10-24 14:49:36 -04:00
Chang She
cd9debc3b7 fix(python): fix multiple embedding functions bug (#597)
Closes #594

The embedding functions are pydantic models, so multiple instances with
the same parameters compare equal under `==`. This means that if you have
multiple embedding columns, it's possible for the embeddings to get
overwritten. We now compare with `is` instead of `==` to avoid this problem.

testing: modified unit test to include this case
2023-10-24 13:05:05 -04:00
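
The pitfall is easy to reproduce with a minimal stand-in model (illustrative class and field names, not the actual registry code): two separately constructed pydantic models with identical field values compare equal under `==`, while `is` still distinguishes them.

```python
from pydantic import BaseModel

class Embedder(BaseModel):
    """Stand-in for an embedding function config (illustrative only)."""
    model_name: str = "all-MiniLM-L6-v2"

f1 = Embedder()
f2 = Embedder()

assert f1 == f2        # pydantic compares by field values
assert f1 is not f2    # identity still tells the two instances apart
```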
Rob Meng
26a97ba997 feat: add checkout method to table to reuse existing store and connections (#593)
Prior to this PR, to get a new version of a table, we need to re-open
the table. This has a few downsides w.r.t. performance:
* Object store is recreated, which takes time and throws away existing
warm connections
* Commit handler is thrown away as well, which may also contain warm
connections
2023-10-23 12:06:13 -04:00
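
In usage terms, the goal is roughly the following pattern, sketched with the Python client; `checkout_latest` here is a hypothetical binding of the Rust method added in the diff below, and the actual surface may differ:

```python
import lancedb

db = lancedb.connect("s3://bucket/my-db")
tbl = db.open_table("events")

# Before: picking up a newer table version meant re-opening, which
# rebuilds the object store client and drops warm connections.
tbl = db.open_table("events")

# After: a checkout-style call can swap to the latest version while
# reusing the existing store and commit handler (hypothetical binding
# of the Rust `Table::checkout_latest` shown further down).
tbl = tbl.checkout_latest()
```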
Rob Meng
ce19fedb08 feat: include manifest files in mirror store (#589) 2023-10-21 12:21:41 -04:00
Will Jones
14e8e48de2 Revert "[python] Bump version: 0.3.2 → 0.3.3"
This reverts commit c30faf6083.
2023-10-20 17:52:49 -07:00
Will Jones
c30faf6083 [python] Bump version: 0.3.2 → 0.3.3 2023-10-20 17:30:00 -07:00
Ayush Chaurasia
64a4f025bb [Docs]: Minor Fixes (#587)
* Filename typo
* Remove rick_morty csv as users won't really be able to use it. We can
create an executable colab and download it from a bucket or something.
2023-10-20 16:14:35 +02:00
Ayush Chaurasia
6dc968e7d3 [Docs] Embeddings API: Add multi-lingual semantic search example (#582) 2023-10-20 18:40:49 +05:30
Ayush Chaurasia
06b5b69f1e [Docs] Versioning docs (#586)
closes #564

---------

Co-authored-by: Chang She <chang@lancedb.com>
2023-10-20 18:40:16 +05:30
Lance Release
6bd3a838fc Updating package-lock.json 2023-10-19 20:45:39 +00:00
Lance Release
f36fea8f20 Updating package-lock.json 2023-10-19 20:06:10 +00:00
23 changed files with 2122 additions and 138 deletions


@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.3
current_version = 0.3.5
commit = True
message = Bump version: {current_version} → {new_version}
tag = True


@@ -5,9 +5,9 @@ exclude = ["python"]
resolver = "2"
[workspace.dependencies]
lance = { "version" = "=0.8.6", "features" = ["dynamodb"] }
lance-linalg = { "version" = "=0.8.6" }
lance-testing = { "version" = "=0.8.6" }
lance = { "version" = "=0.8.8", "features" = ["dynamodb"] }
lance-linalg = { "version" = "=0.8.8" }
lance-testing = { "version" = "=0.8.8" }
# Note that this one does not include pyarrow
arrow = { version = "47.0.0", optional = false }
arrow-array = "47.0"
@@ -18,8 +18,8 @@ arrow-schema = "47.0"
arrow-arith = "47.0"
arrow-cast = "47.0"
chrono = "0.4.23"
half = { "version" = "=2.2.1", default-features = false, features = [
"num-traits"
half = { "version" = "=2.3.1", default-features = false, features = [
"num-traits",
] }
log = "0.4"
object_store = "0.7.1"


@@ -73,12 +73,14 @@ nav:
- Vector Search: search.md
- SQL filters: sql.md
- Indexing: ann_indexes.md
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
- 🧬 Embeddings:
- embeddings/index.md
- Ingest Embedding Functions: embeddings/embedding_functions.md
- Available Functions: embeddings/default_embedding_functions.md
- Create Custom Embedding Functions: embeddings/api.md
- Example- MultiModal CLIP Embeddings: notebooks/DisappearingEmbeddingFunction.ipynb
- Example - Multi-lingual semantic search: notebooks/multi_lingual_example.ipynb
- Example - MultiModal CLIP Embeddings: notebooks/DisappearingEmbeddingFunction.ipynb
- 🔍 Python full-text search: fts.md
- 🔌 Integrations:
- integrations/index.md
@@ -110,12 +112,14 @@ nav:
- Vector Search: search.md
- SQL filters: sql.md
- Indexing: ann_indexes.md
- Versioning & Reproducibility: notebooks/reproducibility.ipynb
- Embeddings:
- embeddings/index.md
- Ingest Embedding Functions: embeddings/embedding_functions.md
- Available Functions: embeddings/default_embedding_functions.md
- Create Custom Embedding Functions: embeddings/api.md
- Example- MultiModal CLIP Embeddings: notebooks/DisappearingEmbeddingFunction.ipynb
- Example - Multi-lingual semantic search: notebooks/multi_lingual_example.ipynb
- Example - MultiModal CLIP Embeddings: notebooks/DisappearingEmbeddingFunction.ipynb
- Python full-text search: fts.md
- Integrations:
- integrations/index.md


@@ -1,5 +1,13 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "88c1af18",
"metadata": {},
"source": [
"# Example - MultiModal CLIP Embeddings"
]
},
{
"cell_type": "markdown",
"id": "c6b5d346-2c2a-4341-a132-00e53543f8d1",

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

node/package-lock.json (generated, 104 changed lines)

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.3.2",
"version": "0.3.4",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.3.2",
"version": "0.3.4",
"cpu": [
"x64",
"arm64"
@@ -53,11 +53,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.3.2",
"@lancedb/vectordb-darwin-x64": "0.3.2",
"@lancedb/vectordb-linux-arm64-gnu": "0.3.2",
"@lancedb/vectordb-linux-x64-gnu": "0.3.2",
"@lancedb/vectordb-win32-x64-msvc": "0.3.2"
"@lancedb/vectordb-darwin-arm64": "0.3.4",
"@lancedb/vectordb-darwin-x64": "0.3.4",
"@lancedb/vectordb-linux-arm64-gnu": "0.3.4",
"@lancedb/vectordb-linux-x64-gnu": "0.3.4",
"@lancedb/vectordb-win32-x64-msvc": "0.3.4"
}
},
"node_modules/@apache-arrow/ts": {
@@ -316,66 +316,6 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.2.tgz",
"integrity": "sha512-CDh+sU2k4xVfWauwDZnybma8AJ+Q2i0SzHg05BwgDcani7I0k60NjJ5GobpgQ38xOiEmwHllES1xs4NRh+1YkA==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.2.tgz",
"integrity": "sha512-xevyA+M/UE8ttaNkx68AyIUKlyWMhIzOECx0hbyN1zfShJe2UcunQcmbM1NxUi7EywodByyiP7bfMI1ZR1Y4Mw==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.2.tgz",
"integrity": "sha512-mSKkQ/p6UTSLwWzfZMBS7wA6Gf335KljXLaOhdT4TUI/jC6e9/cvZKkXRgpdE9/gvfl4/WVzKY7sg3+azDYQ+A==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.2.tgz",
"integrity": "sha512-S1D0VwdidwyfIKE58t94rD+EEb5B64ORMVkTw5FBZJirShkk82+0G9H3jNgWrRMt1PB3Qn1286/wqDLQ9+fTsA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.2.tgz",
"integrity": "sha512-tnct1hf9GAlMchhYU6Lqmbm2nUKPO8apS7tuTIiucQh6gx+vbHmFZHFNHhw1AUJTpsj/eH2Z9iNayuC5Scdvhw==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
@@ -4868,36 +4808,6 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"@lancedb/vectordb-darwin-arm64": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.3.2.tgz",
"integrity": "sha512-CDh+sU2k4xVfWauwDZnybma8AJ+Q2i0SzHg05BwgDcani7I0k60NjJ5GobpgQ38xOiEmwHllES1xs4NRh+1YkA==",
"optional": true
},
"@lancedb/vectordb-darwin-x64": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.3.2.tgz",
"integrity": "sha512-xevyA+M/UE8ttaNkx68AyIUKlyWMhIzOECx0hbyN1zfShJe2UcunQcmbM1NxUi7EywodByyiP7bfMI1ZR1Y4Mw==",
"optional": true
},
"@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.3.2.tgz",
"integrity": "sha512-mSKkQ/p6UTSLwWzfZMBS7wA6Gf335KljXLaOhdT4TUI/jC6e9/cvZKkXRgpdE9/gvfl4/WVzKY7sg3+azDYQ+A==",
"optional": true
},
"@lancedb/vectordb-linux-x64-gnu": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.3.2.tgz",
"integrity": "sha512-S1D0VwdidwyfIKE58t94rD+EEb5B64ORMVkTw5FBZJirShkk82+0G9H3jNgWrRMt1PB3Qn1286/wqDLQ9+fTsA==",
"optional": true
},
"@lancedb/vectordb-win32-x64-msvc": {
"version": "0.3.2",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.3.2.tgz",
"integrity": "sha512-tnct1hf9GAlMchhYU6Lqmbm2nUKPO8apS7tuTIiucQh6gx+vbHmFZHFNHhw1AUJTpsj/eH2Z9iNayuC5Scdvhw==",
"optional": true
},
"@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",


@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.3.3",
"version": "0.3.5",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -81,10 +81,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.3.3",
"@lancedb/vectordb-darwin-x64": "0.3.3",
"@lancedb/vectordb-linux-arm64-gnu": "0.3.3",
"@lancedb/vectordb-linux-x64-gnu": "0.3.3",
"@lancedb/vectordb-win32-x64-msvc": "0.3.3"
"@lancedb/vectordb-darwin-arm64": "0.3.5",
"@lancedb/vectordb-darwin-x64": "0.3.5",
"@lancedb/vectordb-linux-arm64-gnu": "0.3.5",
"@lancedb/vectordb-linux-x64-gnu": "0.3.5",
"@lancedb/vectordb-win32-x64-msvc": "0.3.5"
}
}


@@ -23,7 +23,7 @@ import { Query } from './query'
import { isEmbeddingFunction } from './embedding/embedding_function'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles } = require('../native.js')
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
export { Query }
export type { EmbeddingFunction }
@@ -260,6 +260,27 @@ export interface Table<T = number[]> {
* ```
*/
delete: (filter: string) => Promise<void>
/**
* List the indices on this table.
*/
listIndices: () => Promise<VectorIndex[]>
/**
* Get statistics about an index.
*/
indexStats: (indexUuid: string) => Promise<IndexStats>
}
export interface VectorIndex {
columns: string[]
name: string
uuid: string
}
export interface IndexStats {
numIndexedRows: number | null
numUnindexedRows: number | null
}
/**
@@ -502,6 +523,14 @@ export class LocalTable<T = number[]> implements Table<T> {
return res.metrics
})
}
async listIndices (): Promise<VectorIndex[]> {
return tableListIndices.call(this._tbl)
}
async indexStats (indexUuid: string): Promise<IndexStats> {
return tableIndexStats.call(this._tbl, indexUuid)
}
}
export interface CleanupStats {


@@ -65,8 +65,8 @@ describe('LanceDB Mirrored Store Integration test', function () {
const mirroredPath = path.join(dir, `${tableName}.lance`)
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be two dirs
assert.equal(files.length, 2)
// there should be three dirs
assert.equal(files.length, 3)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())
@@ -76,6 +76,12 @@ describe('LanceDB Mirrored Store Integration test', function () {
assert.isTrue(files[0].name.endsWith('.txn'))
})
fs.readdir(path.join(mirroredPath, '_versions'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
assert.isTrue(files[0].name.endsWith('.manifest'))
})
fs.readdir(path.join(mirroredPath, 'data'), { withFileTypes: true }, (err, files) => {
if (err != null) throw err
assert.equal(files.length, 1)
@@ -88,8 +94,8 @@ describe('LanceDB Mirrored Store Integration test', function () {
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be two dirs
assert.equal(files.length, 3)
// there should be four dirs
assert.equal(files.length, 4)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())
assert.isTrue(files[2].isDirectory())
@@ -128,12 +134,13 @@ describe('LanceDB Mirrored Store Integration test', function () {
fs.readdir(mirroredPath, { withFileTypes: true }, (err, files) => {
if (err != null) throw err
// there should be two dirs
assert.equal(files.length, 4)
// there should be five dirs
assert.equal(files.length, 5)
assert.isTrue(files[0].isDirectory())
assert.isTrue(files[1].isDirectory())
assert.isTrue(files[2].isDirectory())
assert.isTrue(files[3].isDirectory())
assert.isTrue(files[4].isDirectory())
// Three TXs now
fs.readdir(path.join(mirroredPath, '_transactions'), { withFileTypes: true }, (err, files) => {


@@ -14,7 +14,9 @@
import {
type EmbeddingFunction, type Table, type VectorIndexParams, type Connection,
type ConnectionOptions, type CreateTableOptions, type WriteOptions
type ConnectionOptions, type CreateTableOptions, type VectorIndex,
type WriteOptions,
type IndexStats
} from '../index'
import { Query } from '../query'
@@ -241,4 +243,21 @@ export class RemoteTable<T = number[]> implements Table<T> {
async delete (filter: string): Promise<void> {
await this._client.post(`/v1/table/${this._name}/delete/`, { predicate: filter })
}
async listIndices (): Promise<VectorIndex[]> {
const results = await this._client.post(`/v1/table/${this._name}/index/list/`)
return results.data.indexes?.map((index: any) => ({
columns: index.columns,
name: index.index_name,
uuid: index.index_uuid
}))
}
async indexStats (indexUuid: string): Promise<IndexStats> {
const results = await this._client.post(`/v1/table/${this._name}/index/${indexUuid}/stats/`)
return {
numIndexedRows: results.data.num_indexed_rows,
numUnindexedRows: results.data.num_unindexed_rows
}
}
}


@@ -328,6 +328,24 @@ describe('LanceDB client', function () {
const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: -1, max_iters: 2, num_sub_vectors: 2 })
await expect(createIndex).to.be.rejectedWith('num_partitions: must be > 0')
})
it('should be able to list index and stats', async function () {
const uri = await createTestDB(32, 300)
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
await table.createIndex({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2, num_sub_vectors: 2 })
const indices = await table.listIndices()
expect(indices).to.have.lengthOf(1)
expect(indices[0].name).to.equal('vector_idx')
expect(indices[0].uuid).to.not.be.equal(undefined)
expect(indices[0].columns).to.have.lengthOf(1)
expect(indices[0].columns[0]).to.equal('vector')
const stats = await table.indexStats(indices[0].uuid)
expect(stats.numIndexedRows).to.equal(300)
expect(stats.numUnindexedRows).to.equal(0)
}).timeout(50_000)
})
describe('when using a custom embedding function', function () {


@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.1
current_version = 0.3.2
commit = True
message = [python] Bump version: {current_version} → {new_version}
tag = True


@@ -327,7 +327,12 @@ class LanceModel(pydantic.BaseModel):
for vec, func in vec_and_function:
for source, field_info in cls.safe_get_fields().items():
src_func = get_extras(field_info, "source_column_for")
if src_func == func:
if src_func is func:
# note: we can't use == here since the function is a pydantic
# model, so two instances with the same parameters compare equal.
# With multiple vector columns from multiple sources, == would
# map both to the same source column.
# GH594
configs.append(
EmbeddingFunctionConfig(
source_column=source, vector_column=vec, function=func


@@ -1,9 +1,9 @@
[project]
name = "lancedb"
version = "0.3.1"
version = "0.3.2"
dependencies = [
"deprecation",
"pylance==0.8.6",
"pylance==0.8.7",
"ratelimiter~=1.0",
"retry>=0.9.2",
"tqdm>=4.1.0",


@@ -33,10 +33,13 @@ def test_sentence_transformer(alias, tmp_path):
db = lancedb.connect(tmp_path)
registry = get_registry()
func = registry.get(alias).create()
func2 = registry.get(alias).create()
class Words(LanceModel):
text: str = func.SourceField()
text2: str = func2.SourceField()
vector: Vector(func.ndims()) = func.VectorField()
vector2: Vector(func2.ndims()) = func2.VectorField()
table = db.create_table("words", schema=Words)
table.add(
@@ -50,7 +53,16 @@ def test_sentence_transformer(alias, tmp_path):
"foo",
"bar",
"baz",
]
],
"text2": [
"to be or not to be",
"that is the question",
"for whether tis nobler",
"in the mind to suffer",
"the slings and arrows",
"of outrageous fortune",
"or to take arms",
],
}
)
)
@@ -62,6 +74,13 @@ def test_sentence_transformer(alias, tmp_path):
expected = table.search(vec).limit(1).to_pydantic(Words)[0]
assert actual.text == expected.text
assert actual.text == "hello world"
assert not np.allclose(actual.vector, actual.vector2)
actual = (
table.search(query, vector_column_name="vector2").limit(1).to_pydantic(Words)[0]
)
assert actual.text != "hello world"
assert not np.allclose(actual.vector, actual.vector2)
@pytest.mark.slow


@@ -1,6 +1,6 @@
[package]
name = "vectordb-node"
version = "0.3.3"
version = "0.3.5"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"


@@ -239,6 +239,8 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
cx.export_function("tableDelete", JsTable::js_delete)?;
cx.export_function("tableCleanupOldVersions", JsTable::js_cleanup)?;
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
cx.export_function("tableListIndices", JsTable::js_list_indices)?;
cx.export_function("tableIndexStats", JsTable::js_index_stats)?;
cx.export_function(
"tableCreateVectorIndex",
index::vector::table_create_vector_index,


@@ -247,7 +247,7 @@ impl JsTable {
}
rt.spawn(async move {
let stats = table.compact_files(options).await;
let stats = table.compact_files(options, None).await;
deferred.settle_with(&channel, move |mut cx| {
let stats = stats.or_throw(&mut cx)?;
@@ -276,4 +276,91 @@ impl JsTable {
});
Ok(promise)
}
pub(crate) fn js_list_indices(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
// let predicate = cx.argument::<JsString>(0)?.value(&mut cx);
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let indices = table.load_indices().await;
deferred.settle_with(&channel, move |mut cx| {
let indices = indices.or_throw(&mut cx)?;
let output = JsArray::new(&mut cx, indices.len() as u32);
for (i, index) in indices.iter().enumerate() {
let js_index = JsObject::new(&mut cx);
let index_name = cx.string(index.index_name.clone());
js_index.set(&mut cx, "name", index_name)?;
let index_uuid = cx.string(index.index_uuid.clone());
js_index.set(&mut cx, "uuid", index_uuid)?;
let js_index_columns = JsArray::new(&mut cx, index.columns.len() as u32);
for (j, column) in index.columns.iter().enumerate() {
let js_column = cx.string(column.clone());
js_index_columns.set(&mut cx, j as u32, js_column)?;
}
js_index.set(&mut cx, "columns", js_index_columns)?;
output.set(&mut cx, i as u32, js_index)?;
}
Ok(output)
})
});
Ok(promise)
}
pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let index_uuid = cx.argument::<JsString>(0)?.value(&mut cx);
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let load_stats = futures::try_join!(
table.count_indexed_rows(&index_uuid),
table.count_unindexed_rows(&index_uuid)
);
deferred.settle_with(&channel, move |mut cx| {
let (indexed_rows, unindexed_rows) = load_stats.or_throw(&mut cx)?;
let output = JsObject::new(&mut cx);
match indexed_rows {
Some(x) => {
let i = cx.number(x as f64);
output.set(&mut cx, "numIndexedRows", i)?;
}
None => {
let null = cx.null();
output.set(&mut cx, "numIndexedRows", null)?;
}
};
match unindexed_rows {
Some(x) => {
let i = cx.number(x as f64);
output.set(&mut cx, "numUnindexedRows", i)?;
}
None => {
let null = cx.null();
output.set(&mut cx, "numUnindexedRows", null)?;
}
};
Ok(output)
})
});
Ok(promise)
}
}


@@ -1,6 +1,6 @@
[package]
name = "vectordb"
version = "0.3.3"
version = "0.3.5"
edition = "2021"
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license = "Apache-2.0"


@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use lance::format::{Index, Manifest};
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::VectorIndexParams;
@@ -106,6 +107,27 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
}
}
pub struct VectorIndex {
pub columns: Vec<String>,
pub index_name: String,
pub index_uuid: String,
}
impl VectorIndex {
pub fn new_from_format(manifest: &Manifest, index: &Index) -> VectorIndex {
let fields = index
.fields
.iter()
.map(|i| manifest.schema.fields[*i as usize].name.clone())
.collect();
VectorIndex {
columns: fields,
index_name: index.name.clone(),
index_uuid: index.uuid.to_string(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;


@@ -57,7 +57,7 @@ trait PrimaryOnly {
impl PrimaryOnly for Path {
fn primary_only(&self) -> bool {
self.to_string().contains("manifest")
self.filename().unwrap_or("") == "_latest.manifest"
}
}
@@ -118,8 +118,10 @@ impl ObjectStore for MirroringObjectStore {
self.primary.head(location).await
}
// garbage collection on the secondary will happen asynchronously by other means
async fn delete(&self, location: &Path) -> Result<()> {
if !location.primary_only() {
self.secondary.delete(location).await?;
}
self.primary.delete(location).await
}
@@ -132,7 +134,7 @@ impl ObjectStore for MirroringObjectStore {
}
async fn copy(&self, from: &Path, to: &Path) -> Result<()> {
if from.primary_only() {
if to.primary_only() {
self.primary.copy(from, to).await
} else {
self.secondary.copy(from, to).await?;
@@ -142,6 +144,9 @@ impl ObjectStore for MirroringObjectStore {
}
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> {
if !to.primary_only() {
self.secondary.copy(from, to).await?;
}
self.primary.copy_if_not_exists(from, to).await
}
}
@@ -379,7 +384,7 @@ mod test {
let primary_f = primary_elem.unwrap().unwrap();
// hit manifest, skip, _versions contains all the manifest and should not exist on secondary
let primary_raw_path = primary_f.file_name().to_str().unwrap();
if primary_raw_path.contains("manifest") || primary_raw_path.contains("_versions") {
if primary_raw_path.contains("_latest.manifest") {
primary_elem = primary_iter.next();
continue;
}
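
Restating the routing rule from the Rust diff above: after this change, only the mutable `_latest.manifest` pointer stays primary-only, while versioned manifests are mirrored to the secondary store. A tiny Python sketch of the predicate (hypothetical helper, for illustration only):

```python
def primary_only(path: str) -> bool:
    # Only the mutable "latest" pointer is primary-only; versioned
    # manifests are now mirrored to the secondary store.
    return path.rsplit("/", 1)[-1] == "_latest.manifest"

assert primary_only("table.lance/_latest.manifest")
assert not primary_only("table.lance/_versions/1.manifest")
```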


@@ -18,14 +18,16 @@ use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
use arrow_schema::SchemaRef;
use lance::dataset::cleanup::RemovalStats;
use lance::dataset::optimize::{compact_files, CompactionMetrics, CompactionOptions};
use lance::dataset::optimize::{
compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
};
use lance::dataset::{Dataset, WriteParams};
use lance::index::IndexType;
use lance::index::{DatasetIndexExt, IndexType};
use lance::io::object_store::WrappingObjectStore;
use std::path::Path;
use crate::error::{Error, Result};
use crate::index::vector::VectorIndexBuilder;
use crate::index::vector::{VectorIndexBuilder, VectorIndex};
use crate::query::Query;
use crate::utils::{PatchReadParam, PatchWriteParam};
use crate::WriteMode;
@@ -153,6 +155,22 @@ impl Table {
})
}
pub async fn checkout_latest(&self) -> Result<Self> {
let latest_version_id = self.dataset.latest_version_id().await?;
let dataset = if latest_version_id == self.dataset.version().version {
self.dataset.clone()
} else {
Arc::new(self.dataset.checkout_version(latest_version_id).await?)
};
Ok(Table {
name: self.name.clone(),
uri: self.uri.clone(),
dataset,
store_wrapper: self.store_wrapper.clone(),
})
}
fn get_table_name(uri: &str) -> Result<String> {
let path = Path::new(uri);
let name = path
@@ -222,8 +240,6 @@ impl Table {
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
let mut dataset = self.dataset.as_ref().clone();
dataset
.create_index(
@@ -241,6 +257,14 @@ impl Table {
Ok(())
}
pub async fn optimize_indices(&mut self) -> Result<()> {
let mut dataset = self.dataset.as_ref().clone();
dataset.optimize_indices().await?;
Ok(())
}
/// Insert records into this Table
///
/// # Arguments
@@ -337,12 +361,44 @@ impl Table {
/// for faster reads.
///
/// This calls into [lance::dataset::optimize::compact_files].
pub async fn compact_files(&mut self, options: CompactionOptions) -> Result<CompactionMetrics> {
pub async fn compact_files(
&mut self,
options: CompactionOptions,
remap_options: Option<Arc<dyn IndexRemapperOptions>>,
) -> Result<CompactionMetrics> {
let mut dataset = self.dataset.as_ref().clone();
let metrics = compact_files(&mut dataset, options, None).await?;
let metrics = compact_files(&mut dataset, options, remap_options).await?;
self.dataset = Arc::new(dataset);
Ok(metrics)
}
pub fn count_fragments(&self) -> usize {
self.dataset.count_fragments()
}
pub fn count_deleted_rows(&self) -> usize {
self.dataset.count_deleted_rows()
}
pub fn num_small_files(&self, max_rows_per_group: usize) -> usize {
self.dataset.num_small_files(max_rows_per_group)
}
pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
Ok(self.dataset.count_indexed_rows(index_uuid).await?)
}
pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
Ok(self.dataset.count_unindexed_rows(index_uuid).await?)
}
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
let (indices, mf) = futures::try_join!(
self.dataset.load_indices(),
self.dataset.latest_manifest()
)?;
Ok(indices.iter().map(|i| VectorIndex::new_from_format(&mf, i)).collect())
}
}
#[cfg(test)]