Compare commits

..

5 Commits

Author SHA1 Message Date
Lance Release
7732f7d41c Bump version: 0.3.3 → 0.3.4 2023-10-26 21:02:52 +00:00
Bert
5ca98c326f feat: added dataset stats api to node (#604) 2023-10-26 17:00:48 -04:00
Bert
b55db397eb feat: added data stats apis (#596) 2023-10-26 13:10:17 -04:00
Rob Meng
c04d72ac8a expose remap index api (#603)
expose index remap options in `compact_files`
2023-10-25 22:10:37 -04:00
Rob Meng
28b02fb72a feat: expose optimize index api (#602)
expose `optimize_index` api.
2023-10-25 19:40:23 -04:00
12 changed files with 241 additions and 24 deletions

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.3
current_version = 0.3.4
commit = True
message = Bump version: {current_version} → {new_version}
tag = True

View File

@@ -5,9 +5,9 @@ exclude = ["python"]
resolver = "2"
[workspace.dependencies]
lance = { "version" = "=0.8.7", "features" = ["dynamodb"] }
lance-linalg = { "version" = "=0.8.7" }
lance-testing = { "version" = "=0.8.7" }
lance = { "version" = "=0.8.8", "features" = ["dynamodb"] }
lance-linalg = { "version" = "=0.8.8" }
lance-testing = { "version" = "=0.8.8" }
# Note that this one does not include pyarrow
arrow = { version = "47.0.0", optional = false }
arrow-array = "47.0"
@@ -19,7 +19,7 @@ arrow-arith = "47.0"
arrow-cast = "47.0"
chrono = "0.4.23"
half = { "version" = "=2.3.1", default-features = false, features = [
"num-traits"
"num-traits",
] }
log = "0.4"
object_store = "0.7.1"

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.3.3",
"version": "0.3.4",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -81,10 +81,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.3.3",
"@lancedb/vectordb-darwin-x64": "0.3.3",
"@lancedb/vectordb-linux-arm64-gnu": "0.3.3",
"@lancedb/vectordb-linux-x64-gnu": "0.3.3",
"@lancedb/vectordb-win32-x64-msvc": "0.3.3"
"@lancedb/vectordb-darwin-arm64": "0.3.4",
"@lancedb/vectordb-darwin-x64": "0.3.4",
"@lancedb/vectordb-linux-arm64-gnu": "0.3.4",
"@lancedb/vectordb-linux-x64-gnu": "0.3.4",
"@lancedb/vectordb-win32-x64-msvc": "0.3.4"
}
}

View File

@@ -23,7 +23,7 @@ import { Query } from './query'
import { isEmbeddingFunction } from './embedding/embedding_function'
// eslint-disable-next-line @typescript-eslint/no-var-requires
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles } = require('../native.js')
const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles, tableListIndices, tableIndexStats } = require('../native.js')
export { Query }
export type { EmbeddingFunction }
@@ -260,6 +260,27 @@ export interface Table<T = number[]> {
* ```
*/
delete: (filter: string) => Promise<void>
/**
* List the indicies on this table.
*/
listIndices: () => Promise<VectorIndex[]>
/**
* Get statistics about an index.
*/
indexStats: (indexUuid: string) => Promise<IndexStats>
}
export interface VectorIndex {
columns: string[]
name: string
uuid: string
}
export interface IndexStats {
numIndexedRows: number | null
numUnindexedRows: number | null
}
/**
@@ -502,6 +523,14 @@ export class LocalTable<T = number[]> implements Table<T> {
return res.metrics
})
}
async listIndices (): Promise<VectorIndex[]> {
return tableListIndices.call(this._tbl)
}
async indexStats (indexUuid: string): Promise<IndexStats> {
return tableIndexStats.call(this._tbl, indexUuid)
}
}
export interface CleanupStats {

View File

@@ -14,7 +14,9 @@
import {
type EmbeddingFunction, type Table, type VectorIndexParams, type Connection,
type ConnectionOptions, type CreateTableOptions, type WriteOptions
type ConnectionOptions, type CreateTableOptions, type VectorIndex,
type WriteOptions,
type IndexStats
} from '../index'
import { Query } from '../query'
@@ -241,4 +243,21 @@ export class RemoteTable<T = number[]> implements Table<T> {
async delete (filter: string): Promise<void> {
await this._client.post(`/v1/table/${this._name}/delete/`, { predicate: filter })
}
async listIndices (): Promise<VectorIndex[]> {
const results = await this._client.post(`/v1/table/${this._name}/index/list/`)
return results.data.indexes?.map((index: any) => ({
columns: index.columns,
name: index.index_name,
uuid: index.index_uuid
}))
}
async indexStats (indexUuid: string): Promise<IndexStats> {
const results = await this._client.post(`/v1/table/${this._name}/index/${indexUuid}/stats/`)
return {
numIndexedRows: results.data.num_indexed_rows,
numUnindexedRows: results.data.num_unindexed_rows
}
}
}

View File

@@ -328,6 +328,24 @@ describe('LanceDB client', function () {
const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: -1, max_iters: 2, num_sub_vectors: 2 })
await expect(createIndex).to.be.rejectedWith('num_partitions: must be > 0')
})
it('should be able to list index and stats', async function () {
const uri = await createTestDB(32, 300)
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
await table.createIndex({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2, num_sub_vectors: 2 })
const indices = await table.listIndices()
expect(indices).to.have.lengthOf(1)
expect(indices[0].name).to.equal('vector_idx')
expect(indices[0].uuid).to.not.be.equal(undefined)
expect(indices[0].columns).to.have.lengthOf(1)
expect(indices[0].columns[0]).to.equal('vector')
const stats = await table.indexStats(indices[0].uuid)
expect(stats.numIndexedRows).to.equal(300)
expect(stats.numUnindexedRows).to.equal(0)
}).timeout(50_000)
})
describe('when using a custom embedding function', function () {

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb-node"
version = "0.3.3"
version = "0.3.4"
description = "Serverless, low-latency vector database for AI applications"
license = "Apache-2.0"
edition = "2018"
@@ -14,7 +14,7 @@ arrow-array = { workspace = true }
arrow-ipc = { workspace = true }
arrow-schema = { workspace = true }
chrono = { workspace = true }
conv = "0.3.3"
conv = "0.3.4"
once_cell = "1"
futures = "0.3"
half = { workspace = true }

View File

@@ -239,6 +239,8 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
cx.export_function("tableDelete", JsTable::js_delete)?;
cx.export_function("tableCleanupOldVersions", JsTable::js_cleanup)?;
cx.export_function("tableCompactFiles", JsTable::js_compact)?;
cx.export_function("tableListIndices", JsTable::js_list_indices)?;
cx.export_function("tableIndexStats", JsTable::js_index_stats)?;
cx.export_function(
"tableCreateVectorIndex",
index::vector::table_create_vector_index,

View File

@@ -247,7 +247,7 @@ impl JsTable {
}
rt.spawn(async move {
let stats = table.compact_files(options).await;
let stats = table.compact_files(options, None).await;
deferred.settle_with(&channel, move |mut cx| {
let stats = stats.or_throw(&mut cx)?;
@@ -276,4 +276,91 @@ impl JsTable {
});
Ok(promise)
}
pub(crate) fn js_list_indices(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
// let predicate = cx.argument::<JsString>(0)?.value(&mut cx);
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let indices = table.load_indices().await;
deferred.settle_with(&channel, move |mut cx| {
let indices = indices.or_throw(&mut cx)?;
let output = JsArray::new(&mut cx, indices.len() as u32);
for (i, index) in indices.iter().enumerate() {
let js_index = JsObject::new(&mut cx);
let index_name = cx.string(index.index_name.clone());
js_index.set(&mut cx, "name", index_name)?;
let index_uuid = cx.string(index.index_uuid.clone());
js_index.set(&mut cx, "uuid", index_uuid)?;
let js_index_columns = JsArray::new(&mut cx, index.columns.len() as u32);
for (j, column) in index.columns.iter().enumerate() {
let js_column = cx.string(column.clone());
js_index_columns.set(&mut cx, j as u32, js_column)?;
}
js_index.set(&mut cx, "columns", js_index_columns)?;
output.set(&mut cx, i as u32, js_index)?;
}
Ok(output)
})
});
Ok(promise)
}
pub(crate) fn js_index_stats(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let index_uuid = cx.argument::<JsString>(0)?.value(&mut cx);
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let load_stats = futures::try_join!(
table.count_indexed_rows(&index_uuid),
table.count_unindexed_rows(&index_uuid)
);
deferred.settle_with(&channel, move |mut cx| {
let (indexed_rows, unindexed_rows) = load_stats.or_throw(&mut cx)?;
let output = JsObject::new(&mut cx);
match indexed_rows {
Some(x) => {
let i = cx.number(x as f64);
output.set(&mut cx, "numIndexedRows", i)?;
}
None => {
let null = cx.null();
output.set(&mut cx, "numIndexedRows", null)?;
}
};
match unindexed_rows {
Some(x) => {
let i = cx.number(x as f64);
output.set(&mut cx, "numUnindexedRows", i)?;
}
None => {
let null = cx.null();
output.set(&mut cx, "numUnindexedRows", null)?;
}
};
Ok(output)
})
});
Ok(promise)
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "vectordb"
version = "0.3.3"
version = "0.3.4"
edition = "2021"
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license = "Apache-2.0"

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use lance::format::{Index, Manifest};
use lance::index::vector::ivf::IvfBuildParams;
use lance::index::vector::pq::PQBuildParams;
use lance::index::vector::VectorIndexParams;
@@ -106,6 +107,27 @@ impl VectorIndexBuilder for IvfPQIndexBuilder {
}
}
pub struct VectorIndex {
pub columns: Vec<String>,
pub index_name: String,
pub index_uuid: String,
}
impl VectorIndex {
pub fn new_from_format(manifest: &Manifest, index: &Index) -> VectorIndex {
let fields = index
.fields
.iter()
.map(|i| manifest.schema.fields[*i as usize].name.clone())
.collect();
VectorIndex {
columns: fields,
index_name: index.name.clone(),
index_uuid: index.uuid.to_string(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -18,14 +18,16 @@ use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
use arrow_schema::SchemaRef;
use lance::dataset::cleanup::RemovalStats;
use lance::dataset::optimize::{compact_files, CompactionMetrics, CompactionOptions};
use lance::dataset::optimize::{
compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
};
use lance::dataset::{Dataset, WriteParams};
use lance::index::IndexType;
use lance::index::{DatasetIndexExt, IndexType};
use lance::io::object_store::WrappingObjectStore;
use std::path::Path;
use crate::error::{Error, Result};
use crate::index::vector::VectorIndexBuilder;
use crate::index::vector::{VectorIndexBuilder, VectorIndex};
use crate::query::Query;
use crate::utils::{PatchReadParam, PatchWriteParam};
use crate::WriteMode;
@@ -238,8 +240,6 @@ impl Table {
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
let mut dataset = self.dataset.as_ref().clone();
dataset
.create_index(
@@ -257,6 +257,14 @@ impl Table {
Ok(())
}
pub async fn optimize_indices(&mut self) -> Result<()> {
let mut dataset = self.dataset.as_ref().clone();
dataset.optimize_indices().await?;
Ok(())
}
/// Insert records into this Table
///
/// # Arguments
@@ -353,12 +361,44 @@ impl Table {
/// for faster reads.
///
/// This calls into [lance::dataset::optimize::compact_files].
pub async fn compact_files(&mut self, options: CompactionOptions) -> Result<CompactionMetrics> {
pub async fn compact_files(
&mut self,
options: CompactionOptions,
remap_options: Option<Arc<dyn IndexRemapperOptions>>,
) -> Result<CompactionMetrics> {
let mut dataset = self.dataset.as_ref().clone();
let metrics = compact_files(&mut dataset, options, None).await?;
let metrics = compact_files(&mut dataset, options, remap_options).await?;
self.dataset = Arc::new(dataset);
Ok(metrics)
}
pub fn count_fragments(&self) -> usize {
self.dataset.count_fragments()
}
pub fn count_deleted_rows(&self) -> usize {
self.dataset.count_deleted_rows()
}
pub fn num_small_files(&self, max_rows_per_group: usize) -> usize {
self.dataset.num_small_files(max_rows_per_group)
}
pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
Ok(self.dataset.count_indexed_rows(index_uuid).await?)
}
pub async fn count_unindexed_rows(&self, index_uuid: &str) -> Result<Option<usize>> {
Ok(self.dataset.count_unindexed_rows(index_uuid).await?)
}
pub async fn load_indices(&self) -> Result<Vec<VectorIndex>> {
let (indices, mf) = futures::try_join!(
self.dataset.load_indices(),
self.dataset.latest_manifest()
)?;
Ok(indices.iter().map(|i| VectorIndex::new_from_format(&mf, i)).collect())
}
}
#[cfg(test)]