Files
lancedb/nodejs/src/table.rs
Wyatt Alt 386fc9e466 feat: add num_attempts to merge insert result (#2795)
This pipes the num_attempts field from lance's merge insert result
through lancedb. This allows callers of merge_insert to get a better
idea of whether transaction conflicts are occurring.
2025-11-19 09:32:57 -08:00

871 lines
26 KiB
Rust

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
use std::collections::HashMap;
use lancedb::ipc::ipc_file_to_batches;
use lancedb::table::{
AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, NewColumnTransform,
OptimizeAction, OptimizeOptions, Table as LanceDbTable,
};
use napi::bindgen_prelude::*;
use napi_derive::napi;
use crate::error::NapiErrorExt;
use crate::index::Index;
use crate::merge::NativeMergeInsertBuilder;
use crate::query::{Query, TakeQuery, VectorQuery};
use crate::util::schema_to_buffer;
#[napi]
pub struct Table {
// We keep a duplicate of the table name so we can use it for error
// messages even if the table has been closed
pub name: String,
pub(crate) inner: Option<LanceDbTable>,
}
impl Table {
pub(crate) fn inner_ref(&self) -> napi::Result<&LanceDbTable> {
self.inner
.as_ref()
.ok_or_else(|| napi::Error::from_reason(format!("Table {} is closed", self.name)))
}
}
#[napi]
impl Table {
pub(crate) fn new(table: LanceDbTable) -> Self {
Self {
name: table.name().to_string(),
inner: Some(table),
}
}
#[napi]
pub fn display(&self) -> String {
match &self.inner {
None => format!("ClosedTable({})", self.name),
Some(inner) => inner.to_string(),
}
}
#[napi]
pub fn is_open(&self) -> bool {
self.inner.is_some()
}
#[napi]
pub fn close(&mut self) {
self.inner.take();
}
/// Return Schema as empty Arrow IPC file.
#[napi(catch_unwind)]
pub async fn schema(&self) -> napi::Result<Buffer> {
let schema = self.inner_ref()?.schema().await.default_error()?;
schema_to_buffer(&schema)
}
#[napi(catch_unwind)]
pub async fn add(&self, buf: Buffer, mode: String) -> napi::Result<AddResult> {
let batches = ipc_file_to_batches(buf.to_vec())
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
let mut op = self.inner_ref()?.add(batches);
op = if mode == "append" {
op.mode(AddDataMode::Append)
} else if mode == "overwrite" {
op.mode(AddDataMode::Overwrite)
} else {
return Err(napi::Error::from_reason(format!("Invalid mode: {}", mode)));
};
let res = op.execute().await.default_error()?;
Ok(res.into())
}
#[napi(catch_unwind)]
pub async fn count_rows(&self, filter: Option<String>) -> napi::Result<i64> {
self.inner_ref()?
.count_rows(filter)
.await
.map(|val| val as i64)
.default_error()
}
#[napi(catch_unwind)]
pub async fn delete(&self, predicate: String) -> napi::Result<DeleteResult> {
let res = self.inner_ref()?.delete(&predicate).await.default_error()?;
Ok(res.into())
}
#[napi(catch_unwind)]
pub async fn create_index(
&self,
index: Option<&Index>,
column: String,
replace: Option<bool>,
wait_timeout_s: Option<i64>,
name: Option<String>,
train: Option<bool>,
) -> napi::Result<()> {
let lancedb_index = if let Some(index) = index {
index.consume()?
} else {
lancedb::index::Index::Auto
};
let mut builder = self.inner_ref()?.create_index(&[column], lancedb_index);
if let Some(replace) = replace {
builder = builder.replace(replace);
}
if let Some(timeout) = wait_timeout_s {
builder =
builder.wait_timeout(std::time::Duration::from_secs(timeout.try_into().unwrap()));
}
if let Some(name) = name {
builder = builder.name(name);
}
if let Some(train) = train {
builder = builder.train(train);
}
builder.execute().await.default_error()
}
#[napi(catch_unwind)]
pub async fn drop_index(&self, index_name: String) -> napi::Result<()> {
self.inner_ref()?
.drop_index(&index_name)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn prewarm_index(&self, index_name: String) -> napi::Result<()> {
self.inner_ref()?
.prewarm_index(&index_name)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn wait_for_index(&self, index_names: Vec<String>, timeout_s: i64) -> Result<()> {
let timeout = std::time::Duration::from_secs(timeout_s.try_into().unwrap());
let index_names: Vec<&str> = index_names.iter().map(|s| s.as_str()).collect();
let slice: &[&str] = &index_names;
self.inner_ref()?
.wait_for_index(slice, timeout)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn stats(&self) -> Result<TableStatistics> {
let stats = self.inner_ref()?.stats().await.default_error()?;
Ok(stats.into())
}
#[napi(catch_unwind)]
pub async fn update(
&self,
only_if: Option<String>,
columns: Vec<(String, String)>,
) -> napi::Result<UpdateResult> {
let mut op = self.inner_ref()?.update();
if let Some(only_if) = only_if {
op = op.only_if(only_if);
}
for (column_name, value) in columns {
op = op.column(column_name, value);
}
let res = op.execute().await.default_error()?;
Ok(res.into())
}
#[napi(catch_unwind)]
pub fn query(&self) -> napi::Result<Query> {
Ok(Query::new(self.inner_ref()?.query()))
}
#[napi(catch_unwind)]
pub fn take_offsets(&self, offsets: Vec<i64>) -> napi::Result<TakeQuery> {
Ok(TakeQuery::new(
self.inner_ref()?.take_offsets(
offsets
.into_iter()
.map(|o| {
u64::try_from(o).map_err(|e| {
napi::Error::from_reason(format!(
"Failed to convert offset to u64: {}",
e
))
})
})
.collect::<Result<Vec<_>>>()?,
),
))
}
#[napi(catch_unwind)]
pub fn take_row_ids(&self, row_ids: Vec<i64>) -> napi::Result<TakeQuery> {
Ok(TakeQuery::new(
self.inner_ref()?.take_row_ids(
row_ids
.into_iter()
.map(|o| {
u64::try_from(o).map_err(|e| {
napi::Error::from_reason(format!(
"Failed to convert row id to u64: {}",
e
))
})
})
.collect::<Result<Vec<_>>>()?,
),
))
}
#[napi(catch_unwind)]
pub fn vector_search(&self, vector: Float32Array) -> napi::Result<VectorQuery> {
self.query()?.nearest_to(vector)
}
#[napi(catch_unwind)]
pub async fn add_columns(
&self,
transforms: Vec<AddColumnsSql>,
) -> napi::Result<AddColumnsResult> {
let transforms = transforms
.into_iter()
.map(|sql| (sql.name, sql.value_sql))
.collect::<Vec<_>>();
let transforms = NewColumnTransform::SqlExpressions(transforms);
let res = self
.inner_ref()?
.add_columns(transforms, None)
.await
.default_error()?;
Ok(res.into())
}
#[napi(catch_unwind)]
pub async fn alter_columns(
&self,
alterations: Vec<ColumnAlteration>,
) -> napi::Result<AlterColumnsResult> {
for alteration in &alterations {
if alteration.rename.is_none()
&& alteration.nullable.is_none()
&& alteration.data_type.is_none()
{
return Err(napi::Error::from_reason(
"Alteration must have a 'rename', 'dataType', or 'nullable' field.",
));
}
}
let alterations = alterations
.into_iter()
.map(LanceColumnAlteration::try_from)
.collect::<std::result::Result<Vec<_>, String>>()
.map_err(napi::Error::from_reason)?;
let res = self
.inner_ref()?
.alter_columns(&alterations)
.await
.default_error()?;
Ok(res.into())
}
#[napi(catch_unwind)]
pub async fn drop_columns(&self, columns: Vec<String>) -> napi::Result<DropColumnsResult> {
let col_refs = columns.iter().map(String::as_str).collect::<Vec<_>>();
let res = self
.inner_ref()?
.drop_columns(&col_refs)
.await
.default_error()?;
Ok(res.into())
}
#[napi(catch_unwind)]
pub async fn version(&self) -> napi::Result<i64> {
self.inner_ref()?
.version()
.await
.map(|val| val as i64)
.default_error()
}
#[napi(catch_unwind)]
pub async fn checkout(&self, version: i64) -> napi::Result<()> {
self.inner_ref()?
.checkout(version as u64)
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn checkout_tag(&self, tag: String) -> napi::Result<()> {
self.inner_ref()?
.checkout_tag(tag.as_str())
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn checkout_latest(&self) -> napi::Result<()> {
self.inner_ref()?.checkout_latest().await.default_error()
}
#[napi(catch_unwind)]
pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
self.inner_ref()?
.list_versions()
.await
.map(|versions| {
versions
.iter()
.map(|version| Version {
version: version.version as i64,
timestamp: version.timestamp.timestamp_micros(),
metadata: version
.metadata
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect(),
})
.collect()
})
.default_error()
}
#[napi(catch_unwind)]
pub async fn restore(&self) -> napi::Result<()> {
self.inner_ref()?.restore().await.default_error()
}
#[napi(catch_unwind)]
pub async fn tags(&self) -> napi::Result<Tags> {
Ok(Tags {
inner: self.inner_ref()?.clone(),
})
}
#[napi(catch_unwind)]
pub async fn optimize(
&self,
older_than_ms: Option<i64>,
delete_unverified: Option<bool>,
) -> napi::Result<OptimizeStats> {
let inner = self.inner_ref()?;
let older_than = if let Some(ms) = older_than_ms {
if ms == i64::MIN {
return Err(napi::Error::from_reason(format!(
"older_than_ms can not be {}",
i32::MIN,
)));
}
Duration::try_milliseconds(ms)
} else {
None
};
let compaction_stats = inner
.optimize(OptimizeAction::Compact {
options: lancedb::table::CompactionOptions::default(),
remap_options: None,
})
.await
.default_error()?
.compaction
.unwrap();
let prune_stats = inner
.optimize(OptimizeAction::Prune {
older_than,
delete_unverified,
error_if_tagged_old_versions: None,
})
.await
.default_error()?
.prune
.unwrap();
inner
.optimize(lancedb::table::OptimizeAction::Index(
OptimizeOptions::default(),
))
.await
.default_error()?;
Ok(OptimizeStats {
compaction: CompactionStats {
files_added: compaction_stats.files_added as i64,
files_removed: compaction_stats.files_removed as i64,
fragments_added: compaction_stats.fragments_added as i64,
fragments_removed: compaction_stats.fragments_removed as i64,
},
prune: RemovalStats {
bytes_removed: prune_stats.bytes_removed as i64,
old_versions_removed: prune_stats.old_versions as i64,
},
})
}
#[napi(catch_unwind)]
pub async fn list_indices(&self) -> napi::Result<Vec<IndexConfig>> {
Ok(self
.inner_ref()?
.list_indices()
.await
.default_error()?
.into_iter()
.map(IndexConfig::from)
.collect::<Vec<_>>())
}
#[napi(catch_unwind)]
pub async fn index_stats(&self, index_name: String) -> napi::Result<Option<IndexStatistics>> {
let tbl = self.inner_ref()?;
let stats = tbl.index_stats(&index_name).await.default_error()?;
Ok(stats.map(IndexStatistics::from))
}
#[napi(catch_unwind)]
pub fn merge_insert(&self, on: Vec<String>) -> napi::Result<NativeMergeInsertBuilder> {
let on: Vec<_> = on.iter().map(String::as_str).collect();
Ok(self.inner_ref()?.merge_insert(on.as_slice()).into())
}
#[napi(catch_unwind)]
pub async fn uses_v2_manifest_paths(&self) -> napi::Result<bool> {
self.inner_ref()?
.as_native()
.ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))?
.uses_v2_manifest_paths()
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn migrate_manifest_paths_v2(&self) -> napi::Result<()> {
self.inner_ref()?
.as_native()
.ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))?
.migrate_manifest_paths_v2()
.await
.default_error()
}
}
#[napi(object)]
/// A description of an index currently configured on a column
pub struct IndexConfig {
/// The name of the index
pub name: String,
/// The type of the index
pub index_type: String,
/// The columns in the index
///
/// Currently this is always an array of size 1. In the future there may
/// be more columns to represent composite indices.
pub columns: Vec<String>,
}
impl From<lancedb::index::IndexConfig> for IndexConfig {
fn from(value: lancedb::index::IndexConfig) -> Self {
let index_type = format!("{:?}", value.index_type);
Self {
index_type,
columns: value.columns,
name: value.name,
}
}
}
/// Statistics about a compaction operation.
#[napi(object)]
#[derive(Clone, Debug)]
pub struct CompactionStats {
/// The number of fragments removed
pub fragments_removed: i64,
/// The number of new, compacted fragments added
pub fragments_added: i64,
/// The number of data files removed
pub files_removed: i64,
/// The number of new, compacted data files added
pub files_added: i64,
}
/// Statistics about a cleanup operation
#[napi(object)]
#[derive(Clone, Debug)]
pub struct RemovalStats {
/// The number of bytes removed
pub bytes_removed: i64,
/// The number of old versions removed
pub old_versions_removed: i64,
}
/// Statistics about an optimize operation
#[napi(object)]
#[derive(Clone, Debug)]
pub struct OptimizeStats {
/// Statistics about the compaction operation
pub compaction: CompactionStats,
/// Statistics about the removal operation
pub prune: RemovalStats,
}
/// A definition of a column alteration. The alteration changes the column at
/// `path` to have the new name `name`, to be nullable if `nullable` is true,
/// and to have the data type `data_type`. At least one of `rename` or `nullable`
/// must be provided.
#[napi(object)]
pub struct ColumnAlteration {
/// The path to the column to alter. This is a dot-separated path to the column.
/// If it is a top-level column then it is just the name of the column. If it is
/// a nested column then it is the path to the column, e.g. "a.b.c" for a column
/// `c` nested inside a column `b` nested inside a column `a`.
pub path: String,
/// The new name of the column. If not provided then the name will not be changed.
/// This must be distinct from the names of all other columns in the table.
pub rename: Option<String>,
/// A new data type for the column. If not provided then the data type will not be changed.
/// Changing data types is limited to casting to the same general type. For example, these
/// changes are valid:
/// * `int32` -> `int64` (integers)
/// * `double` -> `float` (floats)
/// * `string` -> `large_string` (strings)
/// But these changes are not:
/// * `int32` -> `double` (mix integers and floats)
/// * `string` -> `int32` (mix strings and integers)
pub data_type: Option<String>,
/// Set the new nullability. Note that a nullable column cannot be made non-nullable.
pub nullable: Option<bool>,
}
impl TryFrom<ColumnAlteration> for LanceColumnAlteration {
type Error = String;
fn try_from(js: ColumnAlteration) -> std::result::Result<Self, Self::Error> {
let ColumnAlteration {
path,
rename,
nullable,
data_type,
} = js;
let data_type = if let Some(data_type) = data_type {
Some(
lancedb::utils::string_to_datatype(&data_type)
.ok_or_else(|| format!("Invalid data type: {}", data_type))?,
)
} else {
None
};
Ok(Self {
path,
rename,
nullable,
data_type,
})
}
}
/// A definition of a new column to add to a table.
#[napi(object)]
pub struct AddColumnsSql {
/// The name of the new column.
pub name: String,
/// The values to populate the new column with, as a SQL expression.
/// The expression can reference other columns in the table.
pub value_sql: String,
}
#[napi(object)]
pub struct IndexStatistics {
/// The number of rows indexed by the index
pub num_indexed_rows: f64,
/// The number of rows not indexed
pub num_unindexed_rows: f64,
/// The type of the index
pub index_type: String,
/// The type of the distance function used by the index. This is only
/// present for vector indices. Scalar and full text search indices do
/// not have a distance function.
pub distance_type: Option<String>,
/// The number of parts this index is split into.
pub num_indices: Option<u32>,
/// The KMeans loss value of the index,
/// it is only present for vector indices.
pub loss: Option<f64>,
}
impl From<lancedb::index::IndexStatistics> for IndexStatistics {
fn from(value: lancedb::index::IndexStatistics) -> Self {
Self {
num_indexed_rows: value.num_indexed_rows as f64,
num_unindexed_rows: value.num_unindexed_rows as f64,
index_type: value.index_type.to_string(),
distance_type: value.distance_type.map(|d| d.to_string()),
num_indices: value.num_indices,
loss: value.loss,
}
}
}
#[napi(object)]
pub struct TableStatistics {
/// The total number of bytes in the table
pub total_bytes: i64,
/// The number of rows in the table
pub num_rows: i64,
/// The number of indices in the table
pub num_indices: i64,
/// Statistics on table fragments
pub fragment_stats: FragmentStatistics,
}
#[napi(object)]
pub struct FragmentStatistics {
/// The number of fragments in the table
pub num_fragments: i64,
/// The number of uncompacted fragments in the table
pub num_small_fragments: i64,
/// Statistics on the number of rows in the table fragments
pub lengths: FragmentSummaryStats,
}
#[napi(object)]
pub struct FragmentSummaryStats {
/// The number of rows in the fragment with the fewest rows
pub min: i64,
/// The number of rows in the fragment with the most rows
pub max: i64,
/// The mean number of rows in the fragments
pub mean: i64,
/// The 25th percentile of number of rows in the fragments
pub p25: i64,
/// The 50th percentile of number of rows in the fragments
pub p50: i64,
/// The 75th percentile of number of rows in the fragments
pub p75: i64,
/// The 99th percentile of number of rows in the fragments
pub p99: i64,
}
impl From<lancedb::table::TableStatistics> for TableStatistics {
fn from(v: lancedb::table::TableStatistics) -> Self {
Self {
total_bytes: v.total_bytes as i64,
num_rows: v.num_rows as i64,
num_indices: v.num_indices as i64,
fragment_stats: FragmentStatistics {
num_fragments: v.fragment_stats.num_fragments as i64,
num_small_fragments: v.fragment_stats.num_small_fragments as i64,
lengths: FragmentSummaryStats {
min: v.fragment_stats.lengths.min as i64,
max: v.fragment_stats.lengths.max as i64,
mean: v.fragment_stats.lengths.mean as i64,
p25: v.fragment_stats.lengths.p25 as i64,
p50: v.fragment_stats.lengths.p50 as i64,
p75: v.fragment_stats.lengths.p75 as i64,
p99: v.fragment_stats.lengths.p99 as i64,
},
},
}
}
}
#[napi(object)]
pub struct Version {
pub version: i64,
pub timestamp: i64,
pub metadata: HashMap<String, String>,
}
#[napi(object)]
pub struct UpdateResult {
pub rows_updated: i64,
pub version: i64,
}
impl From<lancedb::table::UpdateResult> for UpdateResult {
fn from(value: lancedb::table::UpdateResult) -> Self {
Self {
rows_updated: value.rows_updated as i64,
version: value.version as i64,
}
}
}
#[napi(object)]
pub struct AddResult {
pub version: i64,
}
impl From<lancedb::table::AddResult> for AddResult {
fn from(value: lancedb::table::AddResult) -> Self {
Self {
version: value.version as i64,
}
}
}
#[napi(object)]
pub struct DeleteResult {
pub version: i64,
}
impl From<lancedb::table::DeleteResult> for DeleteResult {
fn from(value: lancedb::table::DeleteResult) -> Self {
Self {
version: value.version as i64,
}
}
}
#[napi(object)]
pub struct MergeResult {
pub version: i64,
pub num_inserted_rows: i64,
pub num_updated_rows: i64,
pub num_deleted_rows: i64,
pub num_attempts: i64,
}
impl From<lancedb::table::MergeResult> for MergeResult {
fn from(value: lancedb::table::MergeResult) -> Self {
Self {
version: value.version as i64,
num_inserted_rows: value.num_inserted_rows as i64,
num_updated_rows: value.num_updated_rows as i64,
num_deleted_rows: value.num_deleted_rows as i64,
num_attempts: value.num_attempts as i64,
}
}
}
#[napi(object)]
pub struct AddColumnsResult {
pub version: i64,
}
impl From<lancedb::table::AddColumnsResult> for AddColumnsResult {
fn from(value: lancedb::table::AddColumnsResult) -> Self {
Self {
version: value.version as i64,
}
}
}
#[napi(object)]
pub struct AlterColumnsResult {
pub version: i64,
}
impl From<lancedb::table::AlterColumnsResult> for AlterColumnsResult {
fn from(value: lancedb::table::AlterColumnsResult) -> Self {
Self {
version: value.version as i64,
}
}
}
#[napi(object)]
pub struct DropColumnsResult {
pub version: i64,
}
impl From<lancedb::table::DropColumnsResult> for DropColumnsResult {
fn from(value: lancedb::table::DropColumnsResult) -> Self {
Self {
version: value.version as i64,
}
}
}
#[napi]
pub struct TagContents {
pub version: i64,
pub manifest_size: i64,
}
#[napi]
pub struct Tags {
inner: LanceDbTable,
}
#[napi]
impl Tags {
#[napi]
pub async fn list(&self) -> napi::Result<HashMap<String, TagContents>> {
let rust_tags = self.inner.tags().await.default_error()?;
let tag_list = rust_tags.as_ref().list().await.default_error()?;
let tag_contents = tag_list
.into_iter()
.map(|(k, v)| {
(
k,
TagContents {
version: v.version as i64,
manifest_size: v.manifest_size as i64,
},
)
})
.collect();
Ok(tag_contents)
}
#[napi]
pub async fn get_version(&self, tag: String) -> napi::Result<i64> {
let rust_tags = self.inner.tags().await.default_error()?;
rust_tags
.as_ref()
.get_version(tag.as_str())
.await
.map(|v| v as i64)
.default_error()
}
#[napi]
pub async unsafe fn create(&mut self, tag: String, version: i64) -> napi::Result<()> {
let mut rust_tags = self.inner.tags().await.default_error()?;
rust_tags
.as_mut()
.create(tag.as_str(), version as u64)
.await
.default_error()
}
#[napi]
pub async unsafe fn delete(&mut self, tag: String) -> napi::Result<()> {
let mut rust_tags = self.inner.tags().await.default_error()?;
rust_tags
.as_mut()
.delete(tag.as_str())
.await
.default_error()
}
#[napi]
pub async unsafe fn update(&mut self, tag: String, version: i64) -> napi::Result<()> {
let mut rust_tags = self.inner.tags().await.default_error()?;
rust_tags
.as_mut()
.update(tag.as_str(), version as u64)
.await
.default_error()
}
}