Compare commits

..

15 Commits

Author SHA1 Message Date
Lei Xu
ba3ad35b87 [maven-release-plugin] prepare for next development iteration 2024-05-29 14:55:11 -07:00
Lei Xu
28a0fea1d0 [maven-release-plugin] prepare release lancedb-parent-0.0.2 2024-05-29 14:55:08 -07:00
Rong Rong
b0e6c20be2 also add javadoc and source plugin 2024-05-28 21:13:34 -07:00
Rong Rong
d9965476c5 prepare for Java release 2024-05-28 20:57:20 -07:00
Cory Grinstead
2f4b70ecfe chore: clippy warnings inside java bindings (#1330)
this was causing unrelated PR's to fail.
https://github.com/lancedb/lancedb/actions/runs/9274579178/job/25517248069?pr=1308
2024-05-28 14:05:07 -05:00
Philip Meier
1ad1c0820d chore: replace semver dependency with packaging (#1311)
Fixes #1296 per title. See
https://github.com/lancedb/lancedb/pull/1298#discussion_r1603931457 Cc
@wjones127

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2024-05-28 10:05:16 -07:00
LuQQiu
db712b0f99 feat(java): add table names java api (#1279)
Add lancedb-jni and table names API

---------

Co-authored-by: Lei Xu <eddyxu@gmail.com>
2024-05-24 11:49:11 -07:00
BubbleCal
fd1a5ce788 feat: support IVF_HNSW_PQ (#1314)
this also simplifies the code of creating index with macro

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2024-05-24 18:32:00 +08:00
QianZhu
def087fc85 fix: parse index_stats for scalar index (#1319)
parse the index stats for scalar index - it is different from the index
stats for vector index
2024-05-23 13:10:46 -07:00
Lance Release
43f920182a Bump version: 0.8.0-beta.0 → 0.8.0 2024-05-23 17:32:36 +00:00
Lance Release
718963d1fb Bump version: 0.7.0 → 0.8.0-beta.0 2024-05-23 17:32:36 +00:00
Weston Pace
e4dac751e7 chore: remove working-directory from pypi upload step (#1322)
The wheels are built to `WORKDIR/target/wheels` and the step was
configured to look for them at `WORKDIR/python/target/wheels`.
2024-05-23 10:31:32 -07:00
Lance Release
aae02953eb Updating package-lock.json 2024-05-23 16:30:46 +00:00
Lance Release
1d9f76bdda Bump version: 0.5.0-beta.0 → 0.5.0 2024-05-23 16:30:27 +00:00
Lance Release
affdfc4d48 Bump version: 0.4.20 → 0.5.0-beta.0 2024-05-23 16:30:26 +00:00
59 changed files with 1853 additions and 208 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.4.20"
current_version = "0.5.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

85
.github/workflows/java.yml vendored Normal file
View File

@@ -0,0 +1,85 @@
name: Build and Run Java JNI Tests
on:
push:
branches:
- main
pull_request:
paths:
- java/**
- rust/**
- .github/workflows/java.yml
env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
# key, so we set it to make sure it is always consistent.
CARGO_TERM_COLOR: always
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
RUSTFLAGS: "-C debuginfo=1"
RUST_BACKTRACE: "1"
# according to: https://matklad.github.io/2021/09/04/fast-rust-builds.html
# CI builds are faster with incremental disabled.
CARGO_INCREMENTAL: "0"
CARGO_BUILD_JOBS: "1"
jobs:
linux-build:
runs-on: ubuntu-22.04
name: ubuntu-22.04 + Java 11 & 17
defaults:
run:
working-directory: ./java
steps:
- name: Checkout repository
uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
with:
workspaces: java/core/lancedb-jni
- name: Run cargo fmt
run: cargo fmt --check
working-directory: ./java/core/lancedb-jni
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Install Java 17
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 17
cache: "maven"
- run: echo "JAVA_17=$JAVA_HOME" >> $GITHUB_ENV
- name: Install Java 11
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 11
cache: "maven"
- name: Java Style Check
run: mvn checkstyle:check
# Disable because of issues in lancedb rust core code
# - name: Rust Clippy
# working-directory: java/core/lancedb-jni
# run: cargo clippy --all-targets -- -D warnings
- name: Running tests with Java 11
run: mvn clean test
- name: Running tests with Java 17
run: |
export JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS \
-XX:+IgnoreUnrecognizedVMOptions \
--add-opens=java.base/java.lang=ALL-UNNAMED \
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED \
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED \
--add-opens=java.base/java.io=ALL-UNNAMED \
--add-opens=java.base/java.net=ALL-UNNAMED \
--add-opens=java.base/java.nio=ALL-UNNAMED \
--add-opens=java.base/java.util=ALL-UNNAMED \
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED \
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED \
--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED \
--add-opens=java.base/sun.security.action=ALL-UNNAMED \
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED \
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED \
-Djdk.reflect.useDirectMethodHandle=false \
-Dio.netty.tryReflectionSetAccessible=true"
JAVA_HOME=$JAVA_17 mvn clean test

View File

@@ -27,7 +27,6 @@ runs:
echo "repo=pypi" >> $GITHUB_OUTPUT
fi
- name: Publish to PyPI
working-directory: python
shell: bash
env:
FURY_TOKEN: ${{ inputs.fury_token }}

View File

@@ -1,5 +1,5 @@
[workspace]
members = ["rust/ffi/node", "rust/lancedb", "nodejs", "python"]
members = ["rust/ffi/node", "rust/lancedb", "nodejs", "python", "java/core/lancedb-jni"]
# Python package needs to be built by maturin.
exclude = ["python"]
resolver = "2"

View File

@@ -0,0 +1,27 @@
[package]
name = "lancedb-jni"
description = "JNI bindings for LanceDB"
# TODO modify lancedb/Cargo.toml for version and dependencies
version = "0.4.18"
edition.workspace = true
repository.workspace = true
readme.workspace = true
license.workspace = true
keywords.workspace = true
categories.workspace = true
publish = false
[lib]
crate-type = ["cdylib"]
[dependencies]
lancedb = { path = "../../../rust/lancedb" }
lance = { workspace = true }
arrow = { workspace = true, features = ["ffi"] }
arrow-schema.workspace = true
tokio = "1.23"
jni = "0.21.1"
snafu.workspace = true
lazy_static.workspace = true
serde = { version = "^1" }
serde_json = { version = "1" }

View File

@@ -0,0 +1,130 @@
use crate::ffi::JNIEnvExt;
use crate::traits::IntoJava;
use crate::{Error, RT};
use jni::objects::{JObject, JString, JValue};
use jni::JNIEnv;
pub const NATIVE_CONNECTION: &str = "nativeConnectionHandle";
use crate::Result;
use lancedb::connection::{connect, Connection};
#[derive(Clone)]
pub struct BlockingConnection {
pub(crate) inner: Connection,
}
impl BlockingConnection {
pub fn create(dataset_uri: &str) -> Result<Self> {
let inner = RT.block_on(connect(dataset_uri).execute())?;
Ok(Self { inner })
}
pub fn table_names(
&self,
start_after: Option<String>,
limit: Option<i32>,
) -> Result<Vec<String>> {
let mut op = self.inner.table_names();
if let Some(start_after) = start_after {
op = op.start_after(start_after);
}
if let Some(limit) = limit {
op = op.limit(limit as u32);
}
Ok(RT.block_on(op.execute())?)
}
}
impl IntoJava for BlockingConnection {
fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> JObject<'a> {
attach_native_connection(env, self)
}
}
fn attach_native_connection<'local>(
env: &mut JNIEnv<'local>,
connection: BlockingConnection,
) -> JObject<'local> {
let j_connection = create_java_connection_object(env);
// This block sets a native Rust object (Connection) as a field in the Java object (j_Connection).
// Caution: This creates a potential for memory leaks. The Rust object (Connection) is not
// automatically garbage-collected by Java, and its memory will not be freed unless
// explicitly handled.
//
// To prevent memory leaks, ensure the following:
// 1. The Java object (`j_Connection`) should implement the `java.io.Closeable` interface.
// 2. Users of this Java object should be instructed to always use it within a try-with-resources
// statement (or manually call the `close()` method) to ensure that `self.close()` is invoked.
match unsafe { env.set_rust_field(&j_connection, NATIVE_CONNECTION, connection) } {
Ok(_) => j_connection,
Err(err) => {
env.throw_new(
"java/lang/RuntimeException",
format!("Failed to set native handle for Connection: {}", err),
)
.expect("Error throwing exception");
JObject::null()
}
}
}
fn create_java_connection_object<'a>(env: &mut JNIEnv<'a>) -> JObject<'a> {
env.new_object("com/lancedb/lancedb/Connection", "()V", &[])
.expect("Failed to create Java Lance Connection instance")
}
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lancedb_Connection_releaseNativeConnection(
mut env: JNIEnv,
j_connection: JObject,
) {
let _: BlockingConnection = unsafe {
env.take_rust_field(j_connection, NATIVE_CONNECTION)
.expect("Failed to take native Connection handle")
};
}
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lancedb_Connection_connect<'local>(
mut env: JNIEnv<'local>,
_obj: JObject,
dataset_uri_object: JString,
) -> JObject<'local> {
let dataset_uri: String = ok_or_throw!(env, env.get_string(&dataset_uri_object)).into();
let blocking_connection = ok_or_throw!(env, BlockingConnection::create(&dataset_uri));
blocking_connection.into_java(&mut env)
}
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lancedb_Connection_tableNames<'local>(
mut env: JNIEnv<'local>,
j_connection: JObject,
start_after_obj: JObject, // Optional<String>
limit_obj: JObject, // Optional<Integer>
) -> JObject<'local> {
ok_or_throw!(
env,
inner_table_names(&mut env, j_connection, start_after_obj, limit_obj)
)
}
fn inner_table_names<'local>(
env: &mut JNIEnv<'local>,
j_connection: JObject,
start_after_obj: JObject, // Optional<String>
limit_obj: JObject, // Optional<Integer>
) -> Result<JObject<'local>> {
let start_after = env.get_string_opt(&start_after_obj)?;
let limit = env.get_int_opt(&limit_obj)?;
let conn =
unsafe { env.get_rust_field::<_, _, BlockingConnection>(j_connection, NATIVE_CONNECTION) }?;
let table_names = conn.table_names(start_after, limit)?;
drop(conn);
let j_names = env.new_object("java/util/ArrayList", "()V", &[])?;
for item in table_names {
let jstr_item = env.new_string(item)?;
let item_jobj = JObject::from(jstr_item);
let item_gen = JValue::Object(&item_jobj);
env.call_method(&j_names, "add", "(Ljava/lang/Object;)Z", &[item_gen])?;
}
Ok(j_names)
}

View File

@@ -0,0 +1,225 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::str::Utf8Error;
use arrow_schema::ArrowError;
use jni::errors::Error as JniError;
use serde_json::Error as JsonError;
use snafu::{Location, Snafu};
type BoxedError = Box<dyn std::error::Error + Send + Sync + 'static>;
/// Java Exception types
pub enum JavaException {
IllegalArgumentException,
IOException,
RuntimeException,
}
impl JavaException {
pub fn as_str(&self) -> &str {
match self {
Self::IllegalArgumentException => "java/lang/IllegalArgumentException",
Self::IOException => "java/io/IOException",
Self::RuntimeException => "java/lang/RuntimeException",
}
}
}
/// TODO(lu) change to lancedb-jni
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum Error {
#[snafu(display("JNI error: {message}, {location}"))]
Jni { message: String, location: Location },
#[snafu(display("Invalid argument: {message}, {location}"))]
InvalidArgument { message: String, location: Location },
#[snafu(display("IO error: {source}, {location}"))]
IO {
source: BoxedError,
location: Location,
},
#[snafu(display("Arrow error: {message}, {location}"))]
Arrow { message: String, location: Location },
#[snafu(display("Index error: {message}, {location}"))]
Index { message: String, location: Location },
#[snafu(display("JSON error: {message}, {location}"))]
JSON { message: String, location: Location },
#[snafu(display("Dataset at path {path} was not found, {location}"))]
DatasetNotFound { path: String, location: Location },
#[snafu(display("Dataset already exists: {uri}, {location}"))]
DatasetAlreadyExists { uri: String, location: Location },
#[snafu(display("Table '{name}' already exists"))]
TableAlreadyExists { name: String },
#[snafu(display("Table '{name}' was not found"))]
TableNotFound { name: String },
#[snafu(display("Invalid table name '{name}': {reason}"))]
InvalidTableName { name: String, reason: String },
#[snafu(display("Embedding function '{name}' was not found: {reason}, {location}"))]
EmbeddingFunctionNotFound {
name: String,
reason: String,
location: Location,
},
#[snafu(display("Other Lance error: {message}, {location}"))]
OtherLance { message: String, location: Location },
#[snafu(display("Other LanceDB error: {message}, {location}"))]
OtherLanceDB { message: String, location: Location },
}
impl Error {
/// Throw as Java Exception
pub fn throw(&self, env: &mut jni::JNIEnv) {
match self {
Self::InvalidArgument { .. }
| Self::DatasetNotFound { .. }
| Self::DatasetAlreadyExists { .. }
| Self::TableAlreadyExists { .. }
| Self::TableNotFound { .. }
| Self::InvalidTableName { .. }
| Self::EmbeddingFunctionNotFound { .. } => {
self.throw_as(env, JavaException::IllegalArgumentException)
}
Self::IO { .. } | Self::Index { .. } => self.throw_as(env, JavaException::IOException),
Self::Arrow { .. }
| Self::JSON { .. }
| Self::OtherLance { .. }
| Self::OtherLanceDB { .. }
| Self::Jni { .. } => self.throw_as(env, JavaException::RuntimeException),
}
}
/// Throw as an concrete Java Exception
pub fn throw_as(&self, env: &mut jni::JNIEnv, exception: JavaException) {
let message = &format!(
"Error when throwing Java exception: {}:{}",
exception.as_str(),
self
);
env.throw_new(exception.as_str(), self.to_string())
.expect(message);
}
}
pub type Result<T> = std::result::Result<T, Error>;
trait ToSnafuLocation {
fn to_snafu_location(&'static self) -> snafu::Location;
}
impl ToSnafuLocation for std::panic::Location<'static> {
fn to_snafu_location(&'static self) -> snafu::Location {
snafu::Location::new(self.file(), self.line(), self.column())
}
}
impl From<JniError> for Error {
#[track_caller]
fn from(source: JniError) -> Self {
Self::Jni {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
}
}
}
impl From<Utf8Error> for Error {
#[track_caller]
fn from(source: Utf8Error) -> Self {
Self::InvalidArgument {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
}
}
}
impl From<ArrowError> for Error {
#[track_caller]
fn from(source: ArrowError) -> Self {
Self::Arrow {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
}
}
}
impl From<JsonError> for Error {
#[track_caller]
fn from(source: JsonError) -> Self {
Self::JSON {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
}
}
}
impl From<lance::Error> for Error {
#[track_caller]
fn from(source: lance::Error) -> Self {
match source {
lance::Error::DatasetNotFound {
path,
source: _,
location,
} => Self::DatasetNotFound { path, location },
lance::Error::DatasetAlreadyExists { uri, location } => {
Self::DatasetAlreadyExists { uri, location }
}
lance::Error::IO { source, location } => Self::IO { source, location },
lance::Error::Arrow { message, location } => Self::Arrow { message, location },
lance::Error::Index { message, location } => Self::Index { message, location },
lance::Error::InvalidInput { source, location } => Self::InvalidArgument {
message: source.to_string(),
location,
},
_ => Self::OtherLance {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
},
}
}
}
impl From<lancedb::Error> for Error {
#[track_caller]
fn from(source: lancedb::Error) -> Self {
match source {
lancedb::Error::InvalidTableName { name, reason } => {
Self::InvalidTableName { name, reason }
}
lancedb::Error::InvalidInput { message } => Self::InvalidArgument {
message,
location: std::panic::Location::caller().to_snafu_location(),
},
lancedb::Error::TableNotFound { name } => Self::TableNotFound { name },
lancedb::Error::TableAlreadyExists { name } => Self::TableAlreadyExists { name },
lancedb::Error::EmbeddingFunctionNotFound { name, reason } => {
Self::EmbeddingFunctionNotFound {
name,
reason,
location: std::panic::Location::caller().to_snafu_location(),
}
}
lancedb::Error::Arrow { source } => Self::Arrow {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
},
lancedb::Error::Lance { source } => Self::from(source),
_ => Self::OtherLanceDB {
message: source.to_string(),
location: std::panic::Location::caller().to_snafu_location(),
},
}
}
}

View File

@@ -0,0 +1,204 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use core::slice;
use jni::objects::{JByteBuffer, JObjectArray, JString};
use jni::sys::jobjectArray;
use jni::{objects::JObject, JNIEnv};
use crate::error::{Error, Result};
/// TODO(lu) import from lance-jni without duplicate
/// Extend JNIEnv with helper functions.
pub trait JNIEnvExt {
/// Get integers from Java List<Integer> object.
fn get_integers(&mut self, obj: &JObject) -> Result<Vec<i32>>;
/// Get strings from Java List<String> object.
fn get_strings(&mut self, obj: &JObject) -> Result<Vec<String>>;
/// Get strings from Java String[] object.
/// Note that get Option<Vec<String>> from Java Optional<String[]> just doesn't work.
#[allow(unused)]
fn get_strings_array(&mut self, obj: jobjectArray) -> Result<Vec<String>>;
/// Get Option<String> from Java Optional<String>.
fn get_string_opt(&mut self, obj: &JObject) -> Result<Option<String>>;
/// Get Option<Vec<String>> from Java Optional<List<String>>.
#[allow(unused)]
fn get_strings_opt(&mut self, obj: &JObject) -> Result<Option<Vec<String>>>;
/// Get Option<i32> from Java Optional<Integer>.
fn get_int_opt(&mut self, obj: &JObject) -> Result<Option<i32>>;
/// Get Option<Vec<i32>> from Java Optional<List<Integer>>.
fn get_ints_opt(&mut self, obj: &JObject) -> Result<Option<Vec<i32>>>;
/// Get Option<i64> from Java Optional<Long>.
#[allow(unused)]
fn get_long_opt(&mut self, obj: &JObject) -> Result<Option<i64>>;
/// Get Option<u64> from Java Optional<Long>.
#[allow(unused)]
fn get_u64_opt(&mut self, obj: &JObject) -> Result<Option<u64>>;
/// Get Option<&[u8]> from Java Optional<ByteBuffer>.
#[allow(unused)]
fn get_bytes_opt(&mut self, obj: &JObject) -> Result<Option<&[u8]>>;
fn get_optional<T, F>(&mut self, obj: &JObject, f: F) -> Result<Option<T>>
where
F: FnOnce(&mut JNIEnv, &JObject) -> Result<T>;
}
impl JNIEnvExt for JNIEnv<'_> {
fn get_integers(&mut self, obj: &JObject) -> Result<Vec<i32>> {
let list = self.get_list(obj)?;
let mut iter = list.iter(self)?;
let mut results = Vec::with_capacity(list.size(self)? as usize);
while let Some(elem) = iter.next(self)? {
let int_obj = self.call_method(elem, "intValue", "()I", &[])?;
let int_value = int_obj.i()?;
results.push(int_value);
}
Ok(results)
}
fn get_strings(&mut self, obj: &JObject) -> Result<Vec<String>> {
let list = self.get_list(obj)?;
let mut iter = list.iter(self)?;
let mut results = Vec::with_capacity(list.size(self)? as usize);
while let Some(elem) = iter.next(self)? {
let jstr = JString::from(elem);
let val = self.get_string(&jstr)?;
results.push(val.to_str()?.to_string())
}
Ok(results)
}
fn get_strings_array(&mut self, obj: jobjectArray) -> Result<Vec<String>> {
let jobject_array = unsafe { JObjectArray::from_raw(obj) };
let array_len = self.get_array_length(&jobject_array)?;
let mut res: Vec<String> = Vec::new();
for i in 0..array_len {
let item: JString = self.get_object_array_element(&jobject_array, i)?.into();
res.push(self.get_string(&item)?.into());
}
Ok(res)
}
fn get_string_opt(&mut self, obj: &JObject) -> Result<Option<String>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_string_obj = java_obj_gen.l()?;
let jstr = JString::from(java_string_obj);
let val = env.get_string(&jstr)?;
Ok(val.to_str()?.to_string())
})
}
fn get_strings_opt(&mut self, obj: &JObject) -> Result<Option<Vec<String>>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_list_obj = java_obj_gen.l()?;
env.get_strings(&java_list_obj)
})
}
fn get_int_opt(&mut self, obj: &JObject) -> Result<Option<i32>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_int_obj = java_obj_gen.l()?;
let int_obj = env.call_method(java_int_obj, "intValue", "()I", &[])?;
let int_value = int_obj.i()?;
Ok(int_value)
})
}
fn get_ints_opt(&mut self, obj: &JObject) -> Result<Option<Vec<i32>>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_list_obj = java_obj_gen.l()?;
env.get_integers(&java_list_obj)
})
}
fn get_long_opt(&mut self, obj: &JObject) -> Result<Option<i64>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_long_obj = java_obj_gen.l()?;
let long_obj = env.call_method(java_long_obj, "longValue", "()J", &[])?;
let long_value = long_obj.j()?;
Ok(long_value)
})
}
fn get_u64_opt(&mut self, obj: &JObject) -> Result<Option<u64>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_long_obj = java_obj_gen.l()?;
let long_obj = env.call_method(java_long_obj, "longValue", "()J", &[])?;
let long_value = long_obj.j()?;
Ok(long_value as u64)
})
}
fn get_bytes_opt(&mut self, obj: &JObject) -> Result<Option<&[u8]>> {
self.get_optional(obj, |env, inner_obj| {
let java_obj_gen = env.call_method(inner_obj, "get", "()Ljava/lang/Object;", &[])?;
let java_byte_buffer_obj = java_obj_gen.l()?;
let j_byte_buffer = JByteBuffer::from(java_byte_buffer_obj);
let raw_data = env.get_direct_buffer_address(&j_byte_buffer)?;
let capacity = env.get_direct_buffer_capacity(&j_byte_buffer)?;
let data = unsafe { slice::from_raw_parts(raw_data, capacity) };
Ok(data)
})
}
fn get_optional<T, F>(&mut self, obj: &JObject, f: F) -> Result<Option<T>>
where
F: FnOnce(&mut JNIEnv, &JObject) -> Result<T>,
{
if obj.is_null() {
return Ok(None);
}
let is_empty = self.call_method(obj, "isEmpty", "()Z", &[])?;
if is_empty.z()? {
// TODO(lu): put get java object into here cuz can only get java Object
Ok(None)
} else {
f(self, obj).map(Some)
}
}
}
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lance_test_JniTestHelper_parseInts(
mut env: JNIEnv,
_obj: JObject,
list_obj: JObject, // List<Integer>
) {
ok_or_throw_without_return!(env, env.get_integers(&list_obj));
}
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lance_test_JniTestHelper_parseIntsOpt(
mut env: JNIEnv,
_obj: JObject,
list_obj: JObject, // Optional<List<Integer>>
) {
ok_or_throw_without_return!(env, env.get_ints_opt(&list_obj));
}

View File

@@ -0,0 +1,68 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use lazy_static::lazy_static;
// TODO import from lance-jni without duplicate
#[macro_export]
macro_rules! ok_or_throw {
($env:expr, $result:expr) => {
match $result {
Ok(value) => value,
Err(err) => {
Error::from(err).throw(&mut $env);
return JObject::null();
}
}
};
}
macro_rules! ok_or_throw_without_return {
($env:expr, $result:expr) => {
match $result {
Ok(value) => value,
Err(err) => {
Error::from(err).throw(&mut $env);
return;
}
}
};
}
#[macro_export]
macro_rules! ok_or_throw_with_return {
($env:expr, $result:expr, $ret:expr) => {
match $result {
Ok(value) => value,
Err(err) => {
Error::from(err).throw(&mut $env);
return $ret;
}
}
};
}
mod connection;
pub mod error;
mod ffi;
mod traits;
pub use error::{Error, Result};
lazy_static! {
static ref RT: tokio::runtime::Runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.expect("Failed to create tokio runtime");
}

View File

@@ -0,0 +1,122 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use jni::objects::{JMap, JObject, JString, JValue};
use jni::JNIEnv;
use crate::Result;
pub trait FromJObject<T> {
fn extract(&self) -> Result<T>;
}
/// Convert a Rust type into a Java Object.
pub trait IntoJava {
fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> JObject<'a>;
}
impl FromJObject<i32> for JObject<'_> {
fn extract(&self) -> Result<i32> {
Ok(JValue::from(self).i()?)
}
}
impl FromJObject<i64> for JObject<'_> {
fn extract(&self) -> Result<i64> {
Ok(JValue::from(self).j()?)
}
}
impl FromJObject<f32> for JObject<'_> {
fn extract(&self) -> Result<f32> {
Ok(JValue::from(self).f()?)
}
}
impl FromJObject<f64> for JObject<'_> {
fn extract(&self) -> Result<f64> {
Ok(JValue::from(self).d()?)
}
}
pub trait FromJString {
fn extract(&self, env: &mut JNIEnv) -> Result<String>;
}
impl FromJString for JString<'_> {
fn extract(&self, env: &mut JNIEnv) -> Result<String> {
Ok(env.get_string(self)?.into())
}
}
pub trait JMapExt {
#[allow(dead_code)]
fn get_string(&self, env: &mut JNIEnv, key: &str) -> Result<Option<String>>;
#[allow(dead_code)]
fn get_i32(&self, env: &mut JNIEnv, key: &str) -> Result<Option<i32>>;
#[allow(dead_code)]
fn get_i64(&self, env: &mut JNIEnv, key: &str) -> Result<Option<i64>>;
#[allow(dead_code)]
fn get_f32(&self, env: &mut JNIEnv, key: &str) -> Result<Option<f32>>;
#[allow(dead_code)]
fn get_f64(&self, env: &mut JNIEnv, key: &str) -> Result<Option<f64>>;
}
fn get_map_value<T>(env: &mut JNIEnv, map: &JMap, key: &str) -> Result<Option<T>>
where
for<'a> JObject<'a>: FromJObject<T>,
{
let key_obj: JObject = env.new_string(key)?.into();
if let Some(value) = map.get(env, &key_obj)? {
if value.is_null() {
Ok(None)
} else {
Ok(Some(value.extract()?))
}
} else {
Ok(None)
}
}
impl JMapExt for JMap<'_, '_, '_> {
fn get_string(&self, env: &mut JNIEnv, key: &str) -> Result<Option<String>> {
let key_obj: JObject = env.new_string(key)?.into();
if let Some(value) = self.get(env, &key_obj)? {
let value_str: JString = value.into();
Ok(Some(value_str.extract(env)?))
} else {
Ok(None)
}
}
fn get_i32(&self, env: &mut JNIEnv, key: &str) -> Result<Option<i32>> {
get_map_value(env, self, key)
}
fn get_i64(&self, env: &mut JNIEnv, key: &str) -> Result<Option<i64>> {
get_map_value(env, self, key)
}
fn get_f32(&self, env: &mut JNIEnv, key: &str) -> Result<Option<f32>> {
get_map_value(env, self, key)
}
fn get_f64(&self, env: &mut JNIEnv, key: &str) -> Result<Option<f64>> {
get_map_value(env, self, key)
}
}

92
java/core/pom.xml Normal file
View File

@@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.0.3-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>lancedb-core</artifactId>
<name>LanceDB Core</name>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-memory-netty</artifactId>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-c-data</artifactId>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-dataset</artifactId>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<dependency>
<groupId>org.questdb</groupId>
<artifactId>jar-jni</artifactId>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<profiles>
<profile>
<id>build-jni</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.questdb</groupId>
<artifactId>rust-maven-plugin</artifactId>
<version>1.1.1</version>
<executions>
<execution>
<id>lancedb-jni</id>
<goals>
<goal>build</goal>
</goals>
<configuration>
<path>lancedb-jni</path>
<!--<release>true</release>-->
<!-- Copy native libraries to target/classes for runtime access -->
<copyTo>${project.build.directory}/classes/nativelib</copyTo>
<copyWithPlatformDir>true</copyWithPlatformDir>
</configuration>
</execution>
<execution>
<id>lancedb-jni-test</id>
<goals>
<goal>test</goal>
</goals>
<configuration>
<path>lancedb-jni</path>
<release>false</release>
<verbosity>-v</verbosity>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@@ -0,0 +1,120 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.lancedb.lancedb;
import io.questdb.jar.jni.JarJniLoader;
import java.io.Closeable;
import java.util.List;
import java.util.Optional;
/**
* Represents LanceDB database.
*/
public class Connection implements Closeable {
static {
JarJniLoader.loadLib(Connection.class, "/nativelib", "lancedb_jni");
}
private long nativeConnectionHandle;
/**
* Connect to a LanceDB instance.
*/
public static native Connection connect(String uri);
/**
* Get the names of all tables in the database. The names are sorted in
* ascending order.
*
* @return the table names
*/
public List<String> tableNames() {
return tableNames(Optional.empty(), Optional.empty());
}
/**
* Get the names of filtered tables in the database. The names are sorted in
* ascending order.
*
* @param limit The number of results to return.
* @return the table names
*/
public List<String> tableNames(int limit) {
return tableNames(Optional.empty(), Optional.of(limit));
}
/**
* Get the names of filtered tables in the database. The names are sorted in
* ascending order.
*
* @param startAfter If present, only return names that come lexicographically after the supplied
* value. This can be combined with limit to implement pagination
* by setting this to the last table name from the previous page.
* @return the table names
*/
public List<String> tableNames(String startAfter) {
return tableNames(Optional.of(startAfter), Optional.empty());
}
/**
* Get the names of filtered tables in the database. The names are sorted in
* ascending order.
*
* @param startAfter If present, only return names that come lexicographically after the supplied
* value. This can be combined with limit to implement pagination
* by setting this to the last table name from the previous page.
* @param limit The number of results to return.
* @return the table names
*/
public List<String> tableNames(String startAfter, int limit) {
return tableNames(Optional.of(startAfter), Optional.of(limit));
}
/**
* Get the names of filtered tables in the database. The names are sorted in
* ascending order.
*
* @param startAfter If present, only return names that come lexicographically after the supplied
* value. This can be combined with limit to implement pagination
* by setting this to the last table name from the previous page.
* @param limit The number of results to return.
* @return the table names
*/
public native List<String> tableNames(
Optional<String> startAfter, Optional<Integer> limit);
/**
* Closes this connection and releases any system resources associated with it. If
* the connection is
* already closed, then invoking this method has no effect.
*/
@Override
public void close() {
if (nativeConnectionHandle != 0) {
releaseNativeConnection(nativeConnectionHandle);
nativeConnectionHandle = 0;
}
}
/**
* Native method to release the Lance connection resources associated with the
* given handle.
*
* @param handle The native handle to the connection resource.
*/
private native void releaseNativeConnection(long handle);
private Connection() {}
}

View File

@@ -0,0 +1,135 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.lancedb.lancedb;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.file.Path;
import java.util.List;
import java.net.URL;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
public class ConnectionTest {
private static final String[] TABLE_NAMES = {
"dataset_version",
"new_empty_dataset",
"test",
"write_stream"
};
@TempDir
static Path tempDir; // Temporary directory for the tests
private static URL lanceDbURL;
@BeforeAll
static void setUp() {
ClassLoader classLoader = ConnectionTest.class.getClassLoader();
lanceDbURL = classLoader.getResource("example_db");
}
@Test
void emptyDB() {
String databaseUri = tempDir.resolve("emptyDB").toString();
try (Connection conn = Connection.connect(databaseUri)) {
List<String> tableNames = conn.tableNames();
assertTrue(tableNames.isEmpty());
}
}
@Test
void tableNames() {
try (Connection conn = Connection.connect(lanceDbURL.toString())) {
List<String> tableNames = conn.tableNames();
assertEquals(4, tableNames.size());
for (int i = 0; i < TABLE_NAMES.length; i++) {
assertEquals(TABLE_NAMES[i], tableNames.get(i));
}
}
}
@Test
void tableNamesStartAfter() {
try (Connection conn = Connection.connect(lanceDbURL.toString())) {
assertTableNamesStartAfter(conn, TABLE_NAMES[0], 3, TABLE_NAMES[1], TABLE_NAMES[2], TABLE_NAMES[3]);
assertTableNamesStartAfter(conn, TABLE_NAMES[1], 2, TABLE_NAMES[2], TABLE_NAMES[3]);
assertTableNamesStartAfter(conn, TABLE_NAMES[2], 1, TABLE_NAMES[3]);
assertTableNamesStartAfter(conn, TABLE_NAMES[3], 0);
assertTableNamesStartAfter(conn, "a_dataset", 4, TABLE_NAMES[0], TABLE_NAMES[1], TABLE_NAMES[2], TABLE_NAMES[3]);
assertTableNamesStartAfter(conn, "o_dataset", 2, TABLE_NAMES[2], TABLE_NAMES[3]);
assertTableNamesStartAfter(conn, "v_dataset", 1, TABLE_NAMES[3]);
assertTableNamesStartAfter(conn, "z_dataset", 0);
}
}
private void assertTableNamesStartAfter(Connection conn, String startAfter, int expectedSize, String... expectedNames) {
List<String> tableNames = conn.tableNames(startAfter);
assertEquals(expectedSize, tableNames.size());
for (int i = 0; i < expectedNames.length; i++) {
assertEquals(expectedNames[i], tableNames.get(i));
}
}
@Test
void tableNamesLimit() {
try (Connection conn = Connection.connect(lanceDbURL.toString())) {
for (int i = 0; i <= TABLE_NAMES.length; i++) {
List<String> tableNames = conn.tableNames(i);
assertEquals(i, tableNames.size());
for (int j = 0; j < i; j++) {
assertEquals(TABLE_NAMES[j], tableNames.get(j));
}
}
}
}
@Test
void tableNamesStartAfterLimit() {
try (Connection conn = Connection.connect(lanceDbURL.toString())) {
List<String> tableNames = conn.tableNames(TABLE_NAMES[0], 2);
assertEquals(2, tableNames.size());
assertEquals(TABLE_NAMES[1], tableNames.get(0));
assertEquals(TABLE_NAMES[2], tableNames.get(1));
tableNames = conn.tableNames(TABLE_NAMES[1], 1);
assertEquals(1, tableNames.size());
assertEquals(TABLE_NAMES[2], tableNames.get(0));
tableNames = conn.tableNames(TABLE_NAMES[2], 2);
assertEquals(1, tableNames.size());
assertEquals(TABLE_NAMES[3], tableNames.get(0));
tableNames = conn.tableNames(TABLE_NAMES[3], 2);
assertEquals(0, tableNames.size());
tableNames = conn.tableNames(TABLE_NAMES[0], 0);
assertEquals(0, tableNames.size());
// Limit larger than the number of remaining tables
tableNames = conn.tableNames(TABLE_NAMES[0], 10);
assertEquals(3, tableNames.size());
assertEquals(TABLE_NAMES[1], tableNames.get(0));
assertEquals(TABLE_NAMES[2], tableNames.get(1));
assertEquals(TABLE_NAMES[3], tableNames.get(2));
// Start after a value not in the list
tableNames = conn.tableNames("non_existent_table", 2);
assertEquals(2, tableNames.size());
assertEquals(TABLE_NAMES[2], tableNames.get(0));
assertEquals(TABLE_NAMES[3], tableNames.get(1));
// Start after the last table with a limit
tableNames = conn.tableNames(TABLE_NAMES[3], 1);
assertEquals(0, tableNames.size());
}
}
}

View File

@@ -0,0 +1 @@
$d51afd07-e3cd-4c76-9b9b-787e13fd55b0<62>=id <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*int3208name <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*string08

View File

@@ -0,0 +1 @@
$15648e72-076f-4ef1-8b90-10d305b95b3b<33>=id <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*int3208name <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*string08

View File

@@ -0,0 +1 @@
$a3689caf-4f6b-4afc-a3c7-97af75661843<34>oitem <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*string8price <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*double80vector <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*fixed_size_list:float:28

219
java/pom.xml Normal file
View File

@@ -0,0 +1,219 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.0.3-SNAPSHOT</version>
<packaging>pom</packaging>
<name>Lance Parent</name>
<description>LanceDB Java API</description>
<url>http://lancedb.com/</url>
<developers>
<developer>
<name>Lance DB Dev Group</name>
<email>dev@lancedb.com</email>
</developer>
</developers>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
</license>
</licenses>
<scm>
<developerConnection>scm:git:git@github.com:lancedb/lancedb.git</developerConnection>
<tag>HEAD</tag>
<url>scm:git:git@github.com:lancedb/lancedb.git</url>
</scm>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<arrow.version>15.0.0</arrow.version>
</properties>
<modules>
<module>core</module>
</modules>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-memory-netty</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-c-data</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-dataset</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.questdb</groupId>
<artifactId>jar-jni</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.10.1</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20210307</version>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<version>3.3.1</version>
<configuration>
<configLocation>google_checks.xml</configLocation>
<consoleOutput>true</consoleOutput>
<failsOnError>true</failsOnError>
<violationSeverity>warning</violationSeverity>
<linkXRef>false</linkXRef>
</configuration>
<executions>
<execution>
<id>validate</id>
<phase>validate</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<compilerArgs>
<arg>-h</arg>
<arg>target/headers</arg>
</compilerArgs>
</configuration>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.5</version>
<configuration>
<argLine>--add-opens=java.base/java.nio=ALL-UNNAMED</argLine>
<forkNode implementation="org.apache.maven.plugin.surefire.extensions.SurefireForkNodeFactory" />
<useSystemClassLoader>false</useSystemClassLoader>
</configuration>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>
<profiles>
<profile>
<id>deploy-to-ossrh</id>
<build>
<plugins>
<plugin>
<groupId>org.sonatype.central</groupId>
<artifactId>central-publishing-maven-plugin</artifactId>
<version>0.4.0</version>
<extensions>true</extensions>
<configuration>
<publishingServerId>ossrh</publishingServerId>
<tokenAuth>true</tokenAuth>
</configuration>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://s01.oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.5</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.4.20",
"version": "0.5.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.4.20",
"version": "0.5.0",
"cpu": [
"x64",
"arm64"

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.4.20",
"version": "0.5.0",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.4.20",
"version": "0.5.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.4.20",
"version": "0.5.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.4.20",
"version": "0.5.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.4.20",
"version": "0.5.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.4.20",
"version": "0.5.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb",
"version": "0.4.20",
"version": "0.5.0",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"napi": {

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.7.0"
current_version = "0.8.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.7.0"
version = "0.8.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -10,7 +10,7 @@ dependencies = [
"tqdm>=4.27.0",
"pydantic>=1.10",
"attrs>=21.3.0",
"semver",
"packaging",
"cachetools",
"overrides>=0.7",
]

View File

@@ -74,7 +74,7 @@ class BedRockText(TextEmbeddingFunction):
profile_name: Union[str, None] = None
role_session_name: str = "lancedb-embeddings"
if PYDANTIC_VERSION < (2, 0): # Pydantic 1.x compat
if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat
class Config:
keep_untouched = (cached_property,)

View File

@@ -90,7 +90,7 @@ class GeminiText(TextEmbeddingFunction):
query_task_type: str = "retrieval_query"
source_task_type: str = "retrieval_document"
if PYDANTIC_VERSION < (2, 0): # Pydantic 1.x compat
if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat
class Config:
keep_untouched = (cached_property,)

View File

@@ -40,7 +40,7 @@ class ImageBindEmbeddings(EmbeddingFunction):
device: str = "cpu"
normalize: bool = False
if PYDANTIC_VERSION < (2, 0): # Pydantic 1.x compat
if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat
class Config:
keep_untouched = (cached_property,)

View File

@@ -54,7 +54,7 @@ class TransformersEmbeddingFunction(EmbeddingFunction):
self._tokenizer = transformers.AutoTokenizer.from_pretrained(self.name)
self._model = transformers.AutoModel.from_pretrained(self.name)
if PYDANTIC_VERSION < (2, 0): # Pydantic 1.x compat
if PYDANTIC_VERSION.major < 2: # Pydantic 1.x compat
class Config:
keep_untouched = (cached_property,)

View File

@@ -35,13 +35,13 @@ from typing import (
import numpy as np
import pyarrow as pa
import pydantic
import semver
from packaging.version import Version
PYDANTIC_VERSION = semver.parse_version_info(pydantic.__version__)
PYDANTIC_VERSION = Version(pydantic.__version__)
try:
from pydantic_core import CoreSchema, core_schema
except ImportError:
if PYDANTIC_VERSION >= (2,):
if PYDANTIC_VERSION.major >= 2:
raise
if TYPE_CHECKING:
@@ -144,7 +144,7 @@ def Vector(
raise TypeError("A list of numbers or numpy.ndarray is needed")
return cls(v)
if PYDANTIC_VERSION < (2, 0):
if PYDANTIC_VERSION.major < 2:
@classmethod
def __modify_schema__(cls, field_schema: Dict[str, Any]):

View File

@@ -1,5 +1,5 @@
import os
import semver
from packaging.version import Version
from functools import cached_property
from typing import Union
@@ -44,9 +44,8 @@ class CohereReranker(Reranker):
def _client(self):
cohere = attempt_import_or_raise("cohere")
# ensure version is at least 0.5.0
if (
hasattr(cohere, "__version__")
and semver.compare(cohere.__version__, "5.0.0") < 0
if hasattr(cohere, "__version__") and Version(cohere.__version__) < Version(
"0.5.0"
):
raise ValueError(
f"cohere version must be at least 0.5.0, found {cohere.__version__}"

View File

@@ -178,7 +178,7 @@ def test_fixed_size_list_field():
li: List[int]
data = TestModel(vec=list(range(16)), li=[1, 2, 3])
if PYDANTIC_VERSION >= (2,):
if PYDANTIC_VERSION.major >= 2:
assert json.loads(data.model_dump_json()) == {
"vec": list(range(16)),
"li": [1, 2, 3],
@@ -197,7 +197,7 @@ def test_fixed_size_list_field():
]
)
if PYDANTIC_VERSION >= (2,):
if PYDANTIC_VERSION.major >= 2:
json_schema = TestModel.model_json_schema()
else:
json_schema = TestModel.schema()

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.4.20"
version = "0.5.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.4.20"
version = "0.5.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -38,6 +38,7 @@ url.workspace = true
regex.workspace = true
serde = { version = "^1" }
serde_json = { version = "1" }
serde_with = { version = "3.8.1" }
# For remote feature
reqwest = { version = "0.11.24", features = ["gzip", "json"], optional = true }
polars-arrow = { version = ">=0.37,<0.40.0", optional = true }

View File

@@ -14,11 +14,14 @@
use std::sync::Arc;
use serde::Deserialize;
use serde_with::skip_serializing_none;
use crate::{table::TableInternal, Result};
use self::{
scalar::BTreeIndexBuilder,
vector::{IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
};
pub mod scalar;
@@ -28,6 +31,7 @@ pub enum Index {
Auto,
BTree(BTreeIndexBuilder),
IvfPq(IvfPqIndexBuilder),
IvfHnswPq(IvfHnswPqIndexBuilder),
IvfHnswSq(IvfHnswSqIndexBuilder),
}
@@ -69,6 +73,7 @@ impl IndexBuilder {
#[derive(Debug, Clone, PartialEq)]
pub enum IndexType {
IvfPq,
IvfHnswPq,
IvfHnswSq,
BTree,
}
@@ -83,3 +88,19 @@ pub struct IndexConfig {
/// be more columns to represent composite indices.
pub columns: Vec<String>,
}
#[skip_serializing_none]
#[derive(Debug, Deserialize)]
pub struct IndexMetadata {
pub metric_type: Option<String>,
pub index_type: Option<String>,
}
#[skip_serializing_none]
#[derive(Debug, Deserialize)]
pub struct IndexStatistics {
pub num_indexed_rows: usize,
pub num_unindexed_rows: usize,
pub index_type: Option<String>,
pub indices: Vec<IndexMetadata>,
}

View File

@@ -19,8 +19,6 @@
//! values
use std::cmp::max;
use serde::Deserialize;
use lance::table::format::{Index, Manifest};
use crate::DistanceType;
@@ -46,18 +44,118 @@ impl VectorIndex {
}
}
#[derive(Debug, Deserialize)]
pub struct VectorIndexMetadata {
pub metric_type: String,
pub index_type: String,
macro_rules! impl_distance_type_setter {
() => {
/// [DistanceType] to use to build the index.
///
/// Default value is [DistanceType::L2].
///
/// This is used when training the index to calculate the IVF partitions (vectors are
/// grouped in partitions with similar vectors according to this distance type) and to
/// calculate a subvector's code during quantization.
///
/// The metric type used to train an index MUST match the metric type used to search the
/// index. Failure to do so will yield inaccurate results.
pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
self.distance_type = distance_type;
self
}
};
}
#[derive(Debug, Deserialize)]
pub struct VectorIndexStatistics {
pub num_indexed_rows: usize,
pub num_unindexed_rows: usize,
pub index_type: String,
pub indices: Vec<VectorIndexMetadata>,
macro_rules! impl_ivf_params_setter {
() => {
/// The number of IVF partitions to create.
///
/// This value should generally scale with the number of rows in the dataset. By default
/// the number of partitions is the square root of the number of rows.
///
/// If this value is too large then the first part of the search (picking the right partition)
/// will be slow. If this value is too small then the second part of the search (searching
/// within a partition) will be slow.
pub fn num_partitions(mut self, num_partitions: u32) -> Self {
self.num_partitions = Some(num_partitions);
self
}
/// The rate used to calculate the number of training vectors for kmeans.
///
/// When an IVF index is trained, we need to calculate partitions. These are groups
/// of vectors that are similar to each other. To do this we use an algorithm called kmeans.
///
/// Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
/// random sample of the data. This parameter controls the size of the sample. The total
/// number of vectors used to train the index is `sample_rate * num_partitions`.
///
/// Increasing this value might improve the quality of the index but in most cases the
/// default should be sufficient.
///
/// The default value is 256.
pub fn sample_rate(mut self, sample_rate: u32) -> Self {
self.sample_rate = sample_rate;
self
}
/// Max iterations to train kmeans.
///
/// When training an IVF index we use kmeans to calculate the partitions. This parameter
/// controls how many iterations of kmeans to run.
///
/// Increasing this might improve the quality of the index but in most cases the parameter
/// is unused because kmeans will converge with fewer iterations. The parameter is only
/// used in cases where kmeans does not appear to converge. In those cases it is unlikely
/// that setting this larger will lead to the index converging anyways.
///
/// The default value is 50.
pub fn max_iterations(mut self, max_iterations: u32) -> Self {
self.max_iterations = max_iterations;
self
}
};
}
macro_rules! impl_pq_params_setter {
() => {
/// Number of sub-vectors of PQ.
///
/// This value controls how much the vector is compressed during the quantization step.
/// The more sub vectors there are the less the vector is compressed. The default is
/// the dimension of the vector divided by 16. If the dimension is not evenly divisible
/// by 16 we use the dimension divded by 8.
///
/// The above two cases are highly preferred. Having 8 or 16 values per subvector allows
/// us to use efficient SIMD instructions.
///
/// If the dimension is not visible by 8 then we use 1 subvector. This is not ideal and
/// will likely result in poor performance.
pub fn num_sub_vectors(mut self, num_sub_vectors: u32) -> Self {
self.num_sub_vectors = Some(num_sub_vectors);
self
}
};
}
macro_rules! impl_hnsw_params_setter {
() => {
/// The number of neighbors to select for each vector in the HNSW graph.
/// This value controls the tradeoff between search speed and accuracy.
/// The higher the value the more accurate the search but the slower it will be.
/// The default value is 20.
pub fn num_edges(mut self, m: u32) -> Self {
self.m = m;
self
}
/// The number of candidates to evaluate during the construction of the HNSW graph.
/// This value controls the tradeoff between build speed and accuracy.
/// The higher the value the more accurate the build but the slower it will be.
/// This value should be set to a value that is not less than `ef` in the search phase.
/// The default value is 300.
pub fn ef_construction(mut self, ef_construction: u32) -> Self {
self.ef_construction = ef_construction;
self
}
};
}
/// Builder for an IVF PQ index.
@@ -106,84 +204,9 @@ impl Default for IvfPqIndexBuilder {
}
impl IvfPqIndexBuilder {
/// [DistanceType] to use to build the index.
///
/// Default value is [DistanceType::L2].
///
/// This is used when training the index to calculate the IVF partitions (vectors are
/// grouped in partitions with similar vectors according to this distance type) and to
/// calculate a subvector's code during quantization.
///
/// The metric type used to train an index MUST match the metric type used to search the
/// index. Failure to do so will yield inaccurate results.
pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
self.distance_type = distance_type;
self
}
/// The number of IVF partitions to create.
///
/// This value should generally scale with the number of rows in the dataset. By default
/// the number of partitions is the square root of the number of rows.
///
/// If this value is too large then the first part of the search (picking the right partition)
/// will be slow. If this value is too small then the second part of the search (searching
/// within a partition) will be slow.
pub fn num_partitions(mut self, num_partitions: u32) -> Self {
self.num_partitions = Some(num_partitions);
self
}
/// Number of sub-vectors of PQ.
///
/// This value controls how much the vector is compressed during the quantization step.
/// The more sub vectors there are the less the vector is compressed. The default is
/// the dimension of the vector divided by 16. If the dimension is not evenly divisible
/// by 16 we use the dimension divded by 8.
///
/// The above two cases are highly preferred. Having 8 or 16 values per subvector allows
/// us to use efficient SIMD instructions.
///
/// If the dimension is not visible by 8 then we use 1 subvector. This is not ideal and
/// will likely result in poor performance.
pub fn num_sub_vectors(mut self, num_sub_vectors: u32) -> Self {
self.num_sub_vectors = Some(num_sub_vectors);
self
}
/// The rate used to calculate the number of training vectors for kmeans.
///
/// When an IVF PQ index is trained, we need to calculate partitions. These are groups
/// of vectors that are similar to each other. To do this we use an algorithm called kmeans.
///
/// Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
/// random sample of the data. This parameter controls the size of the sample. The total
/// number of vectors used to train the index is `sample_rate * num_partitions`.
///
/// Increasing this value might improve the quality of the index but in most cases the
/// default should be sufficient.
///
/// The default value is 256.
pub fn sample_rate(mut self, sample_rate: u32) -> Self {
self.sample_rate = sample_rate;
self
}
/// Max iterations to train kmeans.
///
/// When training an IVF PQ index we use kmeans to calculate the partitions. This parameter
/// controls how many iterations of kmeans to run.
///
/// Increasing this might improve the quality of the index but in most cases the parameter
/// is unused because kmeans will converge with fewer iterations. The parameter is only
/// used in cases where kmeans does not appear to converge. In those cases it is unlikely
/// that setting this larger will lead to the index converging anyways.
///
/// The default value is 50.
pub fn max_iterations(mut self, max_iterations: u32) -> Self {
self.max_iterations = max_iterations;
self
}
impl_distance_type_setter!();
impl_ivf_params_setter!();
impl_pq_params_setter!();
}
pub(crate) fn suggested_num_partitions(rows: usize) -> u32 {
@@ -206,6 +229,51 @@ pub(crate) fn suggested_num_sub_vectors(dim: u32) -> u32 {
}
}
/// Builder for an IVF HNSW PQ index.
///
/// This index is a combination of IVF and HNSW.
/// The IVF part is the same as the IVF PQ index.
/// For each IVF partition, this builds a HNSW graph, the graph is used to
/// quickly find the closest vectors to a query vector.
///
/// The PQ (product quantizer) is used to compress the vectors as the same as IVF PQ.
#[derive(Debug, Clone)]
pub struct IvfHnswPqIndexBuilder {
// IVF
pub(crate) distance_type: DistanceType,
pub(crate) num_partitions: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
// HNSW
pub(crate) m: u32,
pub(crate) ef_construction: u32,
// PQ
pub(crate) num_sub_vectors: Option<u32>,
}
impl Default for IvfHnswPqIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
num_sub_vectors: None,
sample_rate: 256,
max_iterations: 50,
m: 20,
ef_construction: 300,
}
}
}
impl IvfHnswPqIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
impl_hnsw_params_setter!();
impl_pq_params_setter!();
}
/// Builder for an IVF_HNSW_SQ index.
///
/// This index is a combination of IVF and HNSW.
@@ -244,85 +312,7 @@ impl Default for IvfHnswSqIndexBuilder {
}
impl IvfHnswSqIndexBuilder {
/// [DistanceType] to use to build the index.
///
/// Default value is [DistanceType::L2].
///
/// This is used when training the index to calculate the IVF partitions (vectors are
/// grouped in partitions with similar vectors according to this distance type)
///
/// The metric type used to train an index MUST match the metric type used to search the
/// index. Failure to do so will yield inaccurate results.
///
/// Now IVF_HNSW_SQ only supports L2 and Cosine distance types.
pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
self.distance_type = distance_type;
self
}
/// The number of IVF partitions to create.
///
/// This value should generally scale with the number of rows in the dataset. By default
/// the number of partitions is the square root of the number of rows.
///
/// If this value is too large then the first part of the search (picking the right partition)
/// will be slow. If this value is too small then the second part of the search (searching
/// within a partition) will be slow.
pub fn num_partitions(mut self, num_partitions: u32) -> Self {
self.num_partitions = Some(num_partitions);
self
}
/// The rate used to calculate the number of training vectors for kmeans and SQ.
///
/// When an IVF_HNSW_SQ index is trained, we need to calculate partitions and min/max value of vectors. These are groups
/// of vectors that are similar to each other. To do this we use an algorithm called kmeans.
///
/// Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
/// random sample of the data. This parameter controls the size of the sample. The total
/// number of vectors used to train the IVF is `sample_rate * num_partitions`.
///
/// The total number of vectors used to train the SQ is `sample_rate * 2^{num_bits}`.
///
/// Increasing this value might improve the quality of the index but in most cases the
/// default should be sufficient.
///
/// The default value is 256.
pub fn sample_rate(mut self, sample_rate: u32) -> Self {
self.sample_rate = sample_rate;
self
}
/// Max iterations to train kmeans.
///
/// When training an IVF index we use kmeans to calculate the partitions. This parameter
/// controls how many iterations of kmeans to run.
///
/// Increasing this might improve the quality of the index but in most cases the parameter
/// is unused because kmeans will converge with fewer iterations. The parameter is only
/// used in cases where kmeans does not appear to converge. In those cases it is unlikely
/// that setting this larger will lead to the index converging anyways.
///
/// The default value is 50.
pub fn max_iterations(mut self, max_iterations: u32) -> Self {
self.max_iterations = max_iterations;
self
}
/// The number of neighbors to select for each vector in the HNSW graph.
/// Bumping this number will increase the recall of the search but also increase the build/search time.
/// The default value is 20.
pub fn m(mut self, m: u32) -> Self {
self.m = m;
self
}
/// The number of candidates to evaluate during the construction of the HNSW graph.
/// Bumping this number will increase the recall of the search but also increase the build/search time.
/// This value should be not less than `ef` in the search phase.
/// The default value is 300.
pub fn ef_construction(mut self, ef_construction: u32) -> Self {
self.ef_construction = ef_construction;
self
}
impl_distance_type_setter!();
impl_ivf_params_setter!();
impl_hnsw_params_setter!();
}

View File

@@ -37,6 +37,7 @@ use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatch
use lance::io::WrappingObjectStore;
use lance_index::vector::hnsw::builder::HnswBuildParams;
use lance_index::vector::ivf::IvfBuildParams;
use lance_index::vector::pq::PQBuildParams;
use lance_index::vector::sq::builder::SQBuildParams;
use lance_index::DatasetIndexExt;
use lance_index::IndexType;
@@ -49,9 +50,10 @@ use crate::connection::NoData;
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
use crate::error::{Error, Result};
use crate::index::vector::{
IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex, VectorIndexStatistics,
IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
};
use crate::index::IndexConfig;
use crate::index::IndexStatistics;
use crate::index::{
vector::{suggested_num_partitions, suggested_num_sub_vectors},
Index, IndexBuilder,
@@ -1217,7 +1219,7 @@ impl NativeTable {
pub async fn get_index_type(&self, index_uuid: &str) -> Result<Option<String>> {
match self.load_index_stats(index_uuid).await? {
Some(stats) => Ok(Some(stats.index_type)),
Some(stats) => Ok(Some(stats.index_type.unwrap_or_default())),
None => Ok(None),
}
}
@@ -1228,7 +1230,7 @@ impl NativeTable {
stats
.indices
.iter()
.map(|i| i.metric_type.clone())
.filter_map(|i| i.metric_type.clone())
.collect(),
)),
None => Ok(None),
@@ -1244,7 +1246,7 @@ impl NativeTable {
.collect())
}
async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<VectorIndexStatistics>> {
async fn load_index_stats(&self, index_uuid: &str) -> Result<Option<IndexStatistics>> {
let index = self
.load_indices()
.await?
@@ -1255,7 +1257,7 @@ impl NativeTable {
}
let dataset = self.dataset.get().await?;
let index_stats = dataset.index_statistics(&index.unwrap().index_name).await?;
let index_stats: VectorIndexStatistics = whatever!(
let index_stats: IndexStatistics = whatever!(
serde_json::from_str(&index_stats),
"error deserializing index statistics {index_stats}",
);
@@ -1316,6 +1318,69 @@ impl NativeTable {
Ok(())
}
async fn create_ivf_hnsw_pq_index(
&self,
index: IvfHnswPqIndexBuilder,
field: &Field,
replace: bool,
) -> Result<()> {
if !Self::supported_vector_data_type(field.data_type()) {
return Err(Error::InvalidInput {
message: format!(
"An IVF HNSW PQ index cannot be created on the column `{}` which has data type {}",
field.name(),
field.data_type()
),
});
}
let num_partitions = if let Some(n) = index.num_partitions {
n
} else {
suggested_num_partitions(self.count_rows(None).await?)
};
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
n
} else {
match field.data_type() {
arrow_schema::DataType::FixedSizeList(_, n) => {
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
}
_ => Err(Error::Schema {
message: format!("Column '{}' is not a FixedSizeList", field.name()),
}),
}?
};
let mut dataset = self.dataset.get_mut().await?;
let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
ivf_params.sample_rate = index.sample_rate as usize;
ivf_params.max_iters = index.max_iterations as usize;
let hnsw_params = HnswBuildParams::default()
.num_edges(index.m as usize)
.ef_construction(index.ef_construction as usize);
let pq_params = PQBuildParams {
num_sub_vectors: num_sub_vectors as usize,
..Default::default()
};
let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_pq_params(
index.distance_type.into(),
ivf_params,
hnsw_params,
pq_params,
);
dataset
.create_index(
&[field.name()],
IndexType::Vector,
None,
&lance_idx_params,
replace,
)
.await?;
Ok(())
}
async fn create_ivf_hnsw_sq_index(
&self,
index: IvfHnswSqIndexBuilder,
@@ -1610,6 +1675,10 @@ impl TableInternal for NativeTable {
Index::Auto => self.create_auto_index(field, opts).await,
Index::BTree(_) => self.create_btree_index(field, opts).await,
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
Index::IvfHnswPq(ivf_hnsw_pq) => {
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
.await
}
Index::IvfHnswSq(ivf_hnsw_sq) => {
self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
.await
@@ -2475,6 +2544,25 @@ mod tests {
.unwrap(),
Some(0)
);
assert_eq!(
table
.as_native()
.unwrap()
.get_index_type(index_uuid)
.await
.unwrap()
.map(|index_type| index_type.to_string()),
Some("IVF".to_string())
);
assert_eq!(
table
.as_native()
.unwrap()
.get_distance_type(index_uuid)
.await
.unwrap(),
Some(crate::DistanceType::L2.to_string())
);
}
#[tokio::test]
@@ -2573,6 +2661,102 @@ mod tests {
);
}
#[tokio::test]
async fn test_create_index_ivf_hnsw_pq() {
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use rand;
use std::iter::repeat_with;
use arrow_array::Float32Array;
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let conn = connect(uri).execute().await.unwrap();
let dimension = 16;
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
"embeddings",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
dimension,
),
false,
)]));
let mut rng = rand::thread_rng();
let float_arr = Float32Array::from(
repeat_with(|| rng.gen::<f32>())
.take(512 * dimension as usize)
.collect::<Vec<f32>>(),
);
let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
let batches = RecordBatchIterator::new(
vec![RecordBatch::try_new(schema.clone(), vec![vectors.clone()]).unwrap()]
.into_iter()
.map(Ok),
schema,
);
let table = conn.create_table("test", batches).execute().await.unwrap();
assert_eq!(
table
.as_native()
.unwrap()
.count_indexed_rows("my_index")
.await
.unwrap(),
None
);
assert_eq!(
table
.as_native()
.unwrap()
.count_unindexed_rows("my_index")
.await
.unwrap(),
None
);
let index = IvfHnswPqIndexBuilder::default();
table
.create_index(&["embeddings"], Index::IvfHnswPq(index))
.execute()
.await
.unwrap();
let index_configs = table.list_indices().await.unwrap();
assert_eq!(index_configs.len(), 1);
let index = index_configs.into_iter().next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::IvfPq);
assert_eq!(index.columns, vec!["embeddings".to_string()]);
assert_eq!(table.count_rows(None).await.unwrap(), 512);
assert_eq!(table.name(), "test");
let indices = table.as_native().unwrap().load_indices().await.unwrap();
let index_uuid = &indices[0].index_uuid;
assert_eq!(
table
.as_native()
.unwrap()
.count_indexed_rows(index_uuid)
.await
.unwrap(),
Some(512)
);
assert_eq!(
table
.as_native()
.unwrap()
.count_unindexed_rows(index_uuid)
.await
.unwrap(),
Some(0)
);
}
fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
let list_type = DataType::FixedSizeList(
Arc::new(Field::new("item", values.data_type().clone(), true)),
@@ -2644,6 +2828,27 @@ mod tests {
let index = index_configs.into_iter().next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::BTree);
assert_eq!(index.columns, vec!["i".to_string()]);
let indices = table.as_native().unwrap().load_indices().await.unwrap();
let index_uuid = &indices[0].index_uuid;
assert_eq!(
table
.as_native()
.unwrap()
.count_indexed_rows(index_uuid)
.await
.unwrap(),
Some(1)
);
assert_eq!(
table
.as_native()
.unwrap()
.count_unindexed_rows(index_uuid)
.await
.unwrap(),
Some(0)
);
}
#[tokio::test]