From 5f261cf2d8209b24ca682795dc93f1ee11112bc5 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 5 Dec 2024 10:53:59 -0800 Subject: [PATCH] feat: upgrade to Lance v0.20.0 (#1908) Upstream change log: https://github.com/lancedb/lance/releases/tag/v0.20.0 --- Cargo.toml | 36 ++++++++++++++++++------------------ python/Cargo.toml | 13 ++++--------- python/pyproject.toml | 2 +- python/src/arrow.rs | 2 +- python/src/connection.rs | 6 +++++- python/src/error.rs | 6 ++++-- python/src/index.rs | 4 ++++ python/src/lib.rs | 8 ++++++-- python/src/query.rs | 4 +++- python/src/table.rs | 19 +++++++++++-------- 10 files changed, 57 insertions(+), 43 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1c529852..3107fb8c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,27 +23,27 @@ rust-version = "1.80.0" # TO [workspace.dependencies] lance = { "version" = "=0.20.0", "features" = [ "dynamodb", -], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-io = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } -lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.3" } +] } +lance-io = "0.20.0" +lance-index = "0.20.0" +lance-linalg = "0.20.0" +lance-table = "0.20.0" +lance-testing = "0.20.0" +lance-datafusion = "0.20.0" +lance-encoding = "0.20.0" # Note that this one does not include pyarrow -arrow = { version = "52.2", optional = false } -arrow-array = "52.2" -arrow-data = "52.2" -arrow-ipc = "52.2" -arrow-ord = "52.2" -arrow-schema = "52.2" -arrow-arith = "52.2" -arrow-cast = "52.2" +arrow = { version = "53.2", optional = false } +arrow-array = "53.2" +arrow-data = "53.2" +arrow-ipc = "53.2" +arrow-ord = "53.2" +arrow-schema = "53.2" +arrow-arith = "53.2" +arrow-cast = "53.2" async-trait = "0" chrono = "0.4.35" -datafusion-common = "41.0" -datafusion-physical-plan = "41.0" +datafusion-common = "42.0" +datafusion-physical-plan = "42.0" env_logger = "0.10" half = { "version" = "=2.4.1", default-features = false, features = [ "num-traits", diff --git a/python/Cargo.toml b/python/Cargo.toml index 856bb034..b90018ae 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -14,23 +14,18 @@ name = "_lancedb" crate-type = ["cdylib"] [dependencies] -arrow = { version = "52.1", features = ["pyarrow"] } +arrow = { version = "53.2", features = ["pyarrow"] } lancedb = { path = "../rust/lancedb", default-features = false } env_logger.workspace = true -pyo3 = { version = "0.21", features = [ +pyo3 = { version = "0.22.2", features = [ "extension-module", "abi3-py39", "gil-refs" ] } -# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119 -# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] } -pyo3-asyncio-0-21 = { version = "0.21.0", features = [ - "attributes", - "tokio-runtime" -] } +pyo3-async-runtimes = { version = "0.22", features = ["attributes", "tokio-runtime"] } pin-project = "1.1.5" futures.workspace = true -tokio = { version = "1.36.0", features = ["sync"] } +tokio = { version = "1.40", features = ["sync"] } [build-dependencies] pyo3-build-config = { version = "0.20.3", features = [ diff --git a/python/pyproject.toml b/python/pyproject.toml index 71c293f5..a05e5bc7 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ name = "lancedb" # version in Cargo.toml dependencies = [ "deprecation", - "pylance==0.20.0b3", + "pylance==0.20.0", "tqdm>=4.27.0", "pydantic>=1.10", "packaging", diff --git a/python/src/arrow.rs b/python/src/arrow.rs index 07e569d0..c5c53b54 100644 --- a/python/src/arrow.rs +++ b/python/src/arrow.rs @@ -10,7 +10,7 @@ use arrow::{ use futures::stream::StreamExt; use lancedb::arrow::SendableRecordBatchStream; use pyo3::{pyclass, pymethods, Bound, PyAny, PyObject, PyRef, PyResult, Python}; -use pyo3_asyncio_0_21::tokio::future_into_py; +use pyo3_async_runtimes::tokio::future_into_py; use crate::error::PythonErrorExt; diff --git a/python/src/connection.rs b/python/src/connection.rs index 46e15cfb..5648dfd9 100644 --- a/python/src/connection.rs +++ b/python/src/connection.rs @@ -9,7 +9,7 @@ use pyo3::{ exceptions::{PyRuntimeError, PyValueError}, pyclass, pyfunction, pymethods, Bound, FromPyObject, PyAny, PyRef, PyResult, Python, }; -use pyo3_asyncio_0_21::tokio::future_into_py; +use pyo3_async_runtimes::tokio::future_into_py; use crate::{error::PythonErrorExt, table::Table}; @@ -58,6 +58,7 @@ impl Connection { self.inner.take(); } + #[pyo3(signature = (start_after=None, limit=None))] pub fn table_names( self_: PyRef<'_, Self>, start_after: Option, @@ -74,6 +75,7 @@ impl Connection { future_into_py(self_.py(), async move { op.execute().await.infer_error() }) } + #[pyo3(signature = (name, mode, data, storage_options=None, data_storage_version=None, enable_v2_manifest_paths=None))] pub fn create_table<'a>( self_: PyRef<'a, Self>, name: String, @@ -111,6 +113,7 @@ impl Connection { }) } + #[pyo3(signature = (name, mode, schema, storage_options=None, data_storage_version=None, enable_v2_manifest_paths=None))] pub fn create_empty_table<'a>( self_: PyRef<'a, Self>, name: String, @@ -198,6 +201,7 @@ impl Connection { } #[pyfunction] +#[pyo3(signature = (uri, api_key=None, region=None, host_override=None, read_consistency_interval=None, client_config=None, storage_options=None))] #[allow(clippy::too_many_arguments)] pub fn connect( py: Python, diff --git a/python/src/error.rs b/python/src/error.rs index 4855b8f5..f34eedcb 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -138,7 +138,9 @@ fn http_from_rust_error( status_code: Option, ) -> PyResult { let message = err.to_string(); - let http_err_cls = py.import("lancedb.remote.errors")?.getattr("HttpError")?; + let http_err_cls = py + .import_bound("lancedb.remote.errors")? + .getattr("HttpError")?; let py_err = http_err_cls.call1((message, request_id, status_code))?; // Reset the traceback since it doesn't provide additional information. @@ -149,5 +151,5 @@ fn http_from_rust_error( py_err.setattr(intern!(py, "__cause__"), cause_err)?; } - Ok(PyErr::from_value(py_err)) + Ok(PyErr::from_value_bound(py_err)) } diff --git a/python/src/index.rs b/python/src/index.rs index 4ea4c19f..fd09d847 100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -47,6 +47,7 @@ impl Index { #[pymethods] impl Index { + #[pyo3(signature = (distance_type=None, num_partitions=None, num_sub_vectors=None, max_iterations=None, sample_rate=None))] #[staticmethod] pub fn ivf_pq( distance_type: Option, @@ -106,6 +107,7 @@ impl Index { }) } + #[pyo3(signature = (with_position=None, base_tokenizer=None, language=None, max_token_length=None, lower_case=None, stem=None, remove_stop_words=None, ascii_folding=None))] #[allow(clippy::too_many_arguments)] #[staticmethod] pub fn fts( @@ -146,6 +148,7 @@ impl Index { } } + #[pyo3(signature = (distance_type=None, num_partitions=None, num_sub_vectors=None, max_iterations=None, sample_rate=None, m=None, ef_construction=None))] #[staticmethod] pub fn hnsw_pq( distance_type: Option, @@ -184,6 +187,7 @@ impl Index { }) } + #[pyo3(signature = (distance_type=None, num_partitions=None, max_iterations=None, sample_rate=None, m=None, ef_construction=None))] #[staticmethod] pub fn hnsw_sq( distance_type: Option, diff --git a/python/src/lib.rs b/python/src/lib.rs index 9d1f0a80..01e39cae 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -16,7 +16,11 @@ use arrow::RecordBatchStream; use connection::{connect, Connection}; use env_logger::Env; use index::{Index, IndexConfig}; -use pyo3::{pymodule, types::PyModule, wrap_pyfunction, PyResult, Python}; +use pyo3::{ + pymodule, + types::{PyModule, PyModuleMethods}, + wrap_pyfunction, Bound, PyResult, Python, +}; use query::{Query, VectorQuery}; use table::Table; @@ -29,7 +33,7 @@ pub mod table; pub mod util; #[pymodule] -pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn _lancedb(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let env = Env::new() .filter_or("LANCEDB_LOG", "warn") .write_style("LANCEDB_LOG_STYLE"); diff --git a/python/src/query.rs b/python/src/query.rs index 5eeb4aa8..486363f9 100644 --- a/python/src/query.rs +++ b/python/src/query.rs @@ -29,7 +29,7 @@ use pyo3::PyAny; use pyo3::PyRef; use pyo3::PyResult; use pyo3::{pyclass, PyErr}; -use pyo3_asyncio_0_21::tokio::future_into_py; +use pyo3_async_runtimes::tokio::future_into_py; use crate::arrow::RecordBatchStream; use crate::error::PythonErrorExt; @@ -105,6 +105,7 @@ impl Query { Ok(()) } + #[pyo3(signature = (max_batch_length=None))] pub fn execute( self_: PyRef<'_, Self>, max_batch_length: Option, @@ -203,6 +204,7 @@ impl VectorQuery { self.inner = self.inner.clone().bypass_vector_index() } + #[pyo3(signature = (max_batch_length=None))] pub fn execute( self_: PyRef<'_, Self>, max_batch_length: Option, diff --git a/python/src/table.rs b/python/src/table.rs index 25a3b97e..a5f446ec 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -12,10 +12,10 @@ use lancedb::table::{ use pyo3::{ exceptions::{PyRuntimeError, PyValueError}, pyclass, pymethods, - types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods, PyString}, + types::{IntoPyDict, PyAnyMethods, PyDict, PyDictMethods}, Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject, }; -use pyo3_asyncio_0_21::tokio::future_into_py; +use pyo3_async_runtimes::tokio::future_into_py; use crate::{ error::PythonErrorExt, @@ -141,9 +141,10 @@ impl Table { }) } + #[pyo3(signature = (updates, r#where=None))] pub fn update<'a>( self_: PyRef<'a, Self>, - updates: &PyDict, + updates: &Bound<'_, PyDict>, r#where: Option, ) -> PyResult> { let mut op = self_.inner_ref()?.update(); @@ -151,10 +152,8 @@ impl Table { op = op.only_if(only_if); } for (column_name, value) in updates.into_iter() { - let column_name: &PyString = column_name.downcast()?; - let column_name = column_name.to_str()?.to_string(); - let value: &PyString = value.downcast()?; - let value = value.to_str()?.to_string(); + let column_name: String = column_name.extract()?; + let value: String = value.extract()?; op = op.column(column_name, value); } future_into_py(self_.py(), async move { @@ -163,6 +162,7 @@ impl Table { }) } + #[pyo3(signature = (filter=None))] pub fn count_rows( self_: PyRef<'_, Self>, filter: Option, @@ -173,6 +173,7 @@ impl Table { }) } + #[pyo3(signature = (column, index=None, replace=None))] pub fn create_index<'a>( self_: PyRef<'a, Self>, column: String, @@ -267,7 +268,8 @@ impl Table { .unwrap(); let tup: Vec<(&String, &String)> = v.metadata.iter().collect(); - dict.set_item("metadata", tup.into_py_dict(py)).unwrap(); + dict.set_item("metadata", tup.into_py_dict_bound(py)) + .unwrap(); dict.to_object(py) }) .collect::>() @@ -303,6 +305,7 @@ impl Table { Query::new(self.inner_ref().unwrap().query()) } + #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))] pub fn optimize( self_: PyRef<'_, Self>, cleanup_since_ms: Option,