feat: make it possible to opt in to using the v2 format (#1352)

This also exposes the max_batch_length configuration option in
Python/Node (it was needed to verify whether we are actually in v2 mode
or not).
Weston Pace
2024-06-04 21:52:14 -07:00
committed by GitHub
parent d39e7d23f4
commit d5586c9c32
17 changed files with 310 additions and 33 deletions
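For context, a minimal sketch of how a caller opts in to the new format through the Rust lancedb crate. This is not part of the diff: the URI, table name, schema, and the connect/create_empty_table/lancedb::Result usage are assumptions about the crate API, while use_legacy_format and the builder's execute() are the calls the bindings below forward.

use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema};
use lancedb::connect;

// Hedged sketch: the path and table name are illustrative only.
async fn create_v2_table() -> lancedb::Result<()> {
    let db = connect("data/sample-lancedb").execute().await?;
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    db.create_empty_table("my_table", schema)
        // Passing false opts out of the legacy (v1) format, i.e. opts in to v2.
        .use_legacy_format(false)
        .execute()
        .await?;
    Ok(())
}

When the option is not supplied, the Option<bool> stays None and the builder keeps its default, which is exactly how the bindings below forward it.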

View File

@@ -91,6 +91,7 @@ impl Connection {
         mode: &str,
         data: &PyAny,
         storage_options: Option<HashMap<String, String>>,
+        use_legacy_format: Option<bool>,
     ) -> PyResult<&'a PyAny> {
         let inner = self_.get_inner()?.clone();
@@ -103,6 +104,10 @@ impl Connection {
             builder = builder.storage_options(storage_options);
         }
+        if let Some(use_legacy_format) = use_legacy_format {
+            builder = builder.use_legacy_format(use_legacy_format);
+        }
         future_into_py(self_.py(), async move {
             let table = builder.execute().await.infer_error()?;
             Ok(Table::new(table))
@@ -115,6 +120,7 @@ impl Connection {
         mode: &str,
         schema: &PyAny,
         storage_options: Option<HashMap<String, String>>,
+        use_legacy_format: Option<bool>,
     ) -> PyResult<&'a PyAny> {
         let inner = self_.get_inner()?.clone();
@@ -128,6 +134,10 @@ impl Connection {
             builder = builder.storage_options(storage_options);
         }
+        if let Some(use_legacy_format) = use_legacy_format {
+            builder = builder.use_legacy_format(use_legacy_format);
+        }
         future_into_py(self_.py(), async move {
             let table = builder.execute().await.infer_error()?;
             Ok(Table::new(table))

View File

@@ -15,6 +15,7 @@
 use arrow::array::make_array;
 use arrow::array::ArrayData;
 use arrow::pyarrow::FromPyArrow;
+use lancedb::query::QueryExecutionOptions;
 use lancedb::query::{
     ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
 };
@@ -61,10 +62,14 @@ impl Query {
         Ok(VectorQuery { inner })
     }
 
-    pub fn execute(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
+    pub fn execute(self_: PyRef<'_, Self>, max_batch_length: Option<u32>) -> PyResult<&PyAny> {
         let inner = self_.inner.clone();
         future_into_py(self_.py(), async move {
-            let inner_stream = inner.execute().await.infer_error()?;
+            let mut opts = QueryExecutionOptions::default();
+            if let Some(max_batch_length) = max_batch_length {
+                opts.max_batch_length = max_batch_length;
+            }
+            let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
             Ok(RecordBatchStream::new(inner_stream))
         })
     }
@@ -115,10 +120,14 @@ impl VectorQuery {
         self.inner = self.inner.clone().bypass_vector_index()
     }
 
-    pub fn execute(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
+    pub fn execute(self_: PyRef<'_, Self>, max_batch_length: Option<u32>) -> PyResult<&PyAny> {
         let inner = self_.inner.clone();
         future_into_py(self_.py(), async move {
-            let inner_stream = inner.execute().await.infer_error()?;
+            let mut opts = QueryExecutionOptions::default();
+            if let Some(max_batch_length) = max_batch_length {
+                opts.max_batch_length = max_batch_length;
+            }
+            let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
             Ok(RecordBatchStream::new(inner_stream))
         })
     }
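And a sketch of the Rust query call that the two execute() bindings above wrap: raising max_batch_length lets a caller check whether v2 mode is actually in effect, since with the legacy format batches stay capped at the file's small row-group size. This is not part of the diff; it assumes Table::query() from the lancedb crate plus the futures and anyhow crates, and that the stream yields Arrow record batches. QueryExecutionOptions, execute_with_options, and max_batch_length are the items used in the hunks above.

use anyhow::Result;
use futures::TryStreamExt;
use lancedb::query::{ExecutableQuery, QueryExecutionOptions};
use lancedb::Table;

// Hedged sketch: 100_000 is an arbitrary "much larger than a v1 row group" value.
async fn check_batch_sizes(table: &Table) -> Result<()> {
    let mut opts = QueryExecutionOptions::default();
    opts.max_batch_length = 100_000;
    let mut stream = table.query().execute_with_options(opts).await?;
    while let Some(batch) = stream.try_next().await? {
        // With the v2 format batches can approach max_batch_length; with the
        // legacy format they stay at the row-group size regardless of this option.
        println!("batch of {} rows", batch.num_rows());
    }
    Ok(())
}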