feat: add take_offsets and take_row_ids (#2584)

These operations have existed in Lance for a long while, and many users need to drop down to Lance for this capability. This PR adds the API and implements it using filters (e.g. `_rowid IN (...)`) so that it doesn't currently add any load to `BaseTable`. I'm not sure that is sustainable, as base table implementations may want to specialize how they handle this method. However, I figure it is a good starting point. In addition, unlike Lance, this API does not currently guarantee anything about the order of the take results. This is necessary for the fallback filter approach to work (SQL filters cannot guarantee result order).
2025-12-27 23:12:58 +00:00 · 2025-08-15 06:48:24 -07:00
parent 296205ef96
commit ed640a76d9
24 changed files with 1488 additions and 381 deletions
--- a/python/src/query.rs
+++ b/python/src/query.rs
@@ -13,10 +13,12 @@ use lancedb::index::scalar::{
    BooleanQuery, BoostQuery, FtsQuery, FullTextSearchQuery, MatchQuery, MultiMatchQuery, Occur,
    Operator, PhraseQuery,
 };
+use lancedb::query::QueryBase;
 use lancedb::query::QueryExecutionOptions;
 use lancedb::query::QueryFilter;
 use lancedb::query::{
-    ExecutableQuery, Query as LanceDbQuery, QueryBase, Select, VectorQuery as LanceDbVectorQuery,
+    ExecutableQuery, Query as LanceDbQuery, Select, TakeQuery as LanceDbTakeQuery,
+    VectorQuery as LanceDbVectorQuery,
 };
 use lancedb::table::AnyQuery;
 use pyo3::prelude::{PyAnyMethods, PyDictMethods};
@@ -488,6 +490,76 @@ impl Query {
    }
 }

+#[pyclass]
+pub struct TakeQuery {
+    inner: LanceDbTakeQuery,
+}
+
+impl TakeQuery {
+    pub fn new(query: LanceDbTakeQuery) -> Self {
+        Self { inner: query }
+    }
+}
+
+#[pymethods]
+impl TakeQuery {
+    pub fn select(&mut self, columns: Vec<(String, String)>) {
+        self.inner = self.inner.clone().select(Select::dynamic(&columns));
+    }
+
+    pub fn select_columns(&mut self, columns: Vec<String>) {
+        self.inner = self.inner.clone().select(Select::columns(&columns));
+    }
+
+    pub fn with_row_id(&mut self) {
+        self.inner = self.inner.clone().with_row_id();
+    }
+
+    #[pyo3(signature = (max_batch_length=None, timeout=None))]
+    pub fn execute(
+        self_: PyRef<'_, Self>,
+        max_batch_length: Option<u32>,
+        timeout: Option<Duration>,
+    ) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.inner.clone();
+        future_into_py(self_.py(), async move {
+            let mut opts = QueryExecutionOptions::default();
+            if let Some(max_batch_length) = max_batch_length {
+                opts.max_batch_length = max_batch_length;
+            }
+            if let Some(timeout) = timeout {
+                opts.timeout = Some(timeout);
+            }
+            let inner_stream = inner.execute_with_options(opts).await.infer_error()?;
+            Ok(RecordBatchStream::new(inner_stream))
+        })
+    }
+
+    pub fn explain_plan(self_: PyRef<'_, Self>, verbose: bool) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.inner.clone();
+        future_into_py(self_.py(), async move {
+            inner
+                .explain_plan(verbose)
+                .await
+                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
+        })
+    }
+
+    pub fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.inner.clone();
+        future_into_py(self_.py(), async move {
+            inner
+                .analyze_plan()
+                .await
+                .map_err(|e| PyRuntimeError::new_err(e.to_string()))
+        })
+    }
+
+    pub fn to_query_request(&self) -> PyQueryRequest {
+        PyQueryRequest::from(AnyQuery::Query(self.inner.clone().into_request()))
+    }
+}
+
 #[pyclass]
 #[derive(Clone)]
 pub struct FTSQuery {
--- a/python/src/table.rs
+++ b/python/src/table.rs
@@ -5,7 +5,7 @@ use std::{collections::HashMap, sync::Arc};
 use crate::{
    error::PythonErrorExt,
    index::{extract_index_params, IndexConfig},
-    query::Query,
+    query::{Query, TakeQuery},
 };
 use arrow::{
    datatypes::{DataType, Schema},
@@ -568,6 +568,20 @@ impl Table {
        Ok(Tags::new(self.inner_ref()?.clone()))
    }

+    #[pyo3(signature = (offsets))]
+    pub fn take_offsets(self_: PyRef<'_, Self>, offsets: Vec<u64>) -> PyResult<TakeQuery> {
+        Ok(TakeQuery::new(
+            self_.inner_ref()?.clone().take_offsets(offsets),
+        ))
+    }
+
+    #[pyo3(signature = (row_ids))]
+    pub fn take_row_ids(self_: PyRef<'_, Self>, row_ids: Vec<u64>) -> PyResult<TakeQuery> {
+        Ok(TakeQuery::new(
+            self_.inner_ref()?.clone().take_row_ids(row_ids),
+        ))
+    }
+
    /// Optimize the on-disk data by compacting and pruning old data, for better performance.
    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
    pub fn optimize(