mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-04 02:42:57 +00:00
feat: add take_offsets and take_row_ids (#2584)
These operations have existed in lance for a long while and many users need to drop down to lance for this capability. This PR adds the API and implements it using filters (e.g. `_rowid IN (...)`) so that in doesn't currently add any load to `BaseTable`. I'm not sure that is sustainable as base table implementations may want to specialize how they handle this method. However, I figure it is a good starting point. In addition, unlike Lance, this API does not currently guarantee anything about the order of the take results. This is necessary for the fallback filter approach to work (SQL filters cannot guarantee result order)
This commit is contained in:
@@ -287,6 +287,12 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(res2[1].id).toEqual(data2.id);
|
||||
});
|
||||
|
||||
it("should support take queries", async () => {
|
||||
await table.add([{ id: 1 }, { id: 2 }, { id: 3 }]);
|
||||
const res = await table.takeOffsets([1, 2]).toArrow();
|
||||
expect(res.getChild("id")?.toJSON()).toEqual([2, 3]);
|
||||
});
|
||||
|
||||
it("should return the table as an instance of an arrow table", async () => {
|
||||
const arrowTbl = await table.toArrow();
|
||||
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
||||
|
||||
@@ -59,6 +59,7 @@ export {
|
||||
Query,
|
||||
QueryBase,
|
||||
VectorQuery,
|
||||
TakeQuery,
|
||||
QueryExecutionOptions,
|
||||
FullTextSearchOptions,
|
||||
RecordBatchIterator,
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
RecordBatchIterator as NativeBatchIterator,
|
||||
Query as NativeQuery,
|
||||
Table as NativeTable,
|
||||
TakeQuery as NativeTakeQuery,
|
||||
VectorQuery as NativeVectorQuery,
|
||||
} from "./native";
|
||||
import { Reranker } from "./rerankers";
|
||||
@@ -50,7 +51,7 @@ export class RecordBatchIterator implements AsyncIterator<RecordBatch> {
|
||||
/* eslint-enable */
|
||||
|
||||
class RecordBatchIterable<
|
||||
NativeQueryType extends NativeQuery | NativeVectorQuery,
|
||||
NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
|
||||
> implements AsyncIterable<RecordBatch>
|
||||
{
|
||||
private inner: NativeQueryType;
|
||||
@@ -107,8 +108,9 @@ export interface FullTextSearchOptions {
|
||||
*
|
||||
* @hideconstructor
|
||||
*/
|
||||
export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
implements AsyncIterable<RecordBatch>
|
||||
export class QueryBase<
|
||||
NativeQueryType extends NativeQuery | NativeVectorQuery | NativeTakeQuery,
|
||||
> implements AsyncIterable<RecordBatch>
|
||||
{
|
||||
/**
|
||||
* @hidden
|
||||
@@ -133,56 +135,6 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
fn(this.inner);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* A filter statement to be applied to this query.
|
||||
*
|
||||
* The filter should be supplied as an SQL query string. For example:
|
||||
* @example
|
||||
* x > 10
|
||||
* y > 0 AND y < 100
|
||||
* x > 5 OR y = 'test'
|
||||
*
|
||||
* Filtering performance can often be improved by creating a scalar index
|
||||
* on the filter column(s).
|
||||
*/
|
||||
where(predicate: string): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.onlyIf(predicate));
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* A filter statement to be applied to this query.
|
||||
* @see where
|
||||
* @deprecated Use `where` instead
|
||||
*/
|
||||
filter(predicate: string): this {
|
||||
return this.where(predicate);
|
||||
}
|
||||
|
||||
fullTextSearch(
|
||||
query: string | FullTextQuery,
|
||||
options?: Partial<FullTextSearchOptions>,
|
||||
): this {
|
||||
let columns: string[] | null = null;
|
||||
if (options) {
|
||||
if (typeof options.columns === "string") {
|
||||
columns = [options.columns];
|
||||
} else if (Array.isArray(options.columns)) {
|
||||
columns = options.columns;
|
||||
}
|
||||
}
|
||||
|
||||
this.doCall((inner: NativeQueryType) => {
|
||||
if (typeof query === "string") {
|
||||
inner.fullTextSearch({
|
||||
query: query,
|
||||
columns: columns,
|
||||
});
|
||||
} else {
|
||||
inner.fullTextSearch({ query: query.inner });
|
||||
}
|
||||
});
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return only the specified columns.
|
||||
@@ -241,33 +193,6 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the maximum number of results to return.
|
||||
*
|
||||
* By default, a plain search has no limit. If this method is not
|
||||
* called then every valid row from the table will be returned.
|
||||
*/
|
||||
limit(limit: number): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.limit(limit));
|
||||
return this;
|
||||
}
|
||||
|
||||
offset(offset: number): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.offset(offset));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip searching un-indexed data. This can make search faster, but will miss
|
||||
* any data that is not yet indexed.
|
||||
*
|
||||
* Use {@link Table#optimize} to index all un-indexed data.
|
||||
*/
|
||||
fastSearch(): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.fastSearch());
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether to return the row id in the results.
|
||||
*
|
||||
@@ -403,6 +328,100 @@ export class QueryBase<NativeQueryType extends NativeQuery | NativeVectorQuery>
|
||||
}
|
||||
}
|
||||
|
||||
export class StandardQueryBase<
|
||||
NativeQueryType extends NativeQuery | NativeVectorQuery,
|
||||
>
|
||||
extends QueryBase<NativeQueryType>
|
||||
implements ExecutableQuery
|
||||
{
|
||||
constructor(inner: NativeQueryType | Promise<NativeQueryType>) {
|
||||
super(inner);
|
||||
}
|
||||
|
||||
/**
|
||||
* A filter statement to be applied to this query.
|
||||
*
|
||||
* The filter should be supplied as an SQL query string. For example:
|
||||
* @example
|
||||
* x > 10
|
||||
* y > 0 AND y < 100
|
||||
* x > 5 OR y = 'test'
|
||||
*
|
||||
* Filtering performance can often be improved by creating a scalar index
|
||||
* on the filter column(s).
|
||||
*/
|
||||
where(predicate: string): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.onlyIf(predicate));
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* A filter statement to be applied to this query.
|
||||
* @see where
|
||||
* @deprecated Use `where` instead
|
||||
*/
|
||||
filter(predicate: string): this {
|
||||
return this.where(predicate);
|
||||
}
|
||||
|
||||
fullTextSearch(
|
||||
query: string | FullTextQuery,
|
||||
options?: Partial<FullTextSearchOptions>,
|
||||
): this {
|
||||
let columns: string[] | null = null;
|
||||
if (options) {
|
||||
if (typeof options.columns === "string") {
|
||||
columns = [options.columns];
|
||||
} else if (Array.isArray(options.columns)) {
|
||||
columns = options.columns;
|
||||
}
|
||||
}
|
||||
|
||||
this.doCall((inner: NativeQueryType) => {
|
||||
if (typeof query === "string") {
|
||||
inner.fullTextSearch({
|
||||
query: query,
|
||||
columns: columns,
|
||||
});
|
||||
} else {
|
||||
inner.fullTextSearch({ query: query.inner });
|
||||
}
|
||||
});
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the maximum number of results to return.
|
||||
*
|
||||
* By default, a plain search has no limit. If this method is not
|
||||
* called then every valid row from the table will be returned.
|
||||
*/
|
||||
limit(limit: number): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.limit(limit));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the number of rows to skip before returning results.
|
||||
*
|
||||
* This is useful for pagination.
|
||||
*/
|
||||
offset(offset: number): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.offset(offset));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip searching un-indexed data. This can make search faster, but will miss
|
||||
* any data that is not yet indexed.
|
||||
*
|
||||
* Use {@link Table#optimize} to index all un-indexed data.
|
||||
*/
|
||||
fastSearch(): this {
|
||||
this.doCall((inner: NativeQueryType) => inner.fastSearch());
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An interface for a query that can be executed
|
||||
*
|
||||
@@ -419,7 +438,7 @@ export interface ExecutableQuery {}
|
||||
*
|
||||
* @hideconstructor
|
||||
*/
|
||||
export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
||||
export class VectorQuery extends StandardQueryBase<NativeVectorQuery> {
|
||||
/**
|
||||
* @hidden
|
||||
*/
|
||||
@@ -679,13 +698,24 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A query that returns a subset of the rows in the table.
|
||||
*
|
||||
* @hideconstructor
|
||||
*/
|
||||
export class TakeQuery extends QueryBase<NativeTakeQuery> {
|
||||
constructor(inner: NativeTakeQuery) {
|
||||
super(inner);
|
||||
}
|
||||
}
|
||||
|
||||
/** A builder for LanceDB queries.
|
||||
*
|
||||
* @see {@link Table#query}, {@link Table#search}
|
||||
*
|
||||
* @hideconstructor
|
||||
*/
|
||||
export class Query extends QueryBase<NativeQuery> {
|
||||
export class Query extends StandardQueryBase<NativeQuery> {
|
||||
/**
|
||||
* @hidden
|
||||
*/
|
||||
|
||||
@@ -35,6 +35,7 @@ import {
|
||||
import {
|
||||
FullTextQuery,
|
||||
Query,
|
||||
TakeQuery,
|
||||
VectorQuery,
|
||||
instanceOfFullTextQuery,
|
||||
} from "./query";
|
||||
@@ -336,6 +337,20 @@ export abstract class Table {
|
||||
*/
|
||||
abstract query(): Query;
|
||||
|
||||
/**
|
||||
* Create a query that returns a subset of the rows in the table.
|
||||
* @param offsets The offsets of the rows to return.
|
||||
* @returns A builder that can be used to parameterize the query.
|
||||
*/
|
||||
abstract takeOffsets(offsets: number[]): TakeQuery;
|
||||
|
||||
/**
|
||||
* Create a query that returns a subset of the rows in the table.
|
||||
* @param rowIds The row ids of the rows to return.
|
||||
* @returns A builder that can be used to parameterize the query.
|
||||
*/
|
||||
abstract takeRowIds(rowIds: number[]): TakeQuery;
|
||||
|
||||
/**
|
||||
* Create a search query to find the nearest neighbors
|
||||
* of the given query
|
||||
@@ -665,6 +680,14 @@ export class LocalTable extends Table {
|
||||
await this.inner.waitForIndex(indexNames, timeoutSeconds);
|
||||
}
|
||||
|
||||
takeOffsets(offsets: number[]): TakeQuery {
|
||||
return new TakeQuery(this.inner.takeOffsets(offsets));
|
||||
}
|
||||
|
||||
takeRowIds(rowIds: number[]): TakeQuery {
|
||||
return new TakeQuery(this.inner.takeRowIds(rowIds));
|
||||
}
|
||||
|
||||
query(): Query {
|
||||
return new Query(this.inner);
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ use lancedb::query::Query as LanceDbQuery;
|
||||
use lancedb::query::QueryBase;
|
||||
use lancedb::query::QueryExecutionOptions;
|
||||
use lancedb::query::Select;
|
||||
use lancedb::query::TakeQuery as LanceDbTakeQuery;
|
||||
use lancedb::query::VectorQuery as LanceDbVectorQuery;
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi_derive::napi;
|
||||
@@ -319,6 +320,79 @@ impl VectorQuery {
|
||||
}
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub struct TakeQuery {
|
||||
inner: LanceDbTakeQuery,
|
||||
}
|
||||
|
||||
#[napi]
|
||||
impl TakeQuery {
|
||||
pub fn new(query: LanceDbTakeQuery) -> Self {
|
||||
Self { inner: query }
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn select(&mut self, columns: Vec<(String, String)>) {
|
||||
self.inner = self.inner.clone().select(Select::dynamic(&columns));
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn select_columns(&mut self, columns: Vec<String>) {
|
||||
self.inner = self.inner.clone().select(Select::columns(&columns));
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn with_row_id(&mut self) {
|
||||
self.inner = self.inner.clone().with_row_id();
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(
|
||||
&self,
|
||||
max_batch_length: Option<u32>,
|
||||
timeout_ms: Option<u32>,
|
||||
) -> napi::Result<RecordBatchIterator> {
|
||||
let mut execution_opts = QueryExecutionOptions::default();
|
||||
if let Some(max_batch_length) = max_batch_length {
|
||||
execution_opts.max_batch_length = max_batch_length;
|
||||
}
|
||||
if let Some(timeout_ms) = timeout_ms {
|
||||
execution_opts.timeout = Some(std::time::Duration::from_millis(timeout_ms as u64))
|
||||
}
|
||||
let inner_stream = self
|
||||
.inner
|
||||
.execute_with_options(execution_opts)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to execute query stream: {}",
|
||||
convert_error(&e)
|
||||
))
|
||||
})?;
|
||||
Ok(RecordBatchIterator::new(inner_stream))
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub async fn explain_plan(&self, verbose: bool) -> napi::Result<String> {
|
||||
self.inner.explain_plan(verbose).await.map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to retrieve the query plan: {}",
|
||||
convert_error(&e)
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn analyze_plan(&self) -> napi::Result<String> {
|
||||
self.inner.analyze_plan().await.map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to execute analyze plan: {}",
|
||||
convert_error(&e)
|
||||
))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[napi]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct JsFullTextQuery {
|
||||
|
||||
@@ -15,7 +15,7 @@ use napi_derive::napi;
|
||||
use crate::error::NapiErrorExt;
|
||||
use crate::index::Index;
|
||||
use crate::merge::NativeMergeInsertBuilder;
|
||||
use crate::query::{Query, VectorQuery};
|
||||
use crate::query::{Query, TakeQuery, VectorQuery};
|
||||
|
||||
#[napi]
|
||||
pub struct Table {
|
||||
@@ -187,6 +187,44 @@ impl Table {
|
||||
Ok(Query::new(self.inner_ref()?.query()))
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub fn take_offsets(&self, offsets: Vec<i64>) -> napi::Result<TakeQuery> {
|
||||
Ok(TakeQuery::new(
|
||||
self.inner_ref()?.take_offsets(
|
||||
offsets
|
||||
.into_iter()
|
||||
.map(|o| {
|
||||
u64::try_from(o).map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to convert offset to u64: {}",
|
||||
e
|
||||
))
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?,
|
||||
),
|
||||
))
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub fn take_row_ids(&self, row_ids: Vec<i64>) -> napi::Result<TakeQuery> {
|
||||
Ok(TakeQuery::new(
|
||||
self.inner_ref()?.take_row_ids(
|
||||
row_ids
|
||||
.into_iter()
|
||||
.map(|o| {
|
||||
u64::try_from(o).map_err(|e| {
|
||||
napi::Error::from_reason(format!(
|
||||
"Failed to convert row id to u64: {}",
|
||||
e
|
||||
))
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?,
|
||||
),
|
||||
))
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub fn vector_search(&self, vector: Float32Array) -> napi::Result<VectorQuery> {
|
||||
self.query()?.nearest_to(vector)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
"intentionallyNotExported": [
|
||||
"lancedb/native.d.ts:Query",
|
||||
"lancedb/native.d.ts:VectorQuery",
|
||||
"lancedb/native.d.ts:TakeQuery",
|
||||
"lancedb/native.d.ts:RecordBatchIterator",
|
||||
"lancedb/native.d.ts:NativeMergeInsertBuilder"
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user