mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-05 19:32:56 +00:00
feat: fast_search in Python and Node (#1623)
Sometimes it is acceptable to users to search only indexed data and skip any new un-indexed data. For example, if un-indexed data will be indexed shortly and they don't mind the delay. In these cases, we can save a lot of CPU time in search and provide better latency. Users can activate this on queries using `fast_search()`.
This commit is contained in:
@@ -1315,6 +1315,20 @@ class AsyncQueryBase(object):
|
||||
self._inner.offset(offset)
|
||||
return self
|
||||
|
||||
def fast_search(self) -> AsyncQuery:
    """
    Skip searching un-indexed data.

    This can make queries faster, but will miss any data that has not been
    indexed.

    !!! tip
        You can add new data into an existing index by calling
        [AsyncTable.optimize][lancedb.table.AsyncTable.optimize].

    Returns
    -------
    AsyncQuery
        This same query object (``self``), so that calls can be chained.
    """
    # Forward the setting to the wrapped inner query, then hand back
    # ``self`` to support the fluent builder style.
    self._inner.fast_search()
    return self
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
) -> AsyncRecordBatchReader:
|
||||
|
||||
@@ -17,6 +17,7 @@ from typing import Optional
|
||||
|
||||
import lance
|
||||
import lancedb
|
||||
from lancedb.index import IvfPq
|
||||
import numpy as np
|
||||
import pandas.testing as tm
|
||||
import pyarrow as pa
|
||||
@@ -358,6 +359,25 @@ async def test_query_to_pandas_async(table_async: AsyncTable):
|
||||
assert df.shape == (0, 4)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_fast_search_async(tmp_path):
    """Check that ``fast_search()`` removes the un-indexed scan from the plan.

    The table is indexed first and then receives a second batch of rows, so
    it holds both indexed and un-indexed data when the plans are inspected.
    """
    db = await lancedb.connect_async(tmp_path)
    # 256 random 32-dimensional vectors; ``.storage`` exposes the array
    # underlying the fixed-shape tensor array.
    vectors = pa.FixedShapeTensorArray.from_numpy_ndarray(
        np.random.rand(256, 32)
    ).storage
    table = await db.create_table("test", pa.table({"vector": vectors}))
    await table.create_index(
        "vector", config=IvfPq(num_partitions=1, num_sub_vectors=1)
    )
    # Rows added after the index was built are not covered by it yet.
    await table.add(pa.table({"vector": vectors}))

    q = [1.0] * 32
    # A regular vector query must also scan the un-indexed rows.
    plan = await table.query().nearest_to(q).explain_plan(True)
    assert "LanceScan" in plan
    # With fast_search() the scan step must be absent from the plan.
    plan = await table.query().nearest_to(q).fast_search().explain_plan(True)
    assert "LanceScan" not in plan
|
||||
|
||||
|
||||
def test_explain_plan(table):
|
||||
q = LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
plan = q.explain_plan(verbose=True)
|
||||
|
||||
@@ -68,6 +68,10 @@ impl Query {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner = self.inner.clone().fast_search();
|
||||
}
|
||||
|
||||
pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<VectorQuery> {
|
||||
let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
|
||||
let array = make_array(data);
|
||||
@@ -146,6 +150,10 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner = self.inner.clone().fast_search();
|
||||
}
|
||||
|
||||
pub fn column(&mut self, column: String) {
|
||||
self.inner = self.inner.clone().column(&column);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user