mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-09 05:12:58 +00:00
feat(python): support post filter on FTS (#1783)
This commit is contained in:
@@ -1329,6 +1329,27 @@ class AsyncQueryBase(object):
|
||||
self._inner.fast_search()
|
||||
return self
|
||||
|
||||
def postfilter(self) -> AsyncQuery:
|
||||
"""
|
||||
If this is called then filtering will happen after the search instead of
|
||||
before.
|
||||
By default filtering will be performed before the search. This is how
|
||||
filtering is typically understood to work. This prefilter step does add some
|
||||
additional latency. Creating a scalar index on the filter column(s) can
|
||||
often improve this latency. However, sometimes a filter is too complex or
|
||||
scalar indices cannot be applied to the column. In these cases postfiltering
|
||||
can be used instead of prefiltering to improve latency.
|
||||
Post filtering applies the filter to the results of the search. This
|
||||
means we only run the filter on a much smaller set of data. However, it can
|
||||
cause the query to return fewer than `limit` results (or even no results) if
|
||||
none of the nearest results match the filter.
|
||||
Post filtering happens during the "refine stage" (described in more detail in
|
||||
`AsyncVectorQuery.refine_factor`). This means that setting a higher refine
|
||||
factor can often help restore some of the results lost by post filtering.
|
||||
"""
|
||||
self._inner.postfilter()
|
||||
return self
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
) -> AsyncRecordBatchReader:
|
||||
@@ -1632,30 +1653,6 @@ class AsyncVectorQuery(AsyncQueryBase):
|
||||
self._inner.distance_type(distance_type)
|
||||
return self
|
||||
|
||||
def postfilter(self) -> AsyncVectorQuery:
|
||||
"""
|
||||
If this is called then filtering will happen after the vector search instead of
|
||||
before.
|
||||
|
||||
By default filtering will be performed before the vector search. This is how
|
||||
filtering is typically understood to work. This prefilter step does add some
|
||||
additional latency. Creating a scalar index on the filter column(s) can
|
||||
often improve this latency. However, sometimes a filter is too complex or
|
||||
scalar indices cannot be applied to the column. In these cases postfiltering
|
||||
can be used instead of prefiltering to improve latency.
|
||||
|
||||
Post filtering applies the filter to the results of the vector search. This
|
||||
means we only run the filter on a much smaller set of data. However, it can
|
||||
cause the query to return fewer than `limit` results (or even no results) if
|
||||
none of the nearest results match the filter.
|
||||
|
||||
Post filtering happens during the "refine stage" (described in more detail in
|
||||
`AsyncVectorQuery.refine_factor`). This means that setting a higher refine
|
||||
factor can often help restore some of the results lost by post filtering.
|
||||
"""
|
||||
self._inner.postfilter()
|
||||
return self
|
||||
|
||||
def bypass_vector_index(self) -> AsyncVectorQuery:
|
||||
"""
|
||||
If this is called then any vector index is skipped
|
||||
|
||||
@@ -235,6 +235,29 @@ async def test_search_fts_async(async_table):
|
||||
results = await async_table.query().nearest_to_text("puppy").limit(5).to_list()
|
||||
assert len(results) == 5
|
||||
|
||||
expected_count = await async_table.count_rows(
|
||||
"count > 5000 and contains(text, 'puppy')"
|
||||
)
|
||||
expected_count = min(expected_count, 10)
|
||||
|
||||
limited_results_pre_filter = await (
|
||||
async_table.query()
|
||||
.nearest_to_text("puppy")
|
||||
.where("count > 5000")
|
||||
.limit(10)
|
||||
.to_list()
|
||||
)
|
||||
assert len(limited_results_pre_filter) == expected_count
|
||||
limited_results_post_filter = await (
|
||||
async_table.query()
|
||||
.nearest_to_text("puppy")
|
||||
.where("count > 5000")
|
||||
.limit(10)
|
||||
.postfilter()
|
||||
.to_list()
|
||||
)
|
||||
assert len(limited_results_post_filter) <= expected_count
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_search_fts_specify_column_async(async_table):
|
||||
|
||||
@@ -72,6 +72,10 @@ impl Query {
|
||||
self.inner = self.inner.clone().fast_search();
|
||||
}
|
||||
|
||||
pub fn postfilter(&mut self) {
|
||||
self.inner = self.inner.clone().postfilter();
|
||||
}
|
||||
|
||||
pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<VectorQuery> {
|
||||
let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
|
||||
let array = make_array(data);
|
||||
|
||||
Reference in New Issue
Block a user