mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-27 15:12:53 +00:00
feat!: revert query limit to be unbounded for scans (#2151)
In earlier PRs (#1886, #1191) we made the default limit 10 regardless of the query type. This was confusing for users and in many cases a breaking change. Users had queries that used to return all results but now returned only the first 10, causing silent bugs. Part of the cause was inconsistency: the Python sync API seems to have always had a limit of 10, while newer APIs (Python async and Node.js) didn't. This PR sets the default limit only for searches (vector search, FTS), while letting scans (even with filters) be unbounded. It does this consistently for all SDKs. Fixes #1983 Fixes #1852 Fixes #2141
This commit is contained in:
@@ -110,7 +110,7 @@ class Query(pydantic.BaseModel):
|
||||
full_text_query: Optional[Union[str, dict]] = None
|
||||
|
||||
# top k results to return
|
||||
k: int
|
||||
k: Optional[int] = None
|
||||
|
||||
# # metrics
|
||||
metric: str = "L2"
|
||||
@@ -257,7 +257,7 @@ class LanceQueryBuilder(ABC):
|
||||
|
||||
def __init__(self, table: "Table"):
|
||||
self._table = table
|
||||
self._limit = 10
|
||||
self._limit = None
|
||||
self._offset = 0
|
||||
self._columns = None
|
||||
self._where = None
|
||||
@@ -370,8 +370,7 @@ class LanceQueryBuilder(ABC):
|
||||
The maximum number of results to return.
|
||||
The default query limit is 10 results.
|
||||
For ANN/KNN queries, you must specify a limit.
|
||||
Entering 0, a negative number, or None will reset
|
||||
the limit to the default value of 10.
|
||||
For plain searches, all records are returned if no limit is set.
|
||||
*WARNING* if you have a large dataset, setting
|
||||
the limit to a large number, e.g. the table size,
|
||||
can potentially result in reading a
|
||||
@@ -595,6 +594,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
fast_search: bool = False,
|
||||
):
|
||||
super().__init__(table)
|
||||
if self._limit is None:
|
||||
self._limit = 10
|
||||
self._query = query
|
||||
self._distance_type = "L2"
|
||||
self._nprobes = 20
|
||||
@@ -888,6 +889,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
fts_columns: Union[str, List[str]] = [],
|
||||
):
|
||||
super().__init__(table)
|
||||
if self._limit is None:
|
||||
self._limit = 10
|
||||
self._query = query
|
||||
self._phrase_query = False
|
||||
self.ordering_field_name = ordering_field_name
|
||||
@@ -1055,7 +1058,7 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||
query = Query(
|
||||
columns=self._columns,
|
||||
filter=self._where,
|
||||
k=self._limit or 10,
|
||||
k=self._limit,
|
||||
with_row_id=self._with_row_id,
|
||||
vector=[],
|
||||
# not actually respected in remote query
|
||||
|
||||
@@ -3195,7 +3195,9 @@ class AsyncTable:
|
||||
# The sync remote table calls into this method, so we need to map the
|
||||
# query to the async version of the query and run that here. This is only
|
||||
# used for that code path right now.
|
||||
async_query = self.query().limit(query.k)
|
||||
async_query = self.query()
|
||||
if query.k is not None:
|
||||
async_query = async_query.limit(query.k)
|
||||
if query.offset > 0:
|
||||
async_query = async_query.offset(query.offset)
|
||||
if query.columns:
|
||||
|
||||
@@ -174,6 +174,10 @@ def test_search_fts(table, use_tantivy):
|
||||
assert len(results) == 5
|
||||
assert len(results[0]) == 3 # id, text, _score
|
||||
|
||||
# Default limit of 10
|
||||
results = table.search("puppy").select(["id", "text"]).to_list()
|
||||
assert len(results) == 10
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fts_select_async(async_table):
|
||||
|
||||
@@ -1025,13 +1025,13 @@ def test_empty_query(mem_db: DBConnection):
|
||||
|
||||
table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
|
||||
df = table.search().select(["id"]).to_pandas()
|
||||
assert len(df) == 10
|
||||
assert len(df) == 100
|
||||
# None is the same as default
|
||||
df = table.search().select(["id"]).limit(None).to_pandas()
|
||||
assert len(df) == 10
|
||||
assert len(df) == 100
|
||||
# invalid limit is the same as None, which is the same as default
|
||||
df = table.search().select(["id"]).limit(-1).to_pandas()
|
||||
assert len(df) == 10
|
||||
assert len(df) == 100
|
||||
# valid limit should work
|
||||
df = table.search().select(["id"]).limit(42).to_pandas()
|
||||
assert len(df) == 42
|
||||
|
||||
Reference in New Issue
Block a user