feat!: revert query limit to be unbounded for scans (#2151)

In earlier PRs (#1886, #1191) we made the default limit 10 regardless of
the query type. This was confusing for users and in many cases a
breaking change. Users would have queries that used to return all
results, but instead only returned the first 10, causing silent bugs.

Part of the cause was consistency: the Python sync API seems to have
always had a limit of 10, while newer APIs (Python async and Nodejs)
didn't.

This PR sets the default limit only for searches (vector search, FTS),
while letting scans (even with filters) be unbounded. It does this
consistently for all SDKs.

Fixes #1983
Fixes #1852
Fixes #2141
This commit is contained in:
Will Jones
2025-02-26 10:32:14 -08:00
committed by GitHub
parent 769d483e50
commit 5b12a47119
7 changed files with 78 additions and 15 deletions

View File

@@ -110,7 +110,7 @@ class Query(pydantic.BaseModel):
full_text_query: Optional[Union[str, dict]] = None
# top k results to return
k: int
k: Optional[int] = None
# # metrics
metric: str = "L2"
@@ -257,7 +257,7 @@ class LanceQueryBuilder(ABC):
def __init__(self, table: "Table"):
self._table = table
self._limit = 10
self._limit = None
self._offset = 0
self._columns = None
self._where = None
@@ -370,8 +370,7 @@ class LanceQueryBuilder(ABC):
The maximum number of results to return.
The default query limit is 10 results.
For ANN/KNN queries, you must specify a limit.
Entering 0, a negative number, or None will reset
the limit to the default value of 10.
For plain searches, all records are returned if limit not set.
*WARNING* if you have a large dataset, setting
the limit to a large number, e.g. the table size,
can potentially result in reading a
@@ -595,6 +594,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
fast_search: bool = False,
):
super().__init__(table)
if self._limit is None:
self._limit = 10
self._query = query
self._distance_type = "L2"
self._nprobes = 20
@@ -888,6 +889,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
fts_columns: Union[str, List[str]] = [],
):
super().__init__(table)
if self._limit is None:
self._limit = 10
self._query = query
self._phrase_query = False
self.ordering_field_name = ordering_field_name
@@ -1055,7 +1058,7 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
query = Query(
columns=self._columns,
filter=self._where,
k=self._limit or 10,
k=self._limit,
with_row_id=self._with_row_id,
vector=[],
# not actually respected in remote query

View File

@@ -3195,7 +3195,9 @@ class AsyncTable:
# The sync remote table calls into this method, so we need to map the
# query to the async version of the query and run that here. This is only
# used for that code path right now.
async_query = self.query().limit(query.k)
async_query = self.query()
if query.k is not None:
async_query = async_query.limit(query.k)
if query.offset > 0:
async_query = async_query.offset(query.offset)
if query.columns:

View File

@@ -174,6 +174,10 @@ def test_search_fts(table, use_tantivy):
assert len(results) == 5
assert len(results[0]) == 3 # id, text, _score
# Default limit of 10
results = table.search("puppy").select(["id", "text"]).to_list()
assert len(results) == 10
@pytest.mark.asyncio
async def test_fts_select_async(async_table):

View File

@@ -1025,13 +1025,13 @@ def test_empty_query(mem_db: DBConnection):
table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
df = table.search().select(["id"]).to_pandas()
assert len(df) == 10
assert len(df) == 100
# None is the same as default
df = table.search().select(["id"]).limit(None).to_pandas()
assert len(df) == 10
assert len(df) == 100
# invalid limist is the same as None, wihch is the same as default
df = table.search().select(["id"]).limit(-1).to_pandas()
assert len(df) == 10
assert len(df) == 100
# valid limit should work
df = table.search().select(["id"]).limit(42).to_pandas()
assert len(df) == 42