feat!: revert query limit to be unbounded for scans (#2151)

In earlier PRs (#1886, #1191) we made the default limit 10 regardless of the query type. This was confusing for users and, in many cases, a breaking change: queries that used to return all results now returned only the first 10, causing silent bugs. Part of the motivation for that earlier change was consistency: the Python sync API seems to have always had a limit of 10, while the newer APIs (Python async and Node.js) didn't. This PR applies the default limit only to searches (vector search, FTS), while letting scans (even with filters) be unbounded. It does this consistently across all SDKs. Fixes #1983, fixes #1852, fixes #2141.
2025-12-27 15:12:53 +00:00 · 2025-02-26 10:32:14 -08:00
parent 769d483e50
commit 5b12a47119
7 changed files with 78 additions and 15 deletions
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -110,7 +110,7 @@ class Query(pydantic.BaseModel):
    full_text_query: Optional[Union[str, dict]] = None

    # top k results to return
-    k: int
+    k: Optional[int] = None

    # # metrics
    metric: str = "L2"
@@ -257,7 +257,7 @@ class LanceQueryBuilder(ABC):

    def __init__(self, table: "Table"):
        self._table = table
-        self._limit = 10
+        self._limit = None
        self._offset = 0
        self._columns = None
        self._where = None
@@ -370,8 +370,7 @@ class LanceQueryBuilder(ABC):
            The maximum number of results to return.
            The default query limit is 10 results.
            For ANN/KNN queries, you must specify a limit.
-            Entering 0, a negative number, or None will reset
-            the limit to the default value of 10.
+            For plain searches, all records are returned if limit not set.
            *WARNING* if you have a large dataset, setting
            the limit to a large number, e.g. the table size,
            can potentially result in reading a
@@ -595,6 +594,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
        fast_search: bool = False,
    ):
        super().__init__(table)
+        if self._limit is None:
+            self._limit = 10
        self._query = query
        self._distance_type = "L2"
        self._nprobes = 20
@@ -888,6 +889,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
        fts_columns: Union[str, List[str]] = [],
    ):
        super().__init__(table)
+        if self._limit is None:
+            self._limit = 10
        self._query = query
        self._phrase_query = False
        self.ordering_field_name = ordering_field_name
@@ -1055,7 +1058,7 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
        query = Query(
            columns=self._columns,
            filter=self._where,
-            k=self._limit or 10,
+            k=self._limit,
            with_row_id=self._with_row_id,
            vector=[],
            # not actually respected in remote query
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -3195,7 +3195,9 @@ class AsyncTable:
        # The sync remote table calls into this method, so we need to map the
        # query to the async version of the query and run that here. This is only
        # used for that code path right now.
-        async_query = self.query().limit(query.k)
+        async_query = self.query()
+        if query.k is not None:
+            async_query = async_query.limit(query.k)
        if query.offset > 0:
            async_query = async_query.offset(query.offset)
        if query.columns:
--- a/python/python/tests/test_fts.py
+++ b/python/python/tests/test_fts.py
@@ -174,6 +174,10 @@ def test_search_fts(table, use_tantivy):
    assert len(results) == 5
    assert len(results[0]) == 3  # id, text, _score

+    # Default limit of 10
+    results = table.search("puppy").select(["id", "text"]).to_list()
+    assert len(results) == 10
+

@pytest.mark.asyncio
 async def test_fts_select_async(async_table):
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -1025,13 +1025,13 @@ def test_empty_query(mem_db: DBConnection):

    table = mem_db.create_table("my_table2", data=[{"id": i} for i in range(100)])
    df = table.search().select(["id"]).to_pandas()
-    assert len(df) == 10
+    assert len(df) == 100
    # None is the same as default
    df = table.search().select(["id"]).limit(None).to_pandas()
-    assert len(df) == 10
+    assert len(df) == 100
    # invalid limist is the same as None, wihch is the same as default
    df = table.search().select(["id"]).limit(-1).to_pandas()
-    assert len(df) == 10
+    assert len(df) == 100
    # valid limit should work
    df = table.search().select(["id"]).limit(42).to_pandas()
    assert len(df) == 42