feat: add to_list and to_pandas APIs (#556)

Add `to_list` to return query results as a list of Python dicts (so we're not too pandas-centric). Closes #555. Add the `to_pandas` API and a deprecation warning on `to_df`. Closes #545. Co-authored-by: Chang She <chang@lancedb.com>
2026-01-07 12:22:59 +00:00 · 2023-10-11 12:18:55 -07:00
parent a737bbff19
commit 8469d010f8
26 changed files with 125 additions and 71 deletions
--- a/docs/src/ann_indexes.md
+++ b/docs/src/ann_indexes.md
@@ -97,7 +97,7 @@ There are a couple of parameters that can be used to fine-tune the search:
         .limit(2) \
         .nprobes(20) \
         .refine_factor(10) \
-         .to_df()
+         .to_pandas()
     ```
     ```
                                              vector       item       _distance
@@ -124,7 +124,7 @@ You can further filter the elements returned by a search using a where clause.

 === "Python"
     ```python
-     tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_df()
+     tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas()
     ```

 === "Javascript"
@@ -141,7 +141,7 @@ You can select the columns returned by the query using a select clause.

 === "Python"
     ```python
-     tbl.search(np.random.random((1536))).select(["vector"]).to_df()
+     tbl.search(np.random.random((1536))).select(["vector"]).to_pandas()
     ```
     ```
        vector                                             _distance
--- a/docs/src/basic.md
+++ b/docs/src/basic.md
@@ -146,7 +146,7 @@ Once you've embedded the query, you can find its nearest neighbors using the fol

 === "Python"
      ```python
-      tbl.search([100, 100]).limit(2).to_df()
+      tbl.search([100, 100]).limit(2).to_pandas()
      ```

      This returns a pandas DataFrame with the results.
--- a/docs/src/embedding.md
+++ b/docs/src/embedding.md
@@ -118,7 +118,7 @@ belong in the same latent space and your results will be nonsensical.
     ```python
     query = "What's the best pizza topping?"
     query_vector = embed_func([query])[0]
-     tbl.search(query_vector).limit(10).to_df()
+     tbl.search(query_vector).limit(10).to_pandas()
     ```

     The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
--- a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md
+++ b/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md
@@ -80,14 +80,14 @@ def handler(event, context):
    # Shape of SIFT is (128,1M), d=float32
    query_vector = np.array(event['query_vector'], dtype=np.float32)

-    rs = table.search(query_vector).limit(2).to_df()
+    rs = table.search(query_vector).limit(2).to_list()

    return {
        "statusCode": status_code,
        "headers": {
            "Content-Type": "application/json"
        },
-        "body": rs.to_json()
+        "body": json.dumps(rs)
    }
 ``` 

--- a/docs/src/fts.md
+++ b/docs/src/fts.md
@@ -43,7 +43,13 @@ table.create_fts_index("text")
 To search:

 ```python
-df = table.search("puppy").limit(10).select(["text"]).to_df()
+table.search("puppy").limit(10).select(["text"]).to_list()
+```
+
+This returns a list of dictionaries:
+
+```python
+[{'text': 'Frodo was a happy puppy', 'score': 0.6931471824645996}]
 ```

 LanceDB automatically looks for an FTS index if the input is str.
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -36,7 +36,7 @@ LanceDB's core is written in Rust 🦀 and is built using <a href="https://githu
      table = db.create_table("my_table",
                              data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
                                    {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
-      result = table.search([100, 100]).limit(2).to_df()
+      result = table.search([100, 100]).limit(2).to_list()
      ```

 === "Javascript"
--- a/docs/src/notebooks/multimodal_search.ipynb
+++ b/docs/src/notebooks/multimodal_search.ipynb
@@ -19,11 +19,11 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
@@ -39,6 +39,7 @@
   "outputs": [],
   "source": [
    "import io\n",
+    "\n",
    "import PIL\n",
    "import duckdb\n",
    "import lancedb"
@@ -158,18 +159,18 @@
    "        \"db = lancedb.connect('~/datasets/demo')\\n\"\n",
    "        \"tbl = db.open_table('diffusiondb')\\n\\n\"\n",
    "        f\"embedding = embed_func('{query}')\\n\"\n",
-    "        \"tbl.search(embedding).limit(9).to_df()\"\n",
+    "        \"tbl.search(embedding).limit(9).to_pandas()\"\n",
    "    )\n",
-    "    return (_extract(tbl.search(emb).limit(9).to_df()), code)\n",
+    "    return (_extract(tbl.search(emb).limit(9).to_pandas()), code)\n",
    "\n",
    "def find_image_keywords(query):\n",
    "    code = (\n",
    "        \"import lancedb\\n\"\n",
    "        \"db = lancedb.connect('~/datasets/demo')\\n\"\n",
    "        \"tbl = db.open_table('diffusiondb')\\n\\n\"\n",
-    "        f\"tbl.search('{query}').limit(9).to_df()\"\n",
+    "        f\"tbl.search('{query}').limit(9).to_pandas()\"\n",
    "    )\n",
-    "    return (_extract(tbl.search(query).limit(9).to_df()), code)\n",
+    "    return (_extract(tbl.search(query).limit(9).to_pandas()), code)\n",
    "\n",
    "def find_image_sql(query):\n",
    "    code = (\n",
--- a/docs/src/notebooks/youtube_transcript_search.ipynb
+++ b/docs/src/notebooks/youtube_transcript_search.ipynb
@@ -27,11 +27,11 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
@@ -184,7 +184,7 @@
    "df = (contextualize(data.to_pandas())\n",
    "      .groupby(\"title\").text_col(\"text\")\n",
    "      .window(20).stride(4)\n",
-    "      .to_df())\n",
+    "      .to_pandas())\n",
    "df.head(1)"
   ]
  },
@@ -603,7 +603,7 @@
   "outputs": [],
   "source": [
    "# Use LanceDB to get top 3 most relevant context\n",
-    "context = tbl.search(emb).limit(3).to_df()"
+    "context = tbl.search(emb).limit(3).to_pandas()"
   ]
  },
  {
--- a/docs/src/python/arrow.md
+++ b/docs/src/python/arrow.md
@@ -74,7 +74,7 @@ table = db.open_table("pd_table")

 query_vector = [100, 100]
 # Pandas DataFrame
-df = table.search(query_vector).limit(1).to_df()
+df = table.search(query_vector).limit(1).to_pandas()
 print(df)
 ```

@@ -89,12 +89,12 @@ If you have more complex criteria, you can always apply the filter to the result
 ```python

 # Apply the filter via LanceDB
-results = table.search([100, 100]).where("price < 15").to_df()
+results = table.search([100, 100]).where("price < 15").to_pandas()
 assert len(results) == 1
 assert results["item"].iloc[0] == "foo"

 # Apply the filter via Pandas
-df = results = table.search([100, 100]).to_df()
+df = results = table.search([100, 100]).to_pandas()
 results = df[df.price < 15]
 assert len(results) == 1
 assert results["item"].iloc[0] == "foo"
--- a/docs/src/search.md
+++ b/docs/src/search.md
@@ -67,7 +67,7 @@ await db_setup.createTable('my_vectors', data)

    df = tbl.search(np.random.random((1536))) \
        .limit(10) \
-        .to_df()
+        .to_list()
    ```

 === "JavaScript"
@@ -92,7 +92,7 @@ as well.
    df = tbl.search(np.random.random((1536))) \
        .metric("cosine") \
        .limit(10) \
-        .to_df()
+        .to_list()
    ```