use python and arrow

2026-05-26 08:20:39 +00:00 · 2023-04-19 14:15:18 -07:00
parent 3ba7fa15a4
commit b3fdabdf45
3 changed files with 145 additions and 0 deletions
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -3,6 +3,8 @@ docs_dir: src

 theme:
  name: "material"
+  features:
+    - content.code.copy

 plugins:
 - search
@@ -11,4 +13,14 @@ plugins:

 nav:
 - Home: index.md
+- Integrations: integrations.md
 - Python API: python.md
+
+markdown_extensions:
+- pymdownx.highlight:
+    anchor_linenums: true
+    line_spans: __span
+    pygments_lang_class: true
+- pymdownx.inlinehilite
+- pymdownx.snippets
+- pymdownx.superfences
--- a/docs/src/integrations.md
+++ b/docs/src/integrations.md
@@ -0,0 +1,108 @@
+# Integrations
+
+Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, Polars, and DuckDB.
+
+## Pandas and PyArrow
+
+``` sh
+pip install pandas lancedb
+```
+
+First, we need to connect to a LanceDB instance. This instance could be a local file directory.
+
+``` py
+
+import lancedb
+
+db = lancedb.connect("/tmp/lancedb")
+```
+
+And write `Pandas DataFrame` to LanceDB directly.
+
+```py
+import pandas as pd
+
+data = pd.DataFrame({
+    "vector": [[3.1, 4.1], [5.9, 26.5]],
+    "item": ["foo", "bar"],
+    "price": [10.0, 20.0]
+})
+table = db.create_table("pd_table", data=data)
+
+# Optionally, create a IVF_PQ index
+table.create_index(num_partitions=256, num_sub_vectors=96)
+```
+
+We can now perform similarity searches via `LanceDB`.
+
+```py
+# Open the table previously created.
+table = db.open_table("pd_table")
+
+query_vector = [100, 100]
+# Pandas DataFrame
+df = table.search(query_vector).limit(1).to_df()
+print(df)
+```
+
+```
+    vector     item  price        score
+0  [5.9, 26.5]  bar   20.0  14257.05957
+```
+
+We can apply the filter via `LancdDB`, or apply the filter on `pd.DataFrame` later.
+
+```python
+
+# Apply the filter via LanceDB
+results = table.search([100, 100]).where("price < 15").to_df()
+assert len(results) == 1
+assert results["item"].iloc[0] == "foo"
+
+# Apply the filter via Pandas
+df = results = table.search([100, 100]).to_df()
+results = df[df.price < 15]
+assert len(results) == 1
+assert results["item"].iloc[0] == "foo"
+```
+
+# DuckDB
+
+Let us start with installing `duckdb` and `lancedb`.
+
+```shell
+pip install duckdb lancedb
+```
+
+We will re-use the dataset created previously
+
+```python
+import lancedb
+
+db = lancedb.connect("/tmp/lancedb")
+table = db.open_table("pd_table")
+arrow_table = table.to_arrow()
+```
+
+Now we can use `DuckDB` to query the `arrow_table`:
+
+```python
+In [15]: duckdb.query("SELECT * FROM t")
+Out[15]:
+┌─────────────┬─────────┬────────┐
+│   vector    │  item   │ price  │
+│   float[]   │ varchar │ double │
+├─────────────┼─────────┼────────┤
+│ [3.1, 4.1]  │ foo     │   10.0 │
+│ [5.9, 26.5] │ bar     │   20.0 │
+└─────────────┴─────────┴────────┘
+
+In [16]: duckdb.query("SELECT mean(price) FROM t")
+Out[16]:
+┌─────────────┐
+│ mean(price) │
+│   double    │
+├─────────────┤
+│        15.0 │
+└─────────────┘
+```
--- a/python/tests/test_db.py
+++ b/python/tests/test_db.py
@@ -12,6 +12,7 @@
 #  limitations under the License.

 import lancedb
+import pandas as pd


 def test_basic(tmp_path):
@@ -40,3 +41,27 @@ def test_basic(tmp_path):
    assert len(db) == 1

    assert db.open_table("test").name == db["test"].name
+
+
+def test_ingest_pd(tmp_path):
+    db = lancedb.connect(tmp_path)
+
+    assert db.uri == str(tmp_path)
+    assert db.table_names() == []
+
+    data = pd.DataFrame({"vector": [[3.1, 4.1], [5.9, 26.5]], "item": ["foo", "bar"], "price": [10.0, 20.0]})
+    table = db.create_table("test", data=data)
+    rs = table.search([100, 100]).limit(1).to_df()
+    print(rs)
+    assert len(rs) == 1
+    assert rs["item"].iloc[0] == "bar"
+
+    rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
+    assert len(rs) == 1
+    assert rs["item"].iloc[0] == "foo"
+
+    assert db.table_names() == ["test"]
+    assert "test" in db
+    assert len(db) == 1
+
+    assert db.open_table("test").name == db["test"].name