From b3fdabdf4586a84ea3f7737dd7426aace9e0c1a1 Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 19 Apr 2023 14:15:18 -0700
Subject: [PATCH 1/5] use python and arrow

---
 docs/mkdocs.yml          |  12 +++++
 docs/src/integrations.md | 108 +++++++++++++++++++++++++++++++++++++++
 python/tests/test_db.py  |  25 +++++++++
 3 files changed, 145 insertions(+)
 create mode 100644 docs/src/integrations.md

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index d5054ca5..fa855f48 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -3,6 +3,8 @@ docs_dir: src
 
 theme:
   name: "material"
+  features:
+    - content.code.copy
 
 plugins:
 - search
@@ -11,4 +13,14 @@ plugins:
 
 nav:
 - Home: index.md
+- Integrations: integrations.md
 - Python API: python.md
+
+markdown_extensions:
+- pymdownx.highlight:
+    anchor_linenums: true
+    line_spans: __span
+    pygments_lang_class: true
+- pymdownx.inlinehilite
+- pymdownx.snippets
+- pymdownx.superfences
\ No newline at end of file
diff --git a/docs/src/integrations.md b/docs/src/integrations.md
new file mode 100644
index 00000000..f987ab66
--- /dev/null
+++ b/docs/src/integrations.md
@@ -0,0 +1,108 @@
+# Integrations
+
+Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, Polars, and DuckDB.
+
+## Pandas and PyArrow
+
+``` sh
+pip install pandas lancedb
+```
+
+First, we need to connect to a LanceDB instance. This instance could be a local file directory.
+
+``` py
+
+import lancedb
+
+db = lancedb.connect("/tmp/lancedb")
+```
+
+And write `Pandas DataFrame` to LanceDB directly.
+
+```py
+import pandas as pd
+
+data = pd.DataFrame({
+    "vector": [[3.1, 4.1], [5.9, 26.5]],
+    "item": ["foo", "bar"],
+    "price": [10.0, 20.0]
+})
+table = db.create_table("pd_table", data=data)
+
+# Optionally, create an IVF_PQ index
+table.create_index(num_partitions=256, num_sub_vectors=96)
+```
+
+We can now perform similarity searches via `LanceDB`.
+
+```py
+# Open the table previously created.
+table = db.open_table("pd_table")
+
+query_vector = [100, 100]
+# Pandas DataFrame
+df = table.search(query_vector).limit(1).to_df()
+print(df)
+```
+
+```
+    vector     item  price        score
+0  [5.9, 26.5]  bar   20.0  14257.05957
+```
+
+We can apply the filter via `LancdDB`, or apply the filter on `pd.DataFrame` later.
+
+```python
+
+# Apply the filter via LanceDB
+results = table.search([100, 100]).where("price < 15").to_df()
+assert len(results) == 1
+assert results["item"].iloc[0] == "foo"
+
+# Apply the filter via Pandas
+df = results = table.search([100, 100]).to_df()
+results = df[df.price < 15]
+assert len(results) == 1
+assert results["item"].iloc[0] == "foo"
+```
+
+# DuckDB
+
+Let us start with installing `duckdb` and `lancedb`.
+
+```shell
+pip install duckdb lancedb
+```
+
+We will re-use the dataset created previously:
+
+```python
+import lancedb
+
+db = lancedb.connect("/tmp/lancedb")
+table = db.open_table("pd_table")
+arrow_table = table.to_arrow()
+```
+
+Now we can use `DuckDB` to query the `arrow_table`:
+
+```python
+In [15]: duckdb.query("SELECT * FROM t")
+Out[15]:
+┌─────────────┬─────────┬────────┐
+│   vector    │  item   │ price  │
+│   float[]   │ varchar │ double │
+├─────────────┼─────────┼────────┤
+│ [3.1, 4.1]  │ foo     │   10.0 │
+│ [5.9, 26.5] │ bar     │   20.0 │
+└─────────────┴─────────┴────────┘
+
+In [16]: duckdb.query("SELECT mean(price) FROM t")
+Out[16]:
+┌─────────────┐
+│ mean(price) │
+│   double    │
+├─────────────┤
+│        15.0 │
+└─────────────┘
+```
\ No newline at end of file
diff --git a/python/tests/test_db.py b/python/tests/test_db.py
index 956ce505..c6f7583b 100644
--- a/python/tests/test_db.py
+++ b/python/tests/test_db.py
@@ -12,6 +12,7 @@
 #  limitations under the License.
 
 import lancedb
+import pandas as pd
 
 
 def test_basic(tmp_path):
@@ -40,3 +41,27 @@ def test_basic(tmp_path):
     assert len(db) == 1
 
     assert db.open_table("test").name == db["test"].name
+
+
+def test_ingest_pd(tmp_path):
+    db = lancedb.connect(tmp_path)
+    # A fresh connection is expected to report tmp_path as its URI and list no tables.
+    assert db.uri == str(tmp_path)
+    assert db.table_names() == []
+    # Ingest a two-row pandas DataFrame; the test expects "bar" ([5.9, 26.5]) as the top hit for [100, 100].
+    data = pd.DataFrame({"vector": [[3.1, 4.1], [5.9, 26.5]], "item": ["foo", "bar"], "price": [10.0, 20.0]})
+    table = db.create_table("test", data=data)
+    rs = table.search([100, 100]).limit(1).to_df()
+    print(rs)
+    assert len(rs) == 1
+    assert rs["item"].iloc[0] == "bar"
+
+    rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
+    assert len(rs) == 1  # limit is 2, but only "foo" (price 10.0) satisfies the filter
+    assert rs["item"].iloc[0] == "foo"
+    # The new table should be the only one listed and be visible through `in` and len().
+    assert db.table_names() == ["test"]
+    assert "test" in db
+    assert len(db) == 1
+    # open_table() and index access are expected to yield tables with the same name.
+    assert db.open_table("test").name == db["test"].name

From 45e02bb62b0d7a967d5c905a4eb24d9669c17008 Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 19 Apr 2023 14:24:58 -0700
Subject: [PATCH 2/5] no polars for now

---
 docs/src/integrations.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/src/integrations.md b/docs/src/integrations.md
index f987ab66..beb75000 100644
--- a/docs/src/integrations.md
+++ b/docs/src/integrations.md
@@ -1,6 +1,6 @@
 # Integrations
 
-Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, Polars, and DuckDB.
+Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB.
 
 ## Pandas and PyArrow
 
@@ -66,7 +66,9 @@ assert len(results) == 1
 assert results["item"].iloc[0] == "foo"
 ```
 
-# DuckDB
+## DuckDB
+
+`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow).
 
 Let us start with installing `duckdb` and `lancedb`.
 
@@ -84,7 +86,7 @@ table = db.open_table("pd_table")
 arrow_table = table.to_arrow()
 ```
 
-Now we can use `DuckDB` to query the `arrow_table`:
+`DuckDB` can directly query the `arrow_table`:
 
 ```python
 In [15]: duckdb.query("SELECT * FROM t")

From c38d80cab21ed9077ab196b7615de5c667e4881a Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 19 Apr 2023 14:26:07 -0700
Subject: [PATCH 3/5] remove print

---
 python/tests/test_db.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/tests/test_db.py b/python/tests/test_db.py
index c6f7583b..207dd430 100644
--- a/python/tests/test_db.py
+++ b/python/tests/test_db.py
@@ -52,7 +52,6 @@ def test_ingest_pd(tmp_path):
     data = pd.DataFrame({"vector": [[3.1, 4.1], [5.9, 26.5]], "item": ["foo", "bar"], "price": [10.0, 20.0]})
     table = db.create_table("test", data=data)
     rs = table.search([100, 100]).limit(1).to_df()
-    print(rs)
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "bar"
 

From de6bfab1247f0c7d345043e02d7d7b997ae006bf Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 19 Apr 2023 14:42:43 -0700
Subject: [PATCH 4/5] comments

---
 docs/src/integrations.md | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/docs/src/integrations.md b/docs/src/integrations.md
index beb75000..8d5b5659 100644
--- a/docs/src/integrations.md
+++ b/docs/src/integrations.md
@@ -4,11 +4,7 @@ Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python eco
 
 ## Pandas and PyArrow
 
-``` sh
-pip install pandas lancedb
-```
-
-First, we need to connect to a LanceDB instance. This instance could be a local file directory.
+First, we need to connect to a `LanceDB` database.
 
 ``` py
 
@@ -17,7 +13,7 @@ import lancedb
 db = lancedb.connect("/tmp/lancedb")
 ```
 
-And write `Pandas DataFrame` to LanceDB directly.
+And write a `Pandas DataFrame` to LanceDB directly.
 
 ```py
 import pandas as pd
@@ -50,7 +46,8 @@ print(df)
 0  [5.9, 26.5]  bar   20.0  14257.05957
 ```
 
-We can apply the filter via `LancdDB`, or apply the filter on `pd.DataFrame` later.
+If you have a simple filter, it's faster to provide a `where` clause to `LanceDB`'s search query.
+If you have more complex criteria, you can always apply the filter to the pandas `DataFrame` returned by the search query.
 
 ```python
 

From 23d4e3561fac71efead1b3f7a5e094cb6b561b5c Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 19 Apr 2023 14:53:45 -0700
Subject: [PATCH 5/5] add link to basic and indexing

---
 docs/src/integrations.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/src/integrations.md b/docs/src/integrations.md
index 8d5b5659..ea82cf53 100644
--- a/docs/src/integrations.md
+++ b/docs/src/integrations.md
@@ -29,6 +29,10 @@ table = db.create_table("pd_table", data=data)
 table.create_index(num_partitions=256, num_sub_vectors=96)
 ```
 
+You will find detailed instructions on creating a dataset and an index in the [Basic Operations](basic.md) and [Indexing](indexing.md)
+sections.
+
+
 We can now perform similarity searches via `LanceDB`.
 
 ```py