From b3fdabdf4586a84ea3f7737dd7426aace9e0c1a1 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 14:15:18 -0700 Subject: [PATCH 1/5] use python and arrow --- docs/mkdocs.yml | 12 +++++ docs/src/integrations.md | 108 +++++++++++++++++++++++++++++++++++++++ python/tests/test_db.py | 25 +++++++++ 3 files changed, 145 insertions(+) create mode 100644 docs/src/integrations.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index d5054ca5..fa855f48 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -3,6 +3,8 @@ docs_dir: src theme: name: "material" + features: + - content.code.copy plugins: - search @@ -11,4 +13,14 @@ plugins: nav: - Home: index.md +- Integrations: integrations.md - Python API: python.md + +markdown_extensions: +- pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true +- pymdownx.inlinehilite +- pymdownx.snippets +- pymdownx.superfences \ No newline at end of file diff --git a/docs/src/integrations.md b/docs/src/integrations.md new file mode 100644 index 00000000..f987ab66 --- /dev/null +++ b/docs/src/integrations.md @@ -0,0 +1,108 @@ +# Integrations + +Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, Polars, and DuckDB. + +## Pandas and PyArrow + +``` sh +pip install pandas lancedb +``` + +First, we need to connect to a LanceDB instance. This instance could be a local file directory. + +``` py + +import lancedb + +db = lancedb.connect("/tmp/lancedb") +``` + +And write `Pandas DataFrame` to LanceDB directly. + +```py +import pandas as pd + +data = pd.DataFrame({ + "vector": [[3.1, 4.1], [5.9, 26.5]], + "item": ["foo", "bar"], + "price": [10.0, 20.0] +}) +table = db.create_table("pd_table", data=data) + +# Optionally, create a IVF_PQ index +table.create_index(num_partitions=256, num_sub_vectors=96) +``` + +We can now perform similarity searches via `LanceDB`. + +```py +# Open the table previously created. 
+table = db.open_table("pd_table") + +query_vector = [100, 100] +# Pandas DataFrame +df = table.search(query_vector).limit(1).to_df() +print(df) +``` + +``` + vector item price score +0 [5.9, 26.5] bar 20.0 14257.05957 +``` + +We can apply the filter via `LancdDB`, or apply the filter on `pd.DataFrame` later. + +```python + +# Apply the filter via LanceDB +results = table.search([100, 100]).where("price < 15").to_df() +assert len(results) == 1 +assert results["item"].iloc[0] == "foo" + +# Apply the filter via Pandas +df = results = table.search([100, 100]).to_df() +results = df[df.price < 15] +assert len(results) == 1 +assert results["item"].iloc[0] == "foo" +``` + +# DuckDB + +Let us start with installing `duckdb` and `lancedb`. + +```shell +pip install duckdb lancedb +``` + +We will re-use the dataset created previously + +```python +import lancedb + +db = lancedb.connect("/tmp/lancedb") +table = db.open_table("pd_table") +arrow_table = table.to_arrow() +``` + +Now we can use `DuckDB` to query the `arrow_table`: + +```python +In [15]: duckdb.query("SELECT * FROM t") +Out[15]: +┌─────────────┬─────────┬────────┐ +│ vector │ item │ price │ +│ float[] │ varchar │ double │ +├─────────────┼─────────┼────────┤ +│ [3.1, 4.1] │ foo │ 10.0 │ +│ [5.9, 26.5] │ bar │ 20.0 │ +└─────────────┴─────────┴────────┘ + +In [16]: duckdb.query("SELECT mean(price) FROM t") +Out[16]: +┌─────────────┐ +│ mean(price) │ +│ double │ +├─────────────┤ +│ 15.0 │ +└─────────────┘ +``` \ No newline at end of file diff --git a/python/tests/test_db.py b/python/tests/test_db.py index 956ce505..c6f7583b 100644 --- a/python/tests/test_db.py +++ b/python/tests/test_db.py @@ -12,6 +12,7 @@ # limitations under the License. 
import lancedb +import pandas as pd def test_basic(tmp_path): @@ -40,3 +41,27 @@ def test_basic(tmp_path): assert len(db) == 1 assert db.open_table("test").name == db["test"].name + + +def test_ingest_pd(tmp_path): + db = lancedb.connect(tmp_path) + + assert db.uri == str(tmp_path) + assert db.table_names() == [] + + data = pd.DataFrame({"vector": [[3.1, 4.1], [5.9, 26.5]], "item": ["foo", "bar"], "price": [10.0, 20.0]}) + table = db.create_table("test", data=data) + rs = table.search([100, 100]).limit(1).to_df() + print(rs) + assert len(rs) == 1 + assert rs["item"].iloc[0] == "bar" + + rs = table.search([100, 100]).where("price < 15").limit(2).to_df() + assert len(rs) == 1 + assert rs["item"].iloc[0] == "foo" + + assert db.table_names() == ["test"] + assert "test" in db + assert len(db) == 1 + + assert db.open_table("test").name == db["test"].name From 45e02bb62b0d7a967d5c905a4eb24d9669c17008 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 14:24:58 -0700 Subject: [PATCH 2/5] no polars for now --- docs/src/integrations.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/src/integrations.md b/docs/src/integrations.md index f987ab66..beb75000 100644 --- a/docs/src/integrations.md +++ b/docs/src/integrations.md @@ -1,6 +1,6 @@ # Integrations -Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, Polars, and DuckDB. +Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python ecosystem, including Pandas, PyArrow and DuckDB. ## Pandas and PyArrow @@ -66,7 +66,9 @@ assert len(results) == 1 assert results["item"].iloc[0] == "foo" ``` -# DuckDB +## DuckDB + +`LanceDB` works with `DuckDB` via [PyArrow integration](https://duckdb.org/docs/guides/python/sql_on_arrow). Let us start with installing `duckdb` and `lancedb`. 
@@ -84,7 +86,7 @@ table = db.open_table("pd_table") arrow_table = table.to_arrow() ``` -Now we can use `DuckDB` to query the `arrow_table`: +`DuckDB` can directly query the `arrow_table`: ```python In [15]: duckdb.query("SELECT * FROM t") From c38d80cab21ed9077ab196b7615de5c667e4881a Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 14:26:07 -0700 Subject: [PATCH 3/5] remove print --- python/tests/test_db.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tests/test_db.py b/python/tests/test_db.py index c6f7583b..207dd430 100644 --- a/python/tests/test_db.py +++ b/python/tests/test_db.py @@ -52,7 +52,6 @@ def test_ingest_pd(tmp_path): data = pd.DataFrame({"vector": [[3.1, 4.1], [5.9, 26.5]], "item": ["foo", "bar"], "price": [10.0, 20.0]}) table = db.create_table("test", data=data) rs = table.search([100, 100]).limit(1).to_df() - print(rs) assert len(rs) == 1 assert rs["item"].iloc[0] == "bar" From de6bfab1247f0c7d345043e02d7d7b997ae006bf Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 14:42:43 -0700 Subject: [PATCH 4/5] comments --- docs/src/integrations.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/src/integrations.md b/docs/src/integrations.md index beb75000..8d5b5659 100644 --- a/docs/src/integrations.md +++ b/docs/src/integrations.md @@ -4,11 +4,7 @@ Built on top of Apache Arrow, `LanceDB` is easy to integrate with the Python eco ## Pandas and PyArrow -``` sh -pip install pandas lancedb -``` - -First, we need to connect to a LanceDB instance. This instance could be a local file directory. +First, we need to connect to a `LanceDB` database. ``` py @@ -17,7 +13,7 @@ import lancedb db = lancedb.connect("/tmp/lancedb") ``` -And write `Pandas DataFrame` to LanceDB directly. +And write a `Pandas DataFrame` to LanceDB directly. 
```py import pandas as pd @@ -50,7 +46,8 @@ print(df) 0 [5.9, 26.5] bar 20.0 14257.05957 ``` -We can apply the filter via `LancdDB`, or apply the filter on `pd.DataFrame` later. +If you have a simple filter, it's faster to provide a where clause to `LanceDB`'s search query. +If you have more complex criteria, you can always apply the filter to the resulting pandas `DataFrame` from the search query. ```python From 23d4e3561fac71efead1b3f7a5e094cb6b561b5c Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 14:53:45 -0700 Subject: [PATCH 5/5] add link to basic and indexing --- docs/src/integrations.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/integrations.md b/docs/src/integrations.md index 8d5b5659..ea82cf53 100644 --- a/docs/src/integrations.md +++ b/docs/src/integrations.md @@ -29,6 +29,10 @@ table = db.create_table("pd_table", data=data) table.create_index(num_partitions=256, num_sub_vectors=96) ``` +You will find detailed instructions for creating a dataset and an index in the [Basic Operations](basic.md) and [Indexing](indexing.md) +sections. + + We can now perform similarity searches via `LanceDB`. ```py