diff --git a/README.md b/README.md
index b2790e4d..072669cc 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ db = lancedb.connect(uri)
 table = db.create_table("my_table",
                         data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
                               {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
-result = table.search([100, 100]).limit(2).to_df()
+result = table.search([100, 100]).limit(2).to_pandas()
 ```
 
 ## Blogs, Tutorials & Videos
diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md
index aa4e6d0b..33208628 100644
--- a/docs/src/ann_indexes.md
+++ b/docs/src/ann_indexes.md
@@ -97,7 +97,7 @@ There are a couple of parameters that can be used to fine-tune the search:
         .limit(2) \
         .nprobes(20) \
         .refine_factor(10) \
-        .to_df()
+        .to_pandas()
     ```
     ```
     vector item _distance
@@ -124,7 +124,7 @@ You can further filter the elements returned by a search using a where clause.
 === "Python"
 
     ```python
-    tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_df()
+    tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas()
    ```
 
 === "Javascript"
@@ -141,7 +141,7 @@ You can select the columns returned by the query using a select clause.
 === "Python"
 
     ```python
-    tbl.search(np.random.random((1536))).select(["vector"]).to_df()
+    tbl.search(np.random.random((1536))).select(["vector"]).to_pandas()
     ```
     ```
     vector _distance
diff --git a/docs/src/basic.md b/docs/src/basic.md
index 8b24f809..33bbbf3a 100644
--- a/docs/src/basic.md
+++ b/docs/src/basic.md
@@ -146,7 +146,7 @@ Once you've embedded the query, you can find its nearest neighbors using the fol
 === "Python"
 
     ```python
-    tbl.search([100, 100]).limit(2).to_df()
+    tbl.search([100, 100]).limit(2).to_pandas()
     ```
 
     This returns a pandas DataFrame with the results.
diff --git a/docs/src/embedding.md b/docs/src/embedding.md
index e8090578..c2db7501 100644
--- a/docs/src/embedding.md
+++ b/docs/src/embedding.md
@@ -118,7 +118,7 @@ belong in the same latent space and your results will be nonsensical.
 ```python
 query = "What's the best pizza topping?"
 query_vector = embed_func([query])[0]
-tbl.search(query_vector).limit(10).to_df()
+tbl.search(query_vector).limit(10).to_pandas()
 ```
 
 The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
diff --git a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md b/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md
index 7467d943..b8058b9d 100644
--- a/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md
+++ b/docs/src/examples/serverless_lancedb_with_s3_and_lambda.md
@@ -80,14 +80,14 @@ def handler(event, context):
     # Shape of SIFT is (128,1M), d=float32
     query_vector = np.array(event['query_vector'], dtype=np.float32)
-    rs = table.search(query_vector).limit(2).to_df()
+    rs = table.search(query_vector).limit(2).to_list()
 
     return {
         "statusCode": status_code,
         "headers": {
             "Content-Type": "application/json"
         },
-        "body": rs.to_json()
+        "body": json.dumps(rs)
     }
 ```
diff --git a/docs/src/fts.md b/docs/src/fts.md
index dafcb055..47f51346 100644
--- a/docs/src/fts.md
+++ b/docs/src/fts.md
@@ -43,7 +43,13 @@ table.create_fts_index("text")
 To search:
 
 ```python
-df = table.search("puppy").limit(10).select(["text"]).to_df()
+table.search("puppy").limit(10).select(["text"]).to_list()
+```
+
+Which returns a list of dictionaries:
+
+```python
+[{'text': 'Frodo was a happy puppy', 'score': 0.6931471824645996}]
 ```
 
 LanceDB automatically looks for an FTS index if the input is str.
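The serverless and FTS changes above both standardize on `to_list()`, which returns plain Python dicts instead of a DataFrame, so results go straight through `json.dumps` with no pandas dependency in the Lambda image. A minimal sketch of the two query modes side by side, assuming a toy table (the table name, path, and data here are illustrative, not from the docs):

```python
import json

import lancedb

db = lancedb.connect("/tmp/lancedb")  # illustrative path
table = db.create_table(
    "docs",
    data=[{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}],
    mode="overwrite",
)
table.create_fts_index("text")

# A string query is routed to the FTS index; a vector runs a similarity search.
fts_hits = table.search("puppy").limit(10).select(["text"]).to_list()
ann_hits = table.search([3.0, 4.0]).limit(10).to_list()

# Plain dicts serialize directly -- this is what the Lambda handler relies on.
body = json.dumps(fts_hits)
```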
diff --git a/docs/src/index.md b/docs/src/index.md
index 8aa43a28..64eb8f87 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -36,7 +36,7 @@ LanceDB's core is written in Rust 🦀 and is built using
-result = table.search([100, 100]).limit(2).to_df()
+result = table.search([100, 100]).limit(2).to_pandas()
diff --git a/docs/src/notebooks/multimodal_search.ipynb b/docs/src/notebooks/multimodal_search.ipynb
--- a/docs/src/notebooks/multimodal_search.ipynb
+++ b/docs/src/notebooks/multimodal_search.ipynb
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n",
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
     "\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n",
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
@@ -39,6 +39,7 @@
    "outputs": [],
    "source": [
     "import io\n",
+    "\n",
     "import PIL\n",
     "import duckdb\n",
     "import lancedb"
@@ -158,18 +159,18 @@
     "    \"db = lancedb.connect('~/datasets/demo')\\n\"\n",
     "    \"tbl = db.open_table('diffusiondb')\\n\\n\"\n",
     "    f\"embedding = embed_func('{query}')\\n\"\n",
-    "    \"tbl.search(embedding).limit(9).to_df()\"\n",
+    "    \"tbl.search(embedding).limit(9).to_pandas()\"\n",
     "    )\n",
-    "    return (_extract(tbl.search(emb).limit(9).to_df()), code)\n",
+    "    return (_extract(tbl.search(emb).limit(9).to_pandas()), code)\n",
     "\n",
     "def find_image_keywords(query):\n",
     "    code = (\n",
     "        \"import lancedb\\n\"\n",
     "        \"db = lancedb.connect('~/datasets/demo')\\n\"\n",
     "        \"tbl = db.open_table('diffusiondb')\\n\\n\"\n",
-    "        f\"tbl.search('{query}').limit(9).to_df()\"\n",
+    "        f\"tbl.search('{query}').limit(9).to_pandas()\"\n",
     "    )\n",
-    "    return (_extract(tbl.search(query).limit(9).to_df()), code)\n",
+    "    return (_extract(tbl.search(query).limit(9).to_pandas()), code)\n",
     "\n",
     "def find_image_sql(query):\n",
     "    code = (\n",
diff --git a/docs/src/notebooks/youtube_transcript_search.ipynb b/docs/src/notebooks/youtube_transcript_search.ipynb
index 8165e0fc..b39a9c8e 100644
--- a/docs/src/notebooks/youtube_transcript_search.ipynb
+++ b/docs/src/notebooks/youtube_transcript_search.ipynb
@@ -27,11 +27,11 @@
    "output_type": "stream",
    "text": [
     "\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n",
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
     "\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
-    "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n",
+    "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
@@ -184,7 +184,7 @@
    "df = (contextualize(data.to_pandas())\n",
    "      .groupby(\"title\").text_col(\"text\")\n",
    "      .window(20).stride(4)\n",
-    "      .to_df())\n",
+    "      .to_pandas())\n",
    "df.head(1)"
   ]
  },
@@ -603,7 +603,7 @@
    "outputs": [],
    "source": [
     "# Use LanceDB to get top 3 most relevant context\n",
-    "context = tbl.search(emb).limit(3).to_df()"
+    "context = tbl.search(emb).limit(3).to_pandas()"
   ]
  },
 {
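The youtube_transcript_search cell above chains `contextualize(...).window(20).stride(4).to_pandas()` to build overlapping transcript windows before embedding them. A smaller sketch of what that sliding window produces (toy data, not the notebook's):

```python
import pandas as pd

from lancedb.context import contextualize

data = pd.DataFrame(
    {
        "token": ["The", "quick", "brown", "fox"],
        "title": ["intro", "intro", "intro", "intro"],
    }
)

# 2-token windows advancing one row at a time; size-1 tails are dropped.
df = (
    contextualize(data)
    .groupby("title")
    .text_col("token")
    .window(2)
    .stride(1)
    .to_pandas()
)
# df["token"] -> ["The quick", "quick brown", "brown fox"]
```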
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n", "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n", + "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n" ] } ], @@ -184,7 +184,7 @@ "df = (contextualize(data.to_pandas())\n", " .groupby(\"title\").text_col(\"text\")\n", " .window(20).stride(4)\n", - " .to_df())\n", + " .to_pandas())\n", "df.head(1)" ] }, @@ -603,7 +603,7 @@ "outputs": [], "source": [ "# Use LanceDB to get top 3 most relevant context\n", - "context = tbl.search(emb).limit(3).to_df()" + "context = tbl.search(emb).limit(3).to_pandas()" ] }, { diff --git a/docs/src/python/arrow.md b/docs/src/python/arrow.md index 0e8bd3ee..b05018a0 100644 --- a/docs/src/python/arrow.md +++ b/docs/src/python/arrow.md @@ -74,7 +74,7 @@ table = db.open_table("pd_table") query_vector = [100, 100] # Pandas DataFrame -df = table.search(query_vector).limit(1).to_df() +df = table.search(query_vector).limit(1).to_pandas() print(df) ``` @@ -89,12 +89,12 @@ If you have more complex criteria, you can always apply the filter to the result ```python # Apply the filter via LanceDB -results = table.search([100, 100]).where("price < 15").to_df() +results = table.search([100, 100]).where("price < 15").to_pandas() assert len(results) == 1 assert results["item"].iloc[0] == "foo" # Apply the filter via Pandas -df = results = table.search([100, 100]).to_df() +df = results = table.search([100, 100]).to_pandas() results = df[df.price < 15] assert len(results) == 1 assert results["item"].iloc[0] == "foo" diff --git a/docs/src/search.md b/docs/src/search.md index 8c5aa96c..5a3143fe 100644 --- a/docs/src/search.md +++ b/docs/src/search.md @@ -67,7 +67,7 @@ await db_setup.createTable('my_vectors', data) df = tbl.search(np.random.random((1536))) \ .limit(10) \ - .to_df() + .to_list() ``` === "JavaScript" @@ -92,7 +92,7 @@ as well. 
diff --git a/docs/src/search.md b/docs/src/search.md
index 8c5aa96c..5a3143fe 100644
--- a/docs/src/search.md
+++ b/docs/src/search.md
@@ -67,7 +67,7 @@ await db_setup.createTable('my_vectors', data)
     df = tbl.search(np.random.random((1536))) \
         .limit(10) \
-        .to_df()
+        .to_list()
     ```
 
 === "JavaScript"
@@ -92,7 +92,7 @@ as well.
     df = tbl.search(np.random.random((1536))) \
         .metric("cosine") \
         .limit(10) \
-        .to_df()
+        .to_list()
     ```
diff --git a/python/README.md b/python/README.md
index c69b636f..faad44d4 100644
--- a/python/README.md
+++ b/python/README.md
@@ -16,7 +16,7 @@ pip install lancedb
 import lancedb
 db = lancedb.connect('')
 table = db.open_table('my_table')
-results = table.search([0.1, 0.3]).limit(20).to_df()
+results = table.search([0.1, 0.3]).limit(20).to_list()
 print(results)
 ```
diff --git a/python/lancedb/__init__.py b/python/lancedb/__init__.py
index 28121944..c72390e6 100644
--- a/python/lancedb/__init__.py
+++ b/python/lancedb/__init__.py
@@ -14,13 +14,13 @@
 import importlib.metadata
 from typing import Optional
 
+__version__ = importlib.metadata.version("lancedb")
+
 from .db import URI, DBConnection, LanceDBConnection
 from .remote.db import RemoteDBConnection
 from .schema import vector
 from .utils import sentry_log
 
-__version__ = importlib.metadata.version("lancedb")
-
 
 def connect(
     uri: URI,
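The `__init__.py` reorder above is load-bearing rather than cosmetic: later in this diff, `context.py` and `query.py` gain a module-level `from . import __version__`. If `__version__` were still assigned after the submodule imports, those imports would execute against a partially initialized `lancedb` package. A schematic of the failure mode this ordering avoids (the exact import chain is the assumption here):

```python
# lancedb/__init__.py -- order matters.
#
# Before: from .db import ...        # .db eventually pulls in .query
#         __version__ = ...          # too late: query.py already ran
#                                    # "from . import __version__" -> ImportError
#
# After (as in this PR): bind the attribute first, then import submodules.
import importlib.metadata

__version__ = importlib.metadata.version("lancedb")  # visible to every later import

from .db import URI, DBConnection, LanceDBConnection  # noqa: E402  (now safe)
```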
diff --git a/python/lancedb/context.py b/python/lancedb/context.py
index b29946c6..73800d02 100644
--- a/python/lancedb/context.py
+++ b/python/lancedb/context.py
@@ -12,6 +12,9 @@
 # limitations under the License.
 from __future__ import annotations
 
+import deprecation
+
+from . import __version__
 from .exceptions import MissingColumnError, MissingValueError
 from .util import safe_import_pandas
 
@@ -43,7 +46,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
     this how many tokens, but depending on the input data, it could be
     sentences, paragraphs, messages, etc.
 
-    >>> contextualize(data).window(3).stride(1).text_col('token').to_df()
+    >>> contextualize(data).window(3).stride(1).text_col('token').to_pandas()
                     token  document_id
     0     The quick brown            1
     1     quick brown fox            1
     2    brown fox jumped            1
     3     fox jumped over            1
     4     jumped over the            1
     5       over the lazy            1
     6        the lazy dog            1
     7          lazy dog I            1
     8          dog I love            1
     9   I love sandwiches            2
     10    love sandwiches            2
-    >>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_df()
+    >>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_pandas()
                                        token  document_id
     0    The quick brown fox jumped over the            1
     1   quick brown fox jumped over the lazy            1
@@ -68,7 +71,7 @@
     ``stride`` determines how many rows to skip between each window start. This
     can be used to reduce the total number of windows generated.
 
-    >>> contextualize(data).window(4).stride(2).text_col('token').to_df()
+    >>> contextualize(data).window(4).stride(2).text_col('token').to_pandas()
                        token  document_id
     0    The quick brown fox            1
     2  brown fox jumped over            1
@@ -81,7 +84,7 @@
     context windows that don't cross document boundaries. In this case, we can
     pass ``document_id`` as the group by.
 
-    >>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_df()
+    >>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_pandas()
                        token  document_id
     0    The quick brown fox            1
     2  brown fox jumped over            1
@@ -93,14 +96,14 @@
     This can be used to trim the last few context windows which have size less
     than ``min_window_size``. By default context windows of size 1 are skipped.
 
-    >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_df()
+    >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_pandas()
                                  token  document_id
     0  The quick brown fox jumped over            1
     3     fox jumped over the lazy dog            1
     6                     the lazy dog            1
     9                I love sandwiches            2
 
-    >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_df()
+    >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_pandas()
                                  token  document_id
     0  The quick brown fox jumped over            1
     3     fox jumped over the lazy dog            1
 
@@ -176,7 +179,16 @@ class Contextualizer:
         self._min_window_size = min_window_size
         return self
 
+    @deprecation.deprecated(
+        deprecated_in="0.3.1",
+        removed_in="0.4.0",
+        current_version=__version__,
+        details="Use to_pandas() instead",
+    )
     def to_df(self) -> "pd.DataFrame":
+        return self.to_pandas()
+
+    def to_pandas(self) -> "pd.DataFrame":
         """Create the context windows and return a DataFrame."""
         if pd is None:
             raise ImportError(
diff --git a/python/lancedb/query.py b/python/lancedb/query.py
index eedf8079..71c18134 100644
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -16,10 +16,12 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import List, Literal, Optional, Type, Union
 
+import deprecation
 import numpy as np
 import pyarrow as pa
 import pydantic
 
+from . import __version__
 from .common import VECTOR_COLUMN_NAME
 from .pydantic import LanceModel
 from .util import safe_import_pandas
@@ -127,7 +129,24 @@ class LanceQueryBuilder(ABC):
         self._columns = None
         self._where = None
 
+    @deprecation.deprecated(
+        deprecated_in="0.3.1",
+        removed_in="0.4.0",
+        current_version=__version__,
+        details="Use to_pandas() instead",
+    )
     def to_df(self) -> "pd.DataFrame":
+        """
+        Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.
+
+        Execute the query and return the results as a pandas DataFrame.
+        In addition to the selected columns, LanceDB also returns the vector
+        and the "_distance" column, which is the distance between the query
+        vector and the returned vector.
+        """
+        return self.to_pandas()
+
+    def to_pandas(self) -> "pd.DataFrame":
         """
         Execute the query and return the results as a pandas DataFrame.
         In addition to the selected columns, LanceDB also returns a vector
@@ -148,6 +167,16 @@
         """
         raise NotImplementedError
 
+    def to_list(self) -> List[dict]:
+        """
+        Execute the query and return the results as a list of dictionaries.
+
+        Each list entry is a dictionary with the selected column names as keys,
+        or all table columns if `select` is not called. The vector and the
+        "_distance" fields are returned whether or not they're explicitly selected.
+        """
+        return self.to_arrow().to_pylist()
+
     def to_pydantic(self, model: Type[LanceModel]) -> List[LanceModel]:
         """Return the table as a list of pydantic models.
 
@@ -232,7 +261,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         ...     .where("b < 10")
         ...     .select(["b"])
         ...     .limit(2)
-        ...     .to_df())
+        ...     .to_pandas())
            b      vector  _distance
         0  6  [0.4, 0.4]        0.0
         """
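The new `LanceQueryBuilder.to_list()` above is deliberately thin — `self.to_arrow().to_pylist()` — so its output shape follows straight from the Arrow schema. A quick sketch of what callers get back, assuming `tbl` is the toy two-column table used throughout these doctests (values are illustrative):

```python
rows = tbl.search([0.4, 0.4]).select(["b"]).limit(2).to_list()
# Plain dicts: the selected column plus the vector and _distance extras, e.g.
# [{'b': 6, 'vector': [0.4, 0.4], '_distance': 0.0}, ...]
assert {"b", "vector", "_distance"} <= rows[0].keys()
```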
diff --git a/python/lancedb/table.py b/python/lancedb/table.py
index acc48c40..03da59ae 100644
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -137,7 +137,7 @@ class Table(ABC):
     Can query the table with [Table.search][lancedb.table.Table.search].
 
-    >>> table.search([0.4, 0.4]).select(["b"]).to_df()
+    >>> table.search([0.4, 0.4]).select(["b"]).to_pandas()
        b      vector  _distance
     0  4  [0.5, 1.3]       0.82
     1  2  [1.1, 1.2]       1.13
diff --git a/python/pyproject.toml b/python/pyproject.toml
index a21678bc..fca35b03 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -2,6 +2,7 @@
 name = "lancedb"
 version = "0.3.0"
 dependencies = [
+    "deprecation",
     "pylance==0.8.3",
     "ratelimiter~=1.0",
     "retry>=0.9.2",
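pyproject.toml picks up the third-party `deprecation` package that powers the `to_df` shims above. A standalone sketch of its behavior (toy function and version numbers, not lancedb's):

```python
import warnings

import deprecation


@deprecation.deprecated(
    deprecated_in="1.0",
    removed_in="2.0",
    current_version="1.5",
    details="Use new_api() instead.",
)
def old_api():
    """Return the answer."""
    return 42


# Calls still succeed, but emit deprecation.DeprecatedWarning (a
# DeprecationWarning subclass); once current_version >= removed_in the
# warning becomes deprecation.UnsupportedWarning. The decorator also
# appends the deprecation notice to old_api.__doc__.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert old_api() == 42
assert issubclass(caught[0].category, DeprecationWarning)
```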
table.search("puppy").limit(10).select(["text"]).to_pandas() assert len(df) == 10 assert "text" in df.columns def test_create_index_multiple_columns(tmp_path, table): table.create_fts_index(["text", "text2"]) - df = table.search("puppy").limit(10).to_df() + df = table.search("puppy").limit(10).to_pandas() assert len(df) == 10 assert "text" in df.columns assert "text2" in df.columns @@ -87,5 +87,5 @@ def test_create_index_multiple_columns(tmp_path, table): def test_empty_rs(tmp_path, table, mocker): table.create_fts_index(["text", "text2"]) mocker.patch("lancedb.fts.search_index", return_value=([], [])) - df = table.search("puppy").limit(10).to_df() + df = table.search("puppy").limit(10).to_pandas() assert len(df) == 0 diff --git a/python/tests/test_io.py b/python/tests/test_io.py index 656200a8..0629e809 100644 --- a/python/tests/test_io.py +++ b/python/tests/test_io.py @@ -36,11 +36,11 @@ def test_s3_io(): {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}, ], ) - rs = table.search([100, 100]).limit(1).to_df() + rs = table.search([100, 100]).limit(1).to_pandas() assert len(rs) == 1 assert rs["item"].iloc[0] == "bar" - rs = table.search([100, 100]).where("price < 15").limit(2).to_df() + rs = table.search([100, 100]).where("price < 15").limit(2).to_pandas() assert len(rs) == 1 assert rs["item"].iloc[0] == "foo" diff --git a/python/tests/test_query.py b/python/tests/test_query.py index 6784f439..cefea0c2 100644 --- a/python/tests/test_query.py +++ b/python/tests/test_query.py @@ -85,17 +85,20 @@ def test_cast(table): def test_query_builder(table): - df = ( - LanceVectorQueryBuilder(table, [0, 0], "vector").limit(1).select(["id"]).to_df() + rs = ( + LanceVectorQueryBuilder(table, [0, 0], "vector") + .limit(1) + .select(["id"]) + .to_list() ) - assert df["id"].values[0] == 1 - assert all(df["vector"].values[0] == [1, 2]) + assert rs[0]["id"] == 1 + assert all(np.array(rs[0]["vector"]) == [1, 2]) def test_query_builder_with_filter(table): - df = LanceVectorQueryBuilder(table, [0, 0], "vector").where("id = 2").to_df() - assert df["id"].values[0] == 2 - assert all(df["vector"].values[0] == [3, 4]) + rs = LanceVectorQueryBuilder(table, [0, 0], "vector").where("id = 2").to_list() + assert rs[0]["id"] == 2 + assert all(np.array(rs[0]["vector"]) == [3, 4]) def test_query_builder_with_prefilter(table): @@ -103,7 +106,7 @@ def test_query_builder_with_prefilter(table): LanceVectorQueryBuilder(table, [0, 0], "vector") .where("id = 2") .limit(1) - .to_df() + .to_pandas() ) assert len(df) == 0 @@ -111,7 +114,7 @@ def test_query_builder_with_prefilter(table): LanceVectorQueryBuilder(table, [0, 0], "vector") .where("id = 2", prefilter=True) .limit(1) - .to_df() + .to_pandas() ) assert df["id"].values[0] == 2 assert all(df["vector"].values[0] == [3, 4]) @@ -120,9 +123,11 @@ def test_query_builder_with_prefilter(table): def test_query_builder_with_metric(table): query = [4, 8] vector_column_name = "vector" - df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_df() + df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_pandas() df_l2 = ( - LanceVectorQueryBuilder(table, query, vector_column_name).metric("L2").to_df() + LanceVectorQueryBuilder(table, query, vector_column_name) + .metric("L2") + .to_pandas() ) tm.assert_frame_equal(df_default, df_l2) @@ -130,7 +135,7 @@ def test_query_builder_with_metric(table): LanceVectorQueryBuilder(table, query, vector_column_name) .metric("cosine") .limit(1) - .to_df() + .to_pandas() ) assert df_cosine._distance[0] == 
diff --git a/python/tests/test_remote_client.py b/python/tests/test_remote_client.py
index 1afd0b60..73ebf153 100644
--- a/python/tests/test_remote_client.py
+++ b/python/tests/test_remote_client.py
@@ -86,7 +86,7 @@ async def test_e2e_with_mock_server():
                 columns=["id", "vector"],
             ),
         )
-    ).to_df()
+    ).to_pandas()
     assert "vector" in df.columns
     assert "id" in df.columns
 
diff --git a/python/tests/test_remote_db.py b/python/tests/test_remote_db.py
index aa480870..592c5b7d 100644
--- a/python/tests/test_remote_db.py
+++ b/python/tests/test_remote_db.py
@@ -32,4 +32,4 @@ def test_remote_db():
     setattr(conn, "_client", FakeLanceDBClient())
 
     table = conn["test"]
-    table.search([1.0, 2.0]).to_df()
+    table.search([1.0, 2.0]).to_pandas()
diff --git a/python/tests/test_table.py b/python/tests/test_table.py
index 103d26fb..0c001bda 100644
--- a/python/tests/test_table.py
+++ b/python/tests/test_table.py
@@ -427,8 +427,8 @@ def test_multiple_vector_columns(db):
     table.add(df)
     q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector1").limit(1).to_df()
-    result2 = table.search(q, vector_column_name="vector2").limit(1).to_df()
+    result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas()
+    result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas()
     assert result1["text"].iloc[0] != result2["text"].iloc[0]
 
 
@@ -439,6 +439,6 @@ def test_empty_query(db):
         "my_table",
         data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
     )
-    df = table.search().select(["id"]).where("text='bar'").limit(1).to_df()
+    df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
     val = df.id.iloc[0]
     assert val == 1
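A closing note on the two `Table.search` paths the last tests exercise: `vector_column_name` selects which embedding column the similarity search runs against when a table has several, and a vector-less `search()` degrades to a plain filter-plus-projection scan, which is why `test_empty_query` needs nothing beyond `to_pandas()`. A combined sketch (schema and names are illustrative, mirroring the tests above):

```python
import numpy as np

q = np.random.randn(10)

# Search against a specific vector column when the table has more than one.
hits = table.search(q, vector_column_name="vector1").limit(1).to_list()

# No query vector at all: acts as a filter + projection, no similarity ranking.
ids = table.search().select(["id"]).where("text='bar'").limit(1).to_list()
```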