diff --git a/python/lancedb/query.py b/python/lancedb/query.py index 9b970f2b..07c022f8 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -78,7 +78,7 @@ class LanceQueryBuilder: def to_df(self) -> pd.DataFrame: """Execute the query and return the results as a pandas DataFrame. """ - ds = self._table._dataset + ds = self._table.to_lance() # TODO indexed search tbl = ds.to_table( columns=self._columns, @@ -89,4 +89,4 @@ class LanceQueryBuilder: "k": self._limit } ) - return tbl.to_pandas() \ No newline at end of file + return tbl.to_pandas() diff --git a/python/lancedb/table.py b/python/lancedb/table.py index cbd1e01d..ce0d5fb5 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -108,7 +108,7 @@ class LanceTable: return LanceQueryBuilder(self, query) @classmethod - def create(cls, db, name, data, schema): + def create(cls, db, name, data, schema=None): tbl = LanceTable(db, name) data = _sanitize_data(data, schema) lance.write_dataset(data, tbl._dataset_uri, mode="create") @@ -131,10 +131,8 @@ def _sanitize_schema(data: pa.Table, schema: pa.Schema = None) -> pa.Table: return data # cast the columns to the expected types data = data.combine_chunks() - return pa.Table.from_arrays([ - data[name].cast(schema.field(name).type) - for name in schema.names - ], schema=schema) + return pa.Table.from_arrays([data[name] for name in schema.names], + schema=schema) # just check the vector column return _sanitize_vector_column(data, vector_column_name=VECTOR_COLUMN_NAME) diff --git a/python/tests/test_db.py b/python/tests/test_db.py index ba43bcd9..7ce51e71 100644 --- a/python/tests/test_db.py +++ b/python/tests/test_db.py @@ -16,6 +16,10 @@ import lancedb def test_basic(tmp_path): db = lancedb.connect(tmp_path) + + assert db.uri == str(tmp_path) + assert db.table_names() == [] + table = db.create_table("test", data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}]) @@ -23,6 +27,12 @@ 
def test_basic(tmp_path): assert len(rs) == 1 assert rs["item"].iloc[0] == "bar" - rs = table.search([100, 100]).where("price < 15").limit(1).to_df() + rs = table.search([100, 100]).where("price < 15").limit(2).to_df() assert len(rs) == 1 assert rs["item"].iloc[0] == "foo" + + assert db.table_names() == ["test"] + assert "test" in db + assert len(db) == 1 + + assert db.open_table("test").name == db["test"].name diff --git a/python/tests/test_query.py b/python/tests/test_query.py new file mode 100644 index 00000000..692debcc --- /dev/null +++ b/python/tests/test_query.py @@ -0,0 +1,59 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import lance
from lancedb.query import LanceQueryBuilder

import pandas as pd
import pyarrow as pa

import pytest


class MockTable:
    """Stand-in for a table: remembers a dataset location and opens it on demand."""

    def __init__(self, tmp_path):
        # Location of the Lance dataset that ``to_lance`` will open.
        self.uri = tmp_path

    def to_lance(self):
        """Open and return the Lance dataset stored at ``self.uri``."""
        dataset = lance.dataset(self.uri)
        return dataset


@pytest.fixture
def table(tmp_path) -> MockTable:
    """Write a two-row dataset into ``tmp_path`` and return a ``MockTable`` over it."""
    fields = [
        pa.field("vector", pa.list_(pa.float32(), list_size=2)),
        pa.field("id", pa.int32()),
        pa.field("str_field", pa.string()),
        pa.field("float_field", pa.float64()),
    ]
    schema = pa.schema(fields)
    rows = pd.DataFrame(
        {
            "vector": [[1, 2], [3, 4]],
            "id": [1, 2],
            "str_field": ["a", "b"],
            "float_field": [1.0, 2.0],
        }
    )
    lance.write_dataset(rows, tmp_path, schema)
    return MockTable(tmp_path)


def test_query_builder(table):
    """Query with ``limit(1)`` and ``select(["id"])``; expect id 1 with vector [1, 2]."""
    query = LanceQueryBuilder(table, [0, 0])
    query = query.limit(1)
    query = query.select(["id"])
    result = query.to_df()

    first_id = result["id"].values[0]
    assert first_id == 1
    first_vector = result["vector"].values[0]
    assert all(first_vector == [1, 2])


def test_query_builder_with_filter(table):
    """Query with the filter ``id = 2``; expect id 2 with vector [3, 4]."""
    query = LanceQueryBuilder(table, [0, 0])
    result = query.where("id = 2").to_df()

    first_id = result["id"].values[0]
    assert first_id == 2
    first_vector = result["vector"].values[0]
    assert all(first_vector == [3, 4])


# --- patch boundary: python/tests/test_table.py (new file mode 100644) ---
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pytest

from lancedb.table import LanceTable


class MockDB:
    """Bare-bones database stand-in that carries nothing but a ``uri``."""

    def __init__(self, uri: Path):
        self.uri = uri


@pytest.fixture
def db(tmp_path) -> MockDB:
    """Provide a ``MockDB`` whose ``uri`` is pytest's per-test temporary directory."""
    mock_db = MockDB(tmp_path)
    return mock_db


def test_basic(db):
    """Create a table, reopen it by name, and compare name, schema and contents."""
    rows = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]
    created = LanceTable.create(db, "test", data=rows)
    ds = created.to_lance()

    # A second handle built from the same db/name must see what was written.
    table = LanceTable(db, "test")
    assert table.name == "test"
    assert table.schema == ds.schema
    assert table.to_lance().to_table() == ds.to_table()


def test_add(db):
    """Create tables from a list of dicts, a DataFrame and an Arrow table.

    Each input is passed to ``LanceTable.create`` with an explicit schema and
    the stored result must equal the same expected Arrow table.
    """
    schema = pa.schema(
        [
            pa.field("vector", pa.list_(pa.float32())),
            pa.field("item", pa.string()),
            pa.field("price", pa.float32()),
        ]
    )
    columns = [
        pa.array([[3.1, 4.1], [5.9, 26.5]]),
        pa.array(["foo", "bar"]),
        pa.array([10.0, 20.0]),
    ]
    expected = pa.Table.from_arrays(columns, schema=schema)

    records = [
        {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
        {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    ]
    frame = pd.DataFrame(records)
    arrow_table = pa.Table.from_pandas(frame, schema=schema)

    # Same payload in three input forms, each written to its own table name.
    for i, d in enumerate((records, frame, arrow_table)):
        created = LanceTable.create(db, f"test_{i}", data=d, schema=schema)
        tbl = created.to_lance().to_table()
        assert expected == tbl