Compare commits

...

8 Commits

Author SHA1 Message Date
Lance Release
b06e214d29 [python] Bump version: 0.1.15 → 0.1.16 2023-07-31 18:32:40 +00:00
Chang She
c1f8feb6ed make pandas an optional dependency in lancedb as well (#385) 2023-07-31 14:08:58 -04:00
Chang She
cada35d5b7 Improve pydantic integration (#384) 2023-07-31 12:16:44 -04:00
Chang She
2d25c263e9 Implement drop table if exists (#383) 2023-07-31 10:25:09 +02:00
gsilvestrin
bcd7f66dc7 fix(node): Handle overflows in the node bridge (#372)
- Fixes many numeric conversions that result in hard-to-reproduce issues
- JsObjectExt extends JsObject with safe methods to extract numeric values
2023-07-28 13:15:21 -07:00
gsilvestrin
1daecac648 fix(python): Pin pylance and add pandas as test dependency (#373) 2023-07-27 15:21:45 -07:00
Lance Release
b8e656b2a7 Updating package-lock.json 2023-07-27 21:53:30 +00:00
Lance Release
ff7c1193a7 Updating package-lock.json 2023-07-27 21:06:32 +00:00
27 changed files with 363 additions and 132 deletions

View File

@@ -30,7 +30,7 @@ jobs:
python-version: 3.${{ matrix.python-minor-version }} python-version: 3.${{ matrix.python-minor-version }}
- name: Install lancedb - name: Install lancedb
run: | run: |
pip install -e . pip install -e .[tests]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest pytest-mock black isort pip install pytest pytest-mock black isort
- name: Black - name: Black
@@ -59,7 +59,7 @@ jobs:
python-version: "3.11" python-version: "3.11"
- name: Install lancedb - name: Install lancedb
run: | run: |
pip install -e . pip install -e .[tests]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest pytest-mock black pip install pytest pytest-mock black
- name: Black - name: Black

View File

@@ -1,6 +1,8 @@
# Pydantic # Pydantic
[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python. [Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python.
LanceDB integrates with Pydantic for schema inference, data ingestion, and query result casting.
## Schema ## Schema

74
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{ {
"name": "vectordb", "name": "vectordb",
"version": "0.1.18", "version": "0.1.19",
"lockfileVersion": 2, "lockfileVersion": 2,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "vectordb", "name": "vectordb",
"version": "0.1.18", "version": "0.1.19",
"cpu": [ "cpu": [
"x64", "x64",
"arm64" "arm64"
@@ -51,11 +51,11 @@
"typescript": "*" "typescript": "*"
}, },
"optionalDependencies": { "optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.1.18", "@lancedb/vectordb-darwin-arm64": "0.1.19",
"@lancedb/vectordb-darwin-x64": "0.1.18", "@lancedb/vectordb-darwin-x64": "0.1.19",
"@lancedb/vectordb-linux-arm64-gnu": "0.1.18", "@lancedb/vectordb-linux-arm64-gnu": "0.1.19",
"@lancedb/vectordb-linux-x64-gnu": "0.1.18", "@lancedb/vectordb-linux-x64-gnu": "0.1.19",
"@lancedb/vectordb-win32-x64-msvc": "0.1.18" "@lancedb/vectordb-win32-x64-msvc": "0.1.19"
} }
}, },
"node_modules/@apache-arrow/ts": { "node_modules/@apache-arrow/ts": {
@@ -315,9 +315,9 @@
} }
}, },
"node_modules/@lancedb/vectordb-darwin-arm64": { "node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.19.tgz",
"integrity": "sha512-vu8MCFgaAAGmTJF+4RaoApROMpRVVgrCk+V9my4adAfWkkXbSmtxiDgiIwwL1VqdGb8UwzGn3kVbNW7idE1ojA==", "integrity": "sha512-efQhJkBKvMNhjFq3Sw3/qHo9D9gb9UqiIr98n3STsbNxBQjMnWemXn91Ckl40siRG1O8qXcINW7Qs/EGmus+kg==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@@ -327,9 +327,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-darwin-x64": { "node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.19.tgz",
"integrity": "sha512-ZU30bd6frRyKJ515ow972PlqO2wIiNT4Ohor9+KbUwl/VKDyAwKOKG8cWhRJXTxk0k1oqpiJ6+Q28TcYJ0sSAw==", "integrity": "sha512-r6OZNVyemAssABz2w7CRhe7dyREwBEfTytn+ux1zzTnzsgMgDovCQ0rQ3WZcxWvcy7SFCxiemA9IP1b/lsb4tQ==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@@ -339,9 +339,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-linux-arm64-gnu": { "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.1.19.tgz",
"integrity": "sha512-2UroC026bUYwyciSRonYlXei0SoYbKgfWpozxYOu7GgBAV2CQQtaAPgWJTEl6ZiCNeBmBTx+j0h3+ydUfZA73Q==", "integrity": "sha512-mL/hRmZp6Kw7hmGJBdOZfp/tTYiCdlOcs8DA/+nr2eiXERv0gIhyiKvr2P5DwbBmut3qXEkDalMHTo95BSdL2A==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@@ -351,9 +351,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-linux-x64-gnu": { "node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.19.tgz",
"integrity": "sha512-DoQBskl22JAJFZh219ZOJ6o+f1niTZp0qRYngHa/kTIpLKzHWQ0OTtMCz32VBAjAsKjSLNxHE8rrT/S6tvS7KQ==", "integrity": "sha512-AG0FHksbbr+cHVKPi4B8cmBtqb6T9E0uaK4kyZkXrX52/xtv9RYVZcykaB/tSSm0XNFPWWRnx9R8UqNZV/hxMA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@@ -363,9 +363,9 @@
] ]
}, },
"node_modules/@lancedb/vectordb-win32-x64-msvc": { "node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.1.19.tgz",
"integrity": "sha512-a/kUM3V6rWuXS80pPECYxKfCUAnq56Of/GPCvnAkpk9C9ldyX10iff4aA6DiPHjEk9V2ytqDfJKl9N3QcMLKLA==", "integrity": "sha512-PDWZ2hvLVXH4Z4WIO1rsWY8ev3NpNm7aXlaey32P+l1Iz9Hia9+F2GBpp2UiEQKfvbk82ucAvBLRmpSsHY8Tlw==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@@ -4852,33 +4852,33 @@
} }
}, },
"@lancedb/vectordb-darwin-arm64": { "@lancedb/vectordb-darwin-arm64": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.19.tgz",
"integrity": "sha512-vu8MCFgaAAGmTJF+4RaoApROMpRVVgrCk+V9my4adAfWkkXbSmtxiDgiIwwL1VqdGb8UwzGn3kVbNW7idE1ojA==", "integrity": "sha512-efQhJkBKvMNhjFq3Sw3/qHo9D9gb9UqiIr98n3STsbNxBQjMnWemXn91Ckl40siRG1O8qXcINW7Qs/EGmus+kg==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-darwin-x64": { "@lancedb/vectordb-darwin-x64": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.19.tgz",
"integrity": "sha512-ZU30bd6frRyKJ515ow972PlqO2wIiNT4Ohor9+KbUwl/VKDyAwKOKG8cWhRJXTxk0k1oqpiJ6+Q28TcYJ0sSAw==", "integrity": "sha512-r6OZNVyemAssABz2w7CRhe7dyREwBEfTytn+ux1zzTnzsgMgDovCQ0rQ3WZcxWvcy7SFCxiemA9IP1b/lsb4tQ==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-linux-arm64-gnu": { "@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.1.19.tgz",
"integrity": "sha512-2UroC026bUYwyciSRonYlXei0SoYbKgfWpozxYOu7GgBAV2CQQtaAPgWJTEl6ZiCNeBmBTx+j0h3+ydUfZA73Q==", "integrity": "sha512-mL/hRmZp6Kw7hmGJBdOZfp/tTYiCdlOcs8DA/+nr2eiXERv0gIhyiKvr2P5DwbBmut3qXEkDalMHTo95BSdL2A==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-linux-x64-gnu": { "@lancedb/vectordb-linux-x64-gnu": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.19.tgz",
"integrity": "sha512-DoQBskl22JAJFZh219ZOJ6o+f1niTZp0qRYngHa/kTIpLKzHWQ0OTtMCz32VBAjAsKjSLNxHE8rrT/S6tvS7KQ==", "integrity": "sha512-AG0FHksbbr+cHVKPi4B8cmBtqb6T9E0uaK4kyZkXrX52/xtv9RYVZcykaB/tSSm0XNFPWWRnx9R8UqNZV/hxMA==",
"optional": true "optional": true
}, },
"@lancedb/vectordb-win32-x64-msvc": { "@lancedb/vectordb-win32-x64-msvc": {
"version": "0.1.18", "version": "0.1.19",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.1.18.tgz", "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.1.19.tgz",
"integrity": "sha512-a/kUM3V6rWuXS80pPECYxKfCUAnq56Of/GPCvnAkpk9C9ldyX10iff4aA6DiPHjEk9V2ytqDfJKl9N3QcMLKLA==", "integrity": "sha512-PDWZ2hvLVXH4Z4WIO1rsWY8ev3NpNm7aXlaey32P+l1Iz9Hia9+F2GBpp2UiEQKfvbk82ucAvBLRmpSsHY8Tlw==",
"optional": true "optional": true
}, },
"@neon-rs/cli": { "@neon-rs/cli": {

View File

@@ -250,6 +250,14 @@ describe('LanceDB client', function () {
const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: 2, max_iters: 2, num_sub_vectors: 2 }) const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: 2, max_iters: 2, num_sub_vectors: 2 })
await expect(createIndex).to.be.rejectedWith(/VectorIndex requires the column data type to be fixed size list of float32s/) await expect(createIndex).to.be.rejectedWith(/VectorIndex requires the column data type to be fixed size list of float32s/)
}) })
it('it should fail when num_partitions is not a positive integer', async function () {
const uri = await createTestDB(32, 300)
const con = await lancedb.connect(uri)
const table = await con.openTable('vectors')
const createIndex = table.createIndex({ type: 'ivf_pq', column: 'name', num_partitions: -1, max_iters: 2, num_sub_vectors: 2 })
await expect(createIndex).to.be.rejectedWith('num_partitions: must be > 0')
})
}) })
describe('when using a custom embedding function', function () { describe('when using a custom embedding function', function () {

View File

@@ -1,5 +1,5 @@
[bumpversion] [bumpversion]
current_version = 0.1.15 current_version = 0.1.16
commit = True commit = True
message = [python] Bump version: {current_version} → {new_version} message = [python] Bump version: {current_version} → {new_version}
tag = True tag = True

View File

@@ -11,17 +11,18 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from pathlib import Path from pathlib import Path
from typing import List, Union from typing import Iterable, List, Union
import numpy as np import numpy as np
import pandas as pd
import pyarrow as pa import pyarrow as pa
from .util import safe_import_pandas
pd = safe_import_pandas()
DATA = Union[List[dict], dict, "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]]
VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray] VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray]
URI = Union[str, Path] URI = Union[str, Path]
# TODO support generator
DATA = Union[List[dict], dict, pd.DataFrame]
VECTOR_COLUMN_NAME = "vector" VECTOR_COLUMN_NAME = "vector"

View File

@@ -12,12 +12,13 @@
# limitations under the License. # limitations under the License.
from __future__ import annotations from __future__ import annotations
import pandas as pd
from .exceptions import MissingColumnError, MissingValueError from .exceptions import MissingColumnError, MissingValueError
from .util import safe_import_pandas
pd = safe_import_pandas()
def contextualize(raw_df: pd.DataFrame) -> Contextualizer: def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
"""Create a Contextualizer object for the given DataFrame. """Create a Contextualizer object for the given DataFrame.
Used to create context windows. Context windows are rolling subsets of text Used to create context windows. Context windows are rolling subsets of text
@@ -175,8 +176,12 @@ class Contextualizer:
self._min_window_size = min_window_size self._min_window_size = min_window_size
return self return self
def to_df(self) -> pd.DataFrame: def to_df(self) -> "pd.DataFrame":
"""Create the context windows and return a DataFrame.""" """Create the context windows and return a DataFrame."""
if pd is None:
raise ImportError(
"pandas is required to create context windows using lancedb"
)
if self._text_col not in self._raw_df.columns.tolist(): if self._text_col not in self._raw_df.columns.tolist():
raise MissingColumnError(self._text_col) raise MissingColumnError(self._text_col)

View File

@@ -16,9 +16,8 @@ from __future__ import annotations
import os import os
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union from typing import Optional
import pandas as pd
import pyarrow as pa import pyarrow as pa
from pyarrow import fs from pyarrow import fs
@@ -39,9 +38,7 @@ class DBConnection(ABC):
def create_table( def create_table(
self, self,
name: str, name: str,
data: Optional[ data: Optional[DATA] = None,
Union[List[dict], dict, pd.DataFrame, pa.Table, Iterable[pa.RecordBatch]],
] = None,
schema: Optional[pa.Schema] = None, schema: Optional[pa.Schema] = None,
mode: str = "create", mode: str = "create",
on_bad_vectors: str = "error", on_bad_vectors: str = "error",
@@ -279,7 +276,7 @@ class LanceDBConnection(DBConnection):
def create_table( def create_table(
self, self,
name: str, name: str,
data: Optional[Union[List[dict], dict, pd.DataFrame]] = None, data: Optional[DATA] = None,
schema: pa.Schema = None, schema: pa.Schema = None,
mode: str = "create", mode: str = "create",
on_bad_vectors: str = "error", on_bad_vectors: str = "error",
@@ -319,14 +316,20 @@ class LanceDBConnection(DBConnection):
""" """
return LanceTable.open(self, name) return LanceTable.open(self, name)
def drop_table(self, name: str): def drop_table(self, name: str, ignore_missing: bool = False):
"""Drop a table from the database. """Drop a table from the database.
Parameters Parameters
---------- ----------
name: str name: str
The name of the table. The name of the table.
ignore_missing: bool, default False
If True, ignore if the table does not exist.
""" """
filesystem, path = fs_from_uri(self.uri) try:
table_path = os.path.join(path, name + ".lance") filesystem, path = fs_from_uri(self.uri)
filesystem.delete_dir(table_path) table_path = os.path.join(path, name + ".lance")
filesystem.delete_dir(table_path)
except FileNotFoundError:
if not ignore_missing:
raise

View File

@@ -16,15 +16,19 @@ import sys
from typing import Callable, Union from typing import Callable, Union
import numpy as np import numpy as np
import pandas as pd
import pyarrow as pa import pyarrow as pa
from lance.vector import vec_to_table from lance.vector import vec_to_table
from retry import retry from retry import retry
from .util import safe_import_pandas
pd = safe_import_pandas()
DATA = Union[pa.Table, "pd.DataFrame"]
def with_embeddings( def with_embeddings(
func: Callable, func: Callable,
data: Union[pa.Table, pd.DataFrame], data: DATA,
column: str = "text", column: str = "text",
wrap_api: bool = True, wrap_api: bool = True,
show_progress: bool = False, show_progress: bool = False,
@@ -60,7 +64,7 @@ def with_embeddings(
func = func.batch_size(batch_size) func = func.batch_size(batch_size)
if show_progress: if show_progress:
func = func.show_progress() func = func.show_progress()
if isinstance(data, pd.DataFrame): if pd is not None and isinstance(data, pd.DataFrame):
data = pa.Table.from_pandas(data, preserve_index=False) data = pa.Table.from_pandas(data, preserve_index=False)
embeddings = func(data[column].to_numpy()) embeddings = func(data[column].to_numpy())
table = vec_to_table(np.array(embeddings)) table = vec_to_table(np.array(embeddings))

View File

@@ -249,3 +249,36 @@ def pydantic_to_schema(model: Type[pydantic.BaseModel]) -> pa.Schema:
""" """
fields = _pydantic_model_to_fields(model) fields = _pydantic_model_to_fields(model)
return pa.schema(fields) return pa.schema(fields)
class LanceModel(pydantic.BaseModel):
"""
A Pydantic Model base class that can be converted to a LanceDB Table.
Examples
--------
>>> import lancedb
>>> from lancedb.pydantic import LanceModel, vector
>>>
>>> class TestModel(LanceModel):
... name: str
... vector: vector(2)
...
>>> db = lancedb.connect("/tmp")
>>> table = db.create_table("test", schema=TestModel.to_arrow_schema())
>>> table.add([
... TestModel(name="test", vector=[1.0, 2.0])
... ])
>>> table.search([0., 0.]).limit(1).to_pydantic(TestModel)
[TestModel(name='test', vector=FixedSizeList(dim=2))]
"""
@classmethod
def to_arrow_schema(cls):
return pydantic_to_schema(cls)
@classmethod
def field_names(cls) -> List[str]:
if PYDANTIC_VERSION.major < 2:
return list(cls.__fields__.keys())
return list(cls.model_fields.keys())

View File

@@ -13,17 +13,20 @@
from __future__ import annotations from __future__ import annotations
from typing import List, Literal, Optional, Union from typing import List, Literal, Optional, Type, Union
import numpy as np import numpy as np
import pandas as pd
import pyarrow as pa import pyarrow as pa
from pydantic import BaseModel import pydantic
from .common import VECTOR_COLUMN_NAME from .common import VECTOR_COLUMN_NAME
from .pydantic import LanceModel
from .util import safe_import_pandas
pd = safe_import_pandas()
class Query(BaseModel): class Query(pydantic.BaseModel):
"""A Query""" """A Query"""
vector_column: str = VECTOR_COLUMN_NAME vector_column: str = VECTOR_COLUMN_NAME
@@ -198,7 +201,7 @@ class LanceQueryBuilder:
self._refine_factor = refine_factor self._refine_factor = refine_factor
return self return self
def to_df(self) -> pd.DataFrame: def to_df(self) -> "pd.DataFrame":
""" """
Execute the query and return the results as a pandas DataFrame. Execute the query and return the results as a pandas DataFrame.
In addition to the selected columns, LanceDB also returns a vector In addition to the selected columns, LanceDB also returns a vector
@@ -230,9 +233,26 @@ class LanceQueryBuilder:
) )
return self._table._execute_query(query) return self._table._execute_query(query)
def to_pydantic(self, model: Type[LanceModel]) -> List[LanceModel]:
"""Return the table as a list of pydantic models.
Parameters
----------
model: Type[LanceModel]
The pydantic model to use.
Returns
-------
List[LanceModel]
"""
return [
model(**{k: v for k, v in row.items() if k in model.field_names()})
for row in self.to_arrow().to_pylist()
]
class LanceFtsQueryBuilder(LanceQueryBuilder): class LanceFtsQueryBuilder(LanceQueryBuilder):
def to_arrow(self) -> pd.Table: def to_arrow(self) -> pa.Table:
try: try:
import tantivy import tantivy
except ImportError: except ImportError:

View File

@@ -20,7 +20,6 @@ import pyarrow as pa
from lancedb.common import DATA from lancedb.common import DATA
from lancedb.db import DBConnection from lancedb.db import DBConnection
from lancedb.schema import schema_to_json
from lancedb.table import Table, _sanitize_data from lancedb.table import Table, _sanitize_data
from .arrow import to_ipc_binary from .arrow import to_ipc_binary

View File

@@ -16,11 +16,11 @@ from functools import cached_property
from typing import Union from typing import Union
import pyarrow as pa import pyarrow as pa
from lance import json_to_schema
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
from ..query import LanceQueryBuilder, Query from ..query import LanceQueryBuilder
from ..schema import json_to_schema
from ..table import Query, Table, _sanitize_data from ..table import Query, Table, _sanitize_data
from .arrow import to_ipc_binary from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE from .client import ARROW_STREAM_CONTENT_TYPE

View File

@@ -12,11 +12,7 @@
# limitations under the License. # limitations under the License.
"""Schema related utilities.""" """Schema related utilities."""
from typing import Any, Dict, Type
import pyarrow as pa import pyarrow as pa
from lance import json_to_schema, schema_to_json
def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType: def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:

View File

@@ -20,26 +20,32 @@ from typing import Iterable, List, Union
import lance import lance
import numpy as np import numpy as np
import pandas as pd
import pyarrow as pa import pyarrow as pa
import pyarrow.compute as pc import pyarrow.compute as pc
from lance import LanceDataset from lance import LanceDataset
from lance.vector import vec_to_table from lance.vector import vec_to_table
from .common import DATA, VEC, VECTOR_COLUMN_NAME from .common import DATA, VEC, VECTOR_COLUMN_NAME
from .pydantic import LanceModel
from .query import LanceFtsQueryBuilder, LanceQueryBuilder, Query from .query import LanceFtsQueryBuilder, LanceQueryBuilder, Query
from .util import fs_from_uri from .util import fs_from_uri, safe_import_pandas
pd = safe_import_pandas()
def _sanitize_data(data, schema, on_bad_vectors, fill_value): def _sanitize_data(data, schema, on_bad_vectors, fill_value):
if isinstance(data, list): if isinstance(data, list):
# convert to list of dict if data is a bunch of LanceModels
if isinstance(data[0], LanceModel):
schema = data[0].__class__.to_arrow_schema()
data = [dict(d) for d in data]
data = pa.Table.from_pylist(data) data = pa.Table.from_pylist(data)
data = _sanitize_schema( data = _sanitize_schema(
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
) )
if isinstance(data, dict): if isinstance(data, dict):
data = vec_to_table(data) data = vec_to_table(data)
if isinstance(data, pd.DataFrame): if pd is not None and isinstance(data, pd.DataFrame):
data = pa.Table.from_pandas(data) data = pa.Table.from_pandas(data)
data = _sanitize_schema( data = _sanitize_schema(
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
@@ -94,7 +100,7 @@ class Table(ABC):
""" """
raise NotImplementedError raise NotImplementedError
def to_pandas(self) -> pd.DataFrame: def to_pandas(self):
"""Return the table as a pandas DataFrame. """Return the table as a pandas DataFrame.
Returns Returns
@@ -328,7 +334,7 @@ class LanceTable(Table):
"""Return the first n rows of the table.""" """Return the first n rows of the table."""
return self._dataset.head(n) return self._dataset.head(n)
def to_pandas(self) -> pd.DataFrame: def to_pandas(self) -> "pd.DataFrame":
"""Return the table as a pandas DataFrame. """Return the table as a pandas DataFrame.
Returns Returns

View File

@@ -15,7 +15,6 @@ import os
from typing import Tuple from typing import Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
import pyarrow as pa
import pyarrow.fs as pa_fs import pyarrow.fs as pa_fs
@@ -76,3 +75,12 @@ def fs_from_uri(uri: str) -> Tuple[pa_fs.FileSystem, str]:
return fs, path return fs, path
return pa_fs.FileSystem.from_uri(uri) return pa_fs.FileSystem.from_uri(uri)
def safe_import_pandas():
try:
import pandas as pd
return pd
except ImportError:
return None

View File

@@ -1,7 +1,7 @@
[project] [project]
name = "lancedb" name = "lancedb"
version = "0.1.15" version = "0.1.16"
dependencies = ["pylance~=0.5.8", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic", "attr", "semver"] dependencies = ["pylance==0.5.10", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic", "attr", "semver"]
description = "lancedb" description = "lancedb"
authors = [ authors = [
{ name = "LanceDB Devs", email = "dev@lancedb.com" }, { name = "LanceDB Devs", email = "dev@lancedb.com" },
@@ -37,7 +37,7 @@ repository = "https://github.com/lancedb/lancedb"
[project.optional-dependencies] [project.optional-dependencies]
tests = [ tests = [
"pytest", "pytest-mock", "pytest-asyncio" "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio"
] ]
dev = [ dev = [
"ruff", "pre-commit", "black" "ruff", "pre-commit", "black"

View File

@@ -149,6 +149,10 @@ def test_delete_table(tmp_path):
db.create_table("test", data=data) db.create_table("test", data=data)
assert db.table_names() == ["test"] assert db.table_names() == ["test"]
# dropping a table that does not exist should pass
# if ignore_missing=True
db.drop_table("does_not_exist", ignore_missing=True)
def test_empty_or_nonexistent_table(tmp_path): def test_empty_or_nonexistent_table(tmp_path):
db = lancedb.connect(tmp_path) db = lancedb.connect(tmp_path)

View File

@@ -20,7 +20,7 @@ import pyarrow as pa
import pydantic import pydantic
import pytest import pytest
from lancedb.pydantic import PYDANTIC_VERSION, pydantic_to_schema, vector from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, pydantic_to_schema, vector
@pytest.mark.skipif( @pytest.mark.skipif(
@@ -163,3 +163,13 @@ def test_fixed_size_list_validation():
TestModel(vec=range(7)) TestModel(vec=range(7))
TestModel(vec=range(8)) TestModel(vec=range(8))
def test_lance_model():
class TestModel(LanceModel):
vec: vector(16)
li: List[int]
schema = pydantic_to_schema(TestModel)
assert schema == TestModel.to_arrow_schema()
assert TestModel.field_names() == ["vec", "li"]

View File

@@ -20,6 +20,7 @@ import pyarrow as pa
import pytest import pytest
from lancedb.db import LanceDBConnection from lancedb.db import LanceDBConnection
from lancedb.pydantic import LanceModel, vector
from lancedb.query import LanceQueryBuilder, Query from lancedb.query import LanceQueryBuilder, Query
from lancedb.table import LanceTable from lancedb.table import LanceTable
@@ -64,6 +65,24 @@ def table(tmp_path) -> MockTable:
return MockTable(tmp_path) return MockTable(tmp_path)
def test_cast(table):
class TestModel(LanceModel):
vector: vector(2)
id: int
str_field: str
float_field: float
q = LanceQueryBuilder(table, [0, 0], "vector").limit(1)
results = q.to_pydantic(TestModel)
assert len(results) == 1
r0 = results[0]
assert isinstance(r0, TestModel)
assert r0.id == 1
assert r0.vector == [1, 2]
assert r0.str_field == "a"
assert r0.float_field == 1.0
def test_query_builder(table): def test_query_builder(table):
df = LanceQueryBuilder(table, [0, 0], "vector").limit(1).select(["id"]).to_df() df = LanceQueryBuilder(table, [0, 0], "vector").limit(1).select(["id"]).to_df()
assert df["id"].values[0] == 1 assert df["id"].values[0] == 1

View File

@@ -13,15 +13,16 @@
import functools import functools
from pathlib import Path from pathlib import Path
from typing import List
from unittest.mock import PropertyMock, patch from unittest.mock import PropertyMock, patch
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import pyarrow as pa import pyarrow as pa
import pytest import pytest
from lance.vector import vec_to_table
from lancedb.db import LanceDBConnection from lancedb.db import LanceDBConnection
from lancedb.pydantic import LanceModel, vector
from lancedb.table import LanceTable from lancedb.table import LanceTable
@@ -135,6 +136,17 @@ def test_add(db):
_add(table, schema) _add(table, schema)
def test_add_pydantic_model(db):
class TestModel(LanceModel):
vector: vector(16)
li: List[int]
data = TestModel(vector=list(range(16)), li=[1, 2, 3])
table = LanceTable.create(db, "test", data=[data])
assert len(table) == 1
assert table.schema == TestModel.to_arrow_schema()
def _add(table, schema): def _add(table, schema):
# table = LanceTable(db, "test") # table = LanceTable(db, "test")
assert len(table) == 2 assert len(table) == 2

View File

@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
arrow-array = { workspace = true } arrow-array = { workspace = true }
arrow-ipc = { workspace = true } arrow-ipc = { workspace = true }
arrow-schema = { workspace = true } arrow-schema = { workspace = true }
conv = "0.3.3"
once_cell = "1" once_cell = "1"
futures = "0.3" futures = "0.3"
half = { workspace = true } half = { workspace = true }

View File

@@ -22,8 +22,15 @@ use snafu::Snafu;
pub enum Error { pub enum Error {
#[snafu(display("column '{name}' is missing"))] #[snafu(display("column '{name}' is missing"))]
MissingColumn { name: String }, MissingColumn { name: String },
#[snafu(display("{name}: {message}"))]
RangeError { name: String, message: String },
#[snafu(display("{index_type} is not a valid index type"))]
InvalidIndexType { index_type: String },
#[snafu(display("{message}"))] #[snafu(display("{message}"))]
LanceDB { message: String }, LanceDB { message: String },
#[snafu(display("{message}"))]
Neon { message: String },
} }
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
@@ -52,6 +59,14 @@ impl From<ArrowError> for Error {
} }
} }
impl From<neon::result::Throw> for Error {
fn from(value: neon::result::Throw) -> Self {
Self::Neon {
message: value.to_string(),
}
}
}
/// ResultExt is used to transform a [`Result`] into a [`NeonResult`], /// ResultExt is used to transform a [`Result`] into a [`NeonResult`],
/// so it can be returned as a JavaScript error /// so it can be returned as a JavaScript error
/// Copied from [Neon](https://github.com/neon-bindings/neon/blob/4c2e455a9e6814f1ba0178616d63caec7f4df317/crates/neon/src/result/mod.rs#L88) /// Copied from [Neon](https://github.com/neon-bindings/neon/blob/4c2e455a9e6814f1ba0178616d63caec7f4df317/crates/neon/src/result/mod.rs#L88)

View File

@@ -22,12 +22,15 @@ use neon::prelude::*;
use vectordb::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder}; use vectordb::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
use crate::error::Error::InvalidIndexType;
use crate::error::ResultExt;
use crate::neon_ext::js_object_ext::JsObjectExt;
use crate::{runtime, JsTable}; use crate::{runtime, JsTable};
pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsPromise> { pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsPromise> {
let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?; let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
let index_params = cx.argument::<JsObject>(0)?; let index_params = cx.argument::<JsObject>(0)?;
let index_params_builder = get_index_params_builder(&mut cx, index_params).unwrap(); let index_params_builder = get_index_params_builder(&mut cx, index_params).or_throw(&mut cx)?;
let rt = runtime(&mut cx)?; let rt = runtime(&mut cx)?;
let channel = cx.channel(); let channel = cx.channel();
@@ -54,27 +57,21 @@ pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsP
fn get_index_params_builder( fn get_index_params_builder(
cx: &mut FunctionContext, cx: &mut FunctionContext,
obj: Handle<JsObject>, obj: Handle<JsObject>,
) -> Result<impl VectorIndexBuilder, String> { ) -> crate::error::Result<impl VectorIndexBuilder> {
let idx_type = obj let idx_type = obj.get::<JsString, _, _>(cx, "type")?.value(cx);
.get::<JsString, _, _>(cx, "type")
.map_err(|t| t.to_string())?
.value(cx);
match idx_type.as_str() { match idx_type.as_str() {
"ivf_pq" => { "ivf_pq" => {
let mut index_builder: IvfPQIndexBuilder = IvfPQIndexBuilder::new(); let mut index_builder: IvfPQIndexBuilder = IvfPQIndexBuilder::new();
let mut pq_params = PQBuildParams::default(); let mut pq_params = PQBuildParams::default();
obj.get_opt::<JsString, _, _>(cx, "column") obj.get_opt::<JsString, _, _>(cx, "column")?
.map_err(|t| t.to_string())?
.map(|s| index_builder.column(s.value(cx))); .map(|s| index_builder.column(s.value(cx)));
obj.get_opt::<JsString, _, _>(cx, "index_name") obj.get_opt::<JsString, _, _>(cx, "index_name")?
.map_err(|t| t.to_string())?
.map(|s| index_builder.index_name(s.value(cx))); .map(|s| index_builder.index_name(s.value(cx)));
obj.get_opt::<JsString, _, _>(cx, "metric_type") obj.get_opt::<JsString, _, _>(cx, "metric_type")?
.map_err(|t| t.to_string())?
.map(|s| MetricType::try_from(s.value(cx).as_str())) .map(|s| MetricType::try_from(s.value(cx).as_str()))
.map(|mt| { .map(|mt| {
let metric_type = mt.unwrap(); let metric_type = mt.unwrap();
@@ -82,15 +79,8 @@ fn get_index_params_builder(
pq_params.metric_type = metric_type; pq_params.metric_type = metric_type;
}); });
let num_partitions = obj let num_partitions = obj.get_opt_usize(cx, "num_partitions")?;
.get_opt::<JsNumber, _, _>(cx, "num_partitions") let max_iters = obj.get_opt_usize(cx, "max_iters")?;
.map_err(|t| t.to_string())?
.map(|s| s.value(cx) as usize);
let max_iters = obj
.get_opt::<JsNumber, _, _>(cx, "max_iters")
.map_err(|t| t.to_string())?
.map(|s| s.value(cx) as usize);
num_partitions.map(|np| { num_partitions.map(|np| {
let max_iters = max_iters.unwrap_or(50); let max_iters = max_iters.unwrap_or(50);
@@ -102,32 +92,28 @@ fn get_index_params_builder(
index_builder.ivf_params(ivf_params) index_builder.ivf_params(ivf_params)
}); });
obj.get_opt::<JsBoolean, _, _>(cx, "use_opq") obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")?
.map_err(|t| t.to_string())?
.map(|s| pq_params.use_opq = s.value(cx)); .map(|s| pq_params.use_opq = s.value(cx));
obj.get_opt::<JsNumber, _, _>(cx, "num_sub_vectors") obj.get_opt_usize(cx, "num_sub_vectors")?
.map_err(|t| t.to_string())? .map(|s| pq_params.num_sub_vectors = s);
.map(|s| pq_params.num_sub_vectors = s.value(cx) as usize);
obj.get_opt::<JsNumber, _, _>(cx, "num_bits") obj.get_opt_usize(cx, "num_bits")?
.map_err(|t| t.to_string())? .map(|s| pq_params.num_bits = s);
.map(|s| pq_params.num_bits = s.value(cx) as usize);
obj.get_opt::<JsNumber, _, _>(cx, "max_iters") obj.get_opt_usize(cx, "max_iters")?
.map_err(|t| t.to_string())? .map(|s| pq_params.max_iters = s);
.map(|s| pq_params.max_iters = s.value(cx) as usize);
obj.get_opt::<JsNumber, _, _>(cx, "max_opq_iters") obj.get_opt_usize(cx, "max_opq_iters")?
.map_err(|t| t.to_string())? .map(|s| pq_params.max_opq_iters = s);
.map(|s| pq_params.max_opq_iters = s.value(cx) as usize);
obj.get_opt::<JsBoolean, _, _>(cx, "replace") obj.get_opt::<JsBoolean, _, _>(cx, "replace")?
.map_err(|t| t.to_string())?
.map(|s| index_builder.replace(s.value(cx))); .map(|s| index_builder.replace(s.value(cx)));
Ok(index_builder) Ok(index_builder)
} }
t => Err(format!("{} is not a valid index type", t).to_string()), index_type => Err(InvalidIndexType {
index_type: index_type.into(),
}),
} }
} }

View File

@@ -31,16 +31,17 @@ use once_cell::sync::OnceCell;
use tokio::runtime::Runtime; use tokio::runtime::Runtime;
use vectordb::database::Database; use vectordb::database::Database;
use vectordb::error::Error;
use vectordb::table::{ReadParams, Table}; use vectordb::table::{ReadParams, Table};
use crate::arrow::{arrow_buffer_to_record_batch, record_batch_to_buffer}; use crate::arrow::{arrow_buffer_to_record_batch, record_batch_to_buffer};
use crate::error::ResultExt; use crate::error::ResultExt;
use crate::neon_ext::js_object_ext::JsObjectExt;
mod arrow; mod arrow;
mod convert; mod convert;
mod error; mod error;
mod index; mod index;
mod neon_ext;
struct JsDatabase { struct JsDatabase {
database: Arc<Database>, database: Arc<Database>,
@@ -245,12 +246,9 @@ fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
.get_opt::<JsString, _, _>(&mut cx, "_filter")? .get_opt::<JsString, _, _>(&mut cx, "_filter")?
.map(|s| s.value(&mut cx)); .map(|s| s.value(&mut cx));
let refine_factor = query_obj let refine_factor = query_obj
.get_opt::<JsNumber, _, _>(&mut cx, "_refineFactor")? .get_opt_u32(&mut cx, "_refineFactor")
.map(|s| s.value(&mut cx)) .or_throw(&mut cx)?;
.map(|i| i as u32); let nprobes = query_obj.get_usize(&mut cx, "_nprobes").or_throw(&mut cx)?;
let nprobes = query_obj
.get::<JsNumber, _, _>(&mut cx, "_nprobes")?
.value(&mut cx) as usize;
let metric_type = query_obj let metric_type = query_obj
.get_opt::<JsString, _, _>(&mut cx, "_metricType")? .get_opt::<JsString, _, _>(&mut cx, "_metricType")?
.map(|s| s.value(&mut cx)) .map(|s| s.value(&mut cx))
@@ -277,7 +275,11 @@ fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
.select(select); .select(select);
let record_batch_stream = builder.execute(); let record_batch_stream = builder.execute();
let results = record_batch_stream let results = record_batch_stream
.and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from)) .and_then(|stream| {
stream
.try_collect::<Vec<_>>()
.map_err(vectordb::error::Error::from)
})
.await; .await;
deferred.settle_with(&channel, move |mut cx| { deferred.settle_with(&channel, move |mut cx| {

View File

@@ -0,0 +1,15 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod js_object_ext;

View File

@@ -0,0 +1,82 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::error::{Error, Result};
use neon::prelude::*;
// extends neon's [JsObject] with helper functions to extract properties
/// Extension methods for neon's [JsObject] that extract numeric properties
/// with explicit range checking instead of silent `as` truncation.
pub trait JsObjectExt {
    /// Reads the optional property `key` as a `u32`.
    /// Returns `Ok(None)` when the property is absent; returns an error when
    /// the JS number is negative, too large for `u32`, or NaN.
    fn get_opt_u32(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<u32>>;
    /// Reads the required property `key` as a `usize`; errors if the property
    /// is missing or the value is out of range for `usize`.
    fn get_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<usize>;
    /// Reads the optional property `key` as a `usize`.
    /// Returns `Ok(None)` when the property is absent; returns an error when
    /// the value is out of range for `usize`.
    fn get_opt_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<usize>>;
}
impl JsObjectExt for JsObject {
    /// Optional `u32` property: an absent key yields `Ok(None)`; a present
    /// value is range-checked before conversion.
    fn get_opt_u32(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<u32>> {
        match self.get_opt::<JsNumber, _, _>(cx, key)? {
            None => Ok(None),
            Some(num) => f64_to_u32_safe(num.value(cx), key).map(Some),
        }
    }

    /// Required `usize` property: a missing key surfaces as a neon lookup
    /// error; an out-of-range value becomes a range error naming `key`.
    fn get_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<usize> {
        let raw = self.get::<JsNumber, _, _>(cx, key)?;
        f64_to_usize_safe(raw.value(cx), key)
    }

    /// Optional `usize` property: an absent key yields `Ok(None)`; a present
    /// value is range-checked before conversion.
    fn get_opt_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<usize>> {
        match self.get_opt::<JsNumber, _, _>(cx, key)? {
            None => Ok(None),
            Some(num) => f64_to_usize_safe(num.value(cx), key).map(Some),
        }
    }
}
/// Converts a JS number (`f64`) to a `u32`, mapping each conversion failure
/// to an [`Error::RangeError`] that names the offending property `key`.
///
/// Fix: the boundary messages were previously wrong — `0` is a valid `u32`
/// (so the lower bound is ">= 0", not "> 0") and `u32::MAX` is exactly
/// representable (so the upper bound is "<=", not "<").
fn f64_to_u32_safe(n: f64, key: &str) -> Result<u32> {
    use conv::*;
    n.approx_as::<u32>().map_err(|e| {
        // Translate each failure mode into a user-facing message; building the
        // message first avoids repeating the RangeError literal three times.
        let message = match e {
            FloatError::NegOverflow(_) => "must be >= 0".to_string(),
            FloatError::PosOverflow(_) => format!("must be <= {}", u32::MAX),
            FloatError::NotANumber(_) => "not a valid number".to_string(),
        };
        Error::RangeError {
            name: key.into(),
            message,
        }
    })
}
/// Converts a JS number (`f64`) to a `usize`, mapping each conversion failure
/// to an [`Error::RangeError`] that names the offending property `key`.
///
/// Fix: the boundary messages were previously wrong — `0` is a valid `usize`
/// (so the lower bound is ">= 0", not "> 0") and the upper bound is inclusive
/// ("<=", not "<").
fn f64_to_usize_safe(n: f64, key: &str) -> Result<usize> {
    use conv::*;
    n.approx_as::<usize>().map_err(|e| {
        // Translate each failure mode into a user-facing message; building the
        // message first avoids repeating the RangeError literal three times.
        let message = match e {
            FloatError::NegOverflow(_) => "must be >= 0".to_string(),
            FloatError::PosOverflow(_) => format!("must be <= {}", usize::MAX),
            FloatError::NotANumber(_) => "not a valid number".to_string(),
        };
        Error::RangeError {
            name: key.into(),
            message,
        }
    })
}