mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
10 Commits
update-doc
...
changhiskh
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9eca8e7cd1 | ||
|
|
587fe6ffc1 | ||
|
|
89c8e5839b | ||
|
|
50c20af060 | ||
|
|
0965d7dd5a | ||
|
|
7bbb2872de | ||
|
|
e81d2975da | ||
|
|
2c7f96ba4f | ||
|
|
f9dd7a5d8a | ||
|
|
1d4943688d |
33
.github/ISSUE_TEMPLATE/bug-node.yml
vendored
Normal file
33
.github/ISSUE_TEMPLATE/bug-node.yml
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
name: Bug Report - Node / Typescript
|
||||
description: File a bug report
|
||||
title: "bug(node): "
|
||||
labels: [bug, typescript]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
- type: input
|
||||
id: version
|
||||
attributes:
|
||||
label: LanceDB version
|
||||
description: What version of LanceDB are you using? `npm list | grep vectordb`.
|
||||
placeholder: v0.3.2
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
attributes:
|
||||
label: Are there known steps to reproduce?
|
||||
description: |
|
||||
Let us know how to reproduce the bug and we may be able to fix it more
|
||||
quickly. This is not required, but it is helpful.
|
||||
validations:
|
||||
required: false
|
||||
33
.github/ISSUE_TEMPLATE/bug-python.yml
vendored
Normal file
33
.github/ISSUE_TEMPLATE/bug-python.yml
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
name: Bug Report - Python
|
||||
description: File a bug report
|
||||
title: "bug(python): "
|
||||
labels: [bug, python]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
- type: input
|
||||
id: version
|
||||
attributes:
|
||||
label: LanceDB version
|
||||
description: What version of LanceDB are you using? `python -c "import lancedb; print(lancedb.__version__)"`.
|
||||
placeholder: v0.3.2
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
attributes:
|
||||
label: Are there known steps to reproduce?
|
||||
description: |
|
||||
Let us know how to reproduce the bug and we may be able to fix it more
|
||||
quickly. This is not required, but it is helpful.
|
||||
validations:
|
||||
required: false
|
||||
5
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
5
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
blank_issues_enabled: true
|
||||
contact_links:
|
||||
- name: Discord Community Support
|
||||
url: https://discord.com/invite/zMM32dvNtd
|
||||
about: Please ask and answer questions here.
|
||||
23
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
23
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
name: 'Documentation improvement'
|
||||
description: Report an issue with the documentation.
|
||||
labels: [documentation]
|
||||
|
||||
body:
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Description
|
||||
description: >
|
||||
Describe the issue with the documentation and how it can be fixed or improved.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: link
|
||||
attributes:
|
||||
label: Link
|
||||
description: >
|
||||
Provide a link to the existing documentation, if applicable.
|
||||
placeholder: ex. https://lancedb.github.io/lancedb/guides/tables/...
|
||||
validations:
|
||||
required: false
|
||||
31
.github/ISSUE_TEMPLATE/feature.yml
vendored
Normal file
31
.github/ISSUE_TEMPLATE/feature.yml
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
name: Feature suggestion
|
||||
description: Suggest a new feature for LanceDB
|
||||
title: "Feature: "
|
||||
labels: [enhancement]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Share a new idea for a feature or improvement. Be sure to search existing
|
||||
issues first to avoid duplicates.
|
||||
- type: dropdown
|
||||
id: sdk
|
||||
attributes:
|
||||
label: SDK
|
||||
description: Which SDK are you using? This helps us prioritize.
|
||||
options:
|
||||
- Python
|
||||
- Node
|
||||
- Rust
|
||||
default: 0
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: description
|
||||
attributes:
|
||||
label: Description
|
||||
description: |
|
||||
Describe the feature and why it would be useful. If applicable, consider
|
||||
providing a code example of what it might be like to use the feature.
|
||||
validations:
|
||||
required: true
|
||||
13
.github/workflows/python.yml
vendored
13
.github/workflows/python.yml
vendored
@@ -44,12 +44,19 @@ jobs:
|
||||
run: pytest -m "not slow" -x -v --durations=30 tests
|
||||
- name: doctest
|
||||
run: pytest --doctest-modules lancedb
|
||||
mac:
|
||||
platform:
|
||||
name: "Platform: ${{ matrix.config.name }}"
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
matrix:
|
||||
mac-runner: [ "macos-13", "macos-13-xlarge" ]
|
||||
runs-on: "${{ matrix.mac-runner }}"
|
||||
config:
|
||||
- name: x86 Mac
|
||||
runner: macos-13
|
||||
- name: Arm Mac
|
||||
runner: macos-13-xlarge
|
||||
- name: x86 Windows
|
||||
runner: windows-latest
|
||||
runs-on: "${{ matrix.config.runner }}"
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
23
.github/workflows/rust.yml
vendored
23
.github/workflows/rust.yml
vendored
@@ -24,6 +24,29 @@ env:
|
||||
RUST_BACKTRACE: "1"
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
timeout-minutes: 30
|
||||
runs-on: ubuntu-22.04
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
working-directory: rust
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
workspaces: rust
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
- name: Run format
|
||||
run: cargo fmt --all -- --check
|
||||
- name: Run clippy
|
||||
run: cargo clippy --all --all-features -- -D warnings
|
||||
linux:
|
||||
timeout-minutes: 30
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
26
Cargo.toml
26
Cargo.toml
@@ -5,24 +5,24 @@ exclude = ["python"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.9.0", "features" = ["dynamodb"] }
|
||||
lance-index = { "version" = "=0.9.0" }
|
||||
lance-linalg = { "version" = "=0.9.0" }
|
||||
lance-testing = { "version" = "=0.9.0" }
|
||||
lance = { "version" = "=0.9.1", "features" = ["dynamodb"] }
|
||||
lance-index = { "version" = "=0.9.1" }
|
||||
lance-linalg = { "version" = "=0.9.1" }
|
||||
lance-testing = { "version" = "=0.9.1" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "47.0.0", optional = false }
|
||||
arrow-array = "47.0"
|
||||
arrow-data = "47.0"
|
||||
arrow-ipc = "47.0"
|
||||
arrow-ord = "47.0"
|
||||
arrow-schema = "47.0"
|
||||
arrow-arith = "47.0"
|
||||
arrow-cast = "47.0"
|
||||
arrow = { version = "49.0.0", optional = false }
|
||||
arrow-array = "49.0"
|
||||
arrow-data = "49.0"
|
||||
arrow-ipc = "49.0"
|
||||
arrow-ord = "49.0"
|
||||
arrow-schema = "49.0"
|
||||
arrow-arith = "49.0"
|
||||
arrow-cast = "49.0"
|
||||
chrono = "0.4.23"
|
||||
half = { "version" = "=2.3.1", default-features = false, features = [
|
||||
"num-traits",
|
||||
] }
|
||||
log = "0.4"
|
||||
object_store = "0.7.1"
|
||||
object_store = "0.8.0"
|
||||
snafu = "0.7.4"
|
||||
url = "2"
|
||||
|
||||
@@ -64,18 +64,26 @@ We'll cover the basics of using LanceDB on your local machine in this section.
|
||||
tbl = db.create_table("table_from_df", data=df)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
If you want to overwrite the table, you can pass in `mode="overwrite"`
|
||||
to the `create_table` function.
|
||||
|
||||
=== "Javascript"
|
||||
```javascript
|
||||
const tb = await db.createTable("my_table",
|
||||
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
const tb = await db.createTable(
|
||||
"myTable",
|
||||
[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
```
|
||||
|
||||
!!! warning
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
If you want to overwrite the table, you can pass in `mode="overwrite"`
|
||||
to the `createTable` function.
|
||||
!!! warning
|
||||
|
||||
If the table already exists, LanceDB will raise an error by default.
|
||||
If you want to overwrite the table, you can pass in the overwrite write mode
|
||||
to the `createTable` function like this: `await con.createTable(tableName, data, { writeMode: WriteMode.Overwrite })`
|
||||
|
||||
|
||||
??? info "Under the hood, LanceDB is converting the input data into an Apache Arrow table and persisting it to disk in [Lance format](https://www.github.com/lancedb/lance)."
|
||||
|
||||
@@ -108,7 +116,7 @@ Once created, you can open a table using the following code:
|
||||
|
||||
=== "Javascript"
|
||||
```javascript
|
||||
const tbl = await db.openTable("my_table");
|
||||
const tbl = await db.openTable("myTable");
|
||||
```
|
||||
|
||||
If you forget the name of your table, you can always get a listing of all table names:
|
||||
@@ -194,10 +202,17 @@ Use the `drop_table()` method on the database to remove a table.
|
||||
db.drop_table("my_table")
|
||||
```
|
||||
|
||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||
By default, if the table does not exist an exception is raised. To suppress this,
|
||||
you can pass in `ignore_missing=True`.
|
||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||
By default, if the table does not exist an exception is raised. To suppress this,
|
||||
you can pass in `ignore_missing=True`.
|
||||
|
||||
=== "JavaScript"
|
||||
```javascript
|
||||
await db.dropTable('myTable')
|
||||
```
|
||||
|
||||
This permanently removes the table and is not recoverable, unlike deleting rows.
|
||||
If the table does not exist an exception is raised.
|
||||
|
||||
## What's next
|
||||
|
||||
|
||||
@@ -201,8 +201,8 @@ A Table is a collection of Records in a LanceDB Database. You can follow along o
|
||||
```javascript
|
||||
data
|
||||
const tb = await db.createTable("my_table",
|
||||
data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
|
||||
{"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
|
||||
```
|
||||
|
||||
!!! info "Note"
|
||||
|
||||
@@ -22,7 +22,7 @@ import numpy as np
|
||||
uri = "data/sample-lancedb"
|
||||
db = lancedb.connect(uri)
|
||||
|
||||
data = [{"vector": row, "item": f"item {i}"}
|
||||
data = [{"vector": row, "item": f"item {i}", "id": i}
|
||||
for i, row in enumerate(np.random.random((10_000, 2)).astype('int'))]
|
||||
|
||||
tbl = db.create_table("my_vectors", data=data)
|
||||
@@ -35,33 +35,25 @@ const db = await vectordb.connect('data/sample-lancedb')
|
||||
|
||||
let data = []
|
||||
for (let i = 0; i < 10_000; i++) {
|
||||
data.push({vector: Array(1536).fill(i), id: `${i}`, content: "", longId: `${i}`},)
|
||||
data.push({vector: Array(1536).fill(i), id: i, item: `item ${i}`, strId: `${i}`})
|
||||
}
|
||||
const tbl = await db.createTable('my_vectors', data)
|
||||
const tbl = await db.createTable('myVectors', data)
|
||||
```
|
||||
-->
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
tbl.search([100, 102]) \
|
||||
.where("""(
|
||||
(label IN [10, 20])
|
||||
AND
|
||||
(note.email IS NOT NULL)
|
||||
) OR NOT note.created
|
||||
""")
|
||||
|
||||
.where("(item IN ('item 0', 'item 2')) AND (id > 10)") \
|
||||
.to_arrow()
|
||||
```
|
||||
|
||||
=== "Javascript"
|
||||
|
||||
```javascript
|
||||
tbl.search([100, 102])
|
||||
.where(`(
|
||||
(label IN [10, 20])
|
||||
AND
|
||||
(note.email IS NOT NULL)
|
||||
) OR NOT note.created
|
||||
`)
|
||||
await tbl.search(Array(1536).fill(0))
|
||||
.where("(item IN ('item 0', 'item 2')) AND (id > 10)")
|
||||
.execute()
|
||||
```
|
||||
|
||||
|
||||
@@ -118,3 +110,22 @@ The mapping from SQL types to Arrow types is:
|
||||
|
||||
[^1]: See precision mapping in previous table.
|
||||
|
||||
|
||||
## Filtering without Vector Search
|
||||
|
||||
You can also filter your data without performing a vector search.
|
||||
|
||||
=== "Python"
|
||||
```python
|
||||
tbl.search().where("id=10").limit(10).to_arrow()
|
||||
```
|
||||
|
||||
=== "JavaScript"
|
||||
```javascript
|
||||
await tbl.where('id=10').limit(10).execute()
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If your table is large, this could potentially return a very large
|
||||
amount of data. Please be sure to use a `limit` clause unless
|
||||
you're sure you want to return the whole result set.
|
||||
|
||||
@@ -57,8 +57,8 @@ export class RemoteConnection implements Connection {
|
||||
return 'db://' + this._client.uri
|
||||
}
|
||||
|
||||
async tableNames (): Promise<string[]> {
|
||||
const response = await this._client.get('/v1/table/')
|
||||
async tableNames (pageToken: string = '', limit: number = 10): Promise<string[]> {
|
||||
const response = await this._client.get('/v1/table/', { limit, page_token: pageToken })
|
||||
return response.data.tables
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ from overrides import EnforceOverrides, override
|
||||
from pyarrow import fs
|
||||
|
||||
from .table import LanceTable, Table
|
||||
from .util import fs_from_uri, get_uri_location, get_uri_scheme
|
||||
from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .common import DATA, URI
|
||||
@@ -288,14 +288,13 @@ class LanceDBConnection(DBConnection):
|
||||
A list of table names.
|
||||
"""
|
||||
try:
|
||||
filesystem, path = fs_from_uri(self.uri)
|
||||
filesystem = fs_from_uri(self.uri)[0]
|
||||
except pa.ArrowInvalid:
|
||||
raise NotImplementedError("Unsupported scheme: " + self.uri)
|
||||
|
||||
try:
|
||||
paths = filesystem.get_file_info(
|
||||
fs.FileSelector(get_uri_location(self.uri))
|
||||
)
|
||||
loc = get_uri_location(self.uri)
|
||||
paths = filesystem.get_file_info(fs.FileSelector(loc))
|
||||
except FileNotFoundError:
|
||||
# It is ok if the file does not exist since it will be created
|
||||
paths = []
|
||||
@@ -373,7 +372,7 @@ class LanceDBConnection(DBConnection):
|
||||
"""
|
||||
try:
|
||||
filesystem, path = fs_from_uri(self.uri)
|
||||
table_path = os.path.join(path, name + ".lance")
|
||||
table_path = join_uri(path, name + ".lance")
|
||||
filesystem.delete_dir(table_path)
|
||||
except FileNotFoundError:
|
||||
if not ignore_missing:
|
||||
|
||||
@@ -23,6 +23,7 @@ import lance
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
import pyarrow.fs as pa_fs
|
||||
from lance import LanceDataset
|
||||
from lance.vector import vec_to_table
|
||||
|
||||
@@ -30,7 +31,7 @@ from .common import DATA, VEC, VECTOR_COLUMN_NAME
|
||||
from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
|
||||
from .pydantic import LanceModel, model_to_dict
|
||||
from .query import LanceQueryBuilder, Query
|
||||
from .util import fs_from_uri, safe_import_pandas, value_to_sql
|
||||
from .util import fs_from_uri, safe_import_pandas, value_to_sql, join_uri
|
||||
from .utils.events import register_event
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -396,14 +397,6 @@ class LanceTable(Table):
|
||||
self.name = name
|
||||
self._version = version
|
||||
|
||||
def _reset_dataset(self, version=None):
|
||||
try:
|
||||
if "_dataset" in self.__dict__:
|
||||
del self.__dict__["_dataset"]
|
||||
self._version = version
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
@property
|
||||
def schema(self) -> pa.Schema:
|
||||
"""Return the schema of the table.
|
||||
@@ -412,16 +405,16 @@ class LanceTable(Table):
|
||||
-------
|
||||
pa.Schema
|
||||
A PyArrow schema object."""
|
||||
return self._dataset.schema
|
||||
return self.to_lance().schema
|
||||
|
||||
def list_versions(self):
|
||||
"""List all versions of the table"""
|
||||
return self._dataset.versions()
|
||||
return self.to_lance().versions()
|
||||
|
||||
@property
|
||||
def version(self) -> int:
|
||||
"""Get the current version of the table"""
|
||||
return self._dataset.version
|
||||
return self.to_lance().version
|
||||
|
||||
def checkout(self, version: int):
|
||||
"""Checkout a version of the table. This is an in-place operation.
|
||||
@@ -454,14 +447,12 @@ class LanceTable(Table):
|
||||
vector type
|
||||
0 [1.1, 0.9] vector
|
||||
"""
|
||||
max_ver = max([v["version"] for v in self._dataset.versions()])
|
||||
max_ver = max([v["version"] for v in self.to_lance().versions()])
|
||||
if version < 1 or version > max_ver:
|
||||
raise ValueError(f"Invalid version {version}")
|
||||
self._reset_dataset(version=version)
|
||||
|
||||
try:
|
||||
# Accessing the property updates the cached value
|
||||
_ = self._dataset
|
||||
self.to_lance().checkout(version)
|
||||
except Exception as e:
|
||||
if "not found" in str(e):
|
||||
raise ValueError(
|
||||
@@ -504,7 +495,7 @@ class LanceTable(Table):
|
||||
>>> len(table.list_versions())
|
||||
4
|
||||
"""
|
||||
max_ver = max([v["version"] for v in self._dataset.versions()])
|
||||
max_ver = max([v["version"] for v in self.to_lance().versions()])
|
||||
if version is None:
|
||||
version = self.version
|
||||
elif version < 1 or version > max_ver:
|
||||
@@ -516,11 +507,10 @@ class LanceTable(Table):
|
||||
# no-op if restoring the latest version
|
||||
return
|
||||
|
||||
self._dataset.restore()
|
||||
self._reset_dataset()
|
||||
self.to_lance().restore()
|
||||
|
||||
def __len__(self):
|
||||
return self._dataset.count_rows()
|
||||
return self.to_lance().count_rows()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"LanceTable({self.name})"
|
||||
@@ -530,7 +520,7 @@ class LanceTable(Table):
|
||||
|
||||
def head(self, n=5) -> pa.Table:
|
||||
"""Return the first n rows of the table."""
|
||||
return self._dataset.head(n)
|
||||
return self.to_lance().head(n)
|
||||
|
||||
def to_pandas(self) -> "pd.DataFrame":
|
||||
"""Return the table as a pandas DataFrame.
|
||||
@@ -547,11 +537,11 @@ class LanceTable(Table):
|
||||
Returns
|
||||
-------
|
||||
pa.Table"""
|
||||
return self._dataset.to_table()
|
||||
return self.to_lance().to_table()
|
||||
|
||||
@property
|
||||
def _dataset_uri(self) -> str:
|
||||
return os.path.join(self._conn.uri, f"{self.name}.lance")
|
||||
return join_uri(self._conn.uri, f"{self.name}.lance")
|
||||
|
||||
def create_index(
|
||||
self,
|
||||
@@ -574,10 +564,11 @@ class LanceTable(Table):
|
||||
accelerator=accelerator,
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
self._reset_dataset()
|
||||
register_event("create_index")
|
||||
|
||||
def create_fts_index(self, field_names: Union[str, List[str]]):
|
||||
def create_fts_index(
|
||||
self, field_names: Union[str, List[str]], *, replace: bool = False
|
||||
):
|
||||
"""Create a full-text search index on the table.
|
||||
|
||||
Warning - this API is highly experimental and is highly likely to change
|
||||
@@ -587,17 +578,35 @@ class LanceTable(Table):
|
||||
----------
|
||||
field_names: str or list of str
|
||||
The name(s) of the field to index.
|
||||
replace: bool, default False
|
||||
If True, replace the existing index if it exists. Note that this is
|
||||
not yet an atomic operation; the index will be temporarily
|
||||
unavailable while the new index is being created.
|
||||
"""
|
||||
from .fts import create_index, populate_index
|
||||
|
||||
if isinstance(field_names, str):
|
||||
field_names = [field_names]
|
||||
|
||||
fs, path = fs_from_uri(self._get_fts_index_path())
|
||||
index_exists = fs.get_file_info(path).type != pa_fs.FileType.NotFound
|
||||
if index_exists:
|
||||
if not replace:
|
||||
raise ValueError(
|
||||
f"Index already exists. Use replace=True to overwrite."
|
||||
)
|
||||
try:
|
||||
fs.delete_dir(path)
|
||||
except FileNotFoundError as e:
|
||||
if "Cannot get information for path" in str(e):
|
||||
pass
|
||||
|
||||
index = create_index(self._get_fts_index_path(), field_names)
|
||||
populate_index(index, self, field_names)
|
||||
register_event("create_fts_index")
|
||||
|
||||
def _get_fts_index_path(self):
|
||||
return os.path.join(self._dataset_uri, "_indices", "tantivy")
|
||||
return join_uri(self._dataset_uri, "_indices", "tantivy")
|
||||
|
||||
@cached_property
|
||||
def _dataset(self) -> LanceDataset:
|
||||
@@ -645,8 +654,7 @@ class LanceTable(Table):
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
lance.write_dataset(data, self._dataset_uri, schema=self.schema, mode=mode)
|
||||
self._reset_dataset()
|
||||
self.to_lance().write(data, mode=mode)
|
||||
register_event("add")
|
||||
|
||||
def merge(
|
||||
@@ -707,10 +715,9 @@ class LanceTable(Table):
|
||||
other_table = other_table.to_lance()
|
||||
if isinstance(other_table, LanceDataset):
|
||||
other_table = other_table.to_table()
|
||||
self._dataset.merge(
|
||||
self.to_lance().merge(
|
||||
other_table, left_on=left_on, right_on=right_on, schema=schema
|
||||
)
|
||||
self._reset_dataset()
|
||||
register_event("merge")
|
||||
|
||||
@cached_property
|
||||
@@ -913,7 +920,7 @@ class LanceTable(Table):
|
||||
return tbl
|
||||
|
||||
def delete(self, where: str):
|
||||
self._dataset.delete(where)
|
||||
self.to_lance().delete(where)
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -968,7 +975,6 @@ class LanceTable(Table):
|
||||
values_sql = {k: value_to_sql(v) for k, v in values.items()}
|
||||
|
||||
self.to_lance().update(values_sql, where)
|
||||
self._reset_dataset()
|
||||
register_event("update")
|
||||
|
||||
def _execute_query(self, query: Query) -> pa.Table:
|
||||
|
||||
@@ -14,7 +14,8 @@
|
||||
import os
|
||||
from datetime import date, datetime
|
||||
from functools import singledispatch
|
||||
from typing import Tuple
|
||||
import pathlib
|
||||
from typing import Tuple, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import numpy as np
|
||||
@@ -62,6 +63,12 @@ def get_uri_location(uri: str) -> str:
|
||||
str: Location part of the URL, without scheme
|
||||
"""
|
||||
parsed = urlparse(uri)
|
||||
if len(parsed.scheme) == 1:
|
||||
# Windows drive names are parsed as the scheme
|
||||
# e.g. "c:\path" -> ParseResult(scheme="c", netloc="", path="/path", ...)
|
||||
# So we add special handling here for schemes that are a single character
|
||||
return uri
|
||||
|
||||
if not parsed.netloc:
|
||||
return parsed.path
|
||||
else:
|
||||
@@ -84,6 +91,29 @@ def fs_from_uri(uri: str) -> Tuple[pa_fs.FileSystem, str]:
|
||||
return pa_fs.FileSystem.from_uri(uri)
|
||||
|
||||
|
||||
def join_uri(base: Union[str, pathlib.Path], *parts: str) -> str:
|
||||
"""
|
||||
Join a URI with multiple parts, handles both local and remote paths
|
||||
|
||||
Parameters
|
||||
----------
|
||||
base : str
|
||||
The base URI
|
||||
parts : str
|
||||
The parts to join to the base URI, each separated by the
|
||||
appropriate path separator for the URI scheme and OS
|
||||
"""
|
||||
if isinstance(base, pathlib.Path):
|
||||
return base.joinpath(*parts)
|
||||
base = str(base)
|
||||
if get_uri_scheme(base) == "file":
|
||||
# using pathlib for local paths makes this Windows compatible
|
||||
# `get_uri_scheme` returns `file` for windows drive names (e.g. `c:\path`)
|
||||
return str(pathlib.Path(base, *parts))
|
||||
# for remote paths, always join with "/" (os.path.join would use the OS-specific separator)
|
||||
return "/".join([p.rstrip("/") for p in [base, *parts]])
|
||||
|
||||
|
||||
def safe_import_pandas():
|
||||
try:
|
||||
import pandas as pd
|
||||
|
||||
@@ -3,7 +3,7 @@ name = "lancedb"
|
||||
version = "0.4.0"
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.9.0",
|
||||
"pylance==0.9.1",
|
||||
"ratelimiter~=1.0",
|
||||
"retry>=0.9.2",
|
||||
"tqdm>=4.27.0",
|
||||
|
||||
@@ -83,6 +83,24 @@ def test_create_index_from_table(tmp_path, table):
|
||||
assert len(df) == 10
|
||||
assert "text" in df.columns
|
||||
|
||||
# Check whether it can be updated
|
||||
table.add(
|
||||
[
|
||||
{
|
||||
"vector": np.random.randn(128),
|
||||
"text": "gorilla",
|
||||
"text2": "gorilla",
|
||||
"nested": {"text": "gorilla"},
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
table.create_fts_index("text", replace=True)
|
||||
assert len(table.search("gorilla").limit(1).to_pandas()) == 1
|
||||
|
||||
with pytest.raises(ValueError, match="already exists"):
|
||||
table.create_fts_index("text")
|
||||
|
||||
|
||||
def test_create_index_multiple_columns(tmp_path, table):
|
||||
table.create_fts_index(["text", "text2"])
|
||||
|
||||
@@ -21,8 +21,8 @@ import lance
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from pydantic import BaseModel
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from lancedb.conftest import MockTextEmbeddingFunction
|
||||
from lancedb.db import LanceDBConnection
|
||||
@@ -226,39 +226,38 @@ def test_versioning(db):
|
||||
|
||||
|
||||
def test_create_index_method():
|
||||
with patch.object(LanceTable, "_reset_dataset", return_value=None):
|
||||
with patch.object(
|
||||
LanceTable, "_dataset", new_callable=PropertyMock
|
||||
) as mock_dataset:
|
||||
# Setup mock responses
|
||||
mock_dataset.return_value.create_index.return_value = None
|
||||
with patch.object(
|
||||
LanceTable, "_dataset", new_callable=PropertyMock
|
||||
) as mock_dataset:
|
||||
# Setup mock responses
|
||||
mock_dataset.return_value.create_index.return_value = None
|
||||
|
||||
# Create a LanceTable object
|
||||
connection = LanceDBConnection(uri="mock.uri")
|
||||
table = LanceTable(connection, "test_table")
|
||||
# Create a LanceTable object
|
||||
connection = LanceDBConnection(uri="mock.uri")
|
||||
table = LanceTable(connection, "test_table")
|
||||
|
||||
# Call the create_index method
|
||||
table.create_index(
|
||||
metric="L2",
|
||||
num_partitions=256,
|
||||
num_sub_vectors=96,
|
||||
vector_column_name="vector",
|
||||
replace=True,
|
||||
index_cache_size=256,
|
||||
)
|
||||
# Call the create_index method
|
||||
table.create_index(
|
||||
metric="L2",
|
||||
num_partitions=256,
|
||||
num_sub_vectors=96,
|
||||
vector_column_name="vector",
|
||||
replace=True,
|
||||
index_cache_size=256,
|
||||
)
|
||||
|
||||
# Check that the _dataset.create_index method was called
|
||||
# with the right parameters
|
||||
mock_dataset.return_value.create_index.assert_called_once_with(
|
||||
column="vector",
|
||||
index_type="IVF_PQ",
|
||||
metric="L2",
|
||||
num_partitions=256,
|
||||
num_sub_vectors=96,
|
||||
replace=True,
|
||||
accelerator=None,
|
||||
index_cache_size=256,
|
||||
)
|
||||
# Check that the _dataset.create_index method was called
|
||||
# with the right parameters
|
||||
mock_dataset.return_value.create_index.assert_called_once_with(
|
||||
column="vector",
|
||||
index_type="IVF_PQ",
|
||||
metric="L2",
|
||||
num_partitions=256,
|
||||
num_sub_vectors=96,
|
||||
replace=True,
|
||||
accelerator=None,
|
||||
index_cache_size=256,
|
||||
)
|
||||
|
||||
|
||||
def test_add_with_nans(db):
|
||||
|
||||
@@ -11,7 +11,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from lancedb.util import get_uri_scheme
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
from lancedb.util import get_uri_scheme, join_uri
|
||||
|
||||
|
||||
def test_normalize_uri():
|
||||
@@ -28,3 +33,55 @@ def test_normalize_uri():
|
||||
for uri, expected_scheme in zip(uris, schemes):
|
||||
parsed_scheme = get_uri_scheme(uri)
|
||||
assert parsed_scheme == expected_scheme
|
||||
|
||||
|
||||
def test_join_uri_remote():
|
||||
schemes = ["s3", "az", "gs"]
|
||||
for scheme in schemes:
|
||||
expected = f"{scheme}://bucket/path/to/table.lance"
|
||||
base_uri = f"{scheme}://bucket/path/to/"
|
||||
parts = ["table.lance"]
|
||||
assert join_uri(base_uri, *parts) == expected
|
||||
|
||||
base_uri = f"{scheme}://bucket"
|
||||
parts = ["path", "to", "table.lance"]
|
||||
assert join_uri(base_uri, *parts) == expected
|
||||
|
||||
|
||||
# skip this test if on windows
|
||||
@pytest.mark.skipif(os.name == "nt", reason="Windows paths are not POSIX")
|
||||
def test_join_uri_posix():
|
||||
for base in [
|
||||
# relative path
|
||||
"relative/path",
|
||||
"relative/path/",
|
||||
# an absolute path
|
||||
"/absolute/path",
|
||||
"/absolute/path/",
|
||||
# a file URI
|
||||
"file:///absolute/path",
|
||||
"file:///absolute/path/",
|
||||
]:
|
||||
joined = join_uri(base, "table.lance")
|
||||
assert joined == str(pathlib.Path(base) / "table.lance")
|
||||
joined = join_uri(pathlib.Path(base), "table.lance")
|
||||
assert joined == pathlib.Path(base) / "table.lance"
|
||||
|
||||
|
||||
# skip this test if not on windows
|
||||
@pytest.mark.skipif(os.name != "nt", reason="Windows paths are not POSIX")
|
||||
def test_local_join_uri_windows():
|
||||
# https://learn.microsoft.com/en-us/dotnet/standard/io/file-path-formats
|
||||
for base in [
|
||||
# windows relative path
|
||||
"relative\\path",
|
||||
"relative\\path\\",
|
||||
# windows absolute path from current drive
|
||||
"c:\\absolute\\path",
|
||||
# relative path from root of current drive
|
||||
"\\relative\\path",
|
||||
]:
|
||||
joined = join_uri(base, "table.lance")
|
||||
assert joined == str(pathlib.Path(base) / "table.lance")
|
||||
joined = join_uri(pathlib.Path(base), "table.lance")
|
||||
assert joined == pathlib.Path(base) / "table.lance"
|
||||
|
||||
@@ -23,7 +23,7 @@ pub enum Error {
|
||||
#[snafu(display("column '{name}' is missing"))]
|
||||
MissingColumn { name: String },
|
||||
#[snafu(display("{name}: {message}"))]
|
||||
RangeError { name: String, message: String },
|
||||
OutOfRange { name: String, message: String },
|
||||
#[snafu(display("{index_type} is not a valid index type"))]
|
||||
InvalidIndexType { index_type: String },
|
||||
|
||||
|
||||
@@ -65,12 +65,10 @@ fn get_index_params_builder(
|
||||
obj.get_opt::<JsString, _, _>(cx, "index_name")?
|
||||
.map(|s| index_builder.index_name(s.value(cx)));
|
||||
|
||||
obj.get_opt::<JsString, _, _>(cx, "metric_type")?
|
||||
.map(|s| MetricType::try_from(s.value(cx).as_str()))
|
||||
.map(|mt| {
|
||||
let metric_type = mt.unwrap();
|
||||
index_builder.metric_type(metric_type);
|
||||
});
|
||||
if let Some(metric_type) = obj.get_opt::<JsString, _, _>(cx, "metric_type")? {
|
||||
let metric_type = MetricType::try_from(metric_type.value(cx).as_str()).unwrap();
|
||||
index_builder.metric_type(metric_type);
|
||||
}
|
||||
|
||||
let num_partitions = obj.get_opt_usize(cx, "num_partitions")?;
|
||||
let max_iters = obj.get_opt_usize(cx, "max_iters")?;
|
||||
@@ -85,23 +83,29 @@ fn get_index_params_builder(
|
||||
index_builder.ivf_params(ivf_params)
|
||||
});
|
||||
|
||||
obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")?
|
||||
.map(|s| pq_params.use_opq = s.value(cx));
|
||||
if let Some(use_opq) = obj.get_opt::<JsBoolean, _, _>(cx, "use_opq")? {
|
||||
pq_params.use_opq = use_opq.value(cx);
|
||||
}
|
||||
|
||||
obj.get_opt_usize(cx, "num_sub_vectors")?
|
||||
.map(|s| pq_params.num_sub_vectors = s);
|
||||
if let Some(num_sub_vectors) = obj.get_opt_usize(cx, "num_sub_vectors")? {
|
||||
pq_params.num_sub_vectors = num_sub_vectors;
|
||||
}
|
||||
|
||||
obj.get_opt_usize(cx, "num_bits")?
|
||||
.map(|s| pq_params.num_bits = s);
|
||||
if let Some(num_bits) = obj.get_opt_usize(cx, "num_bits")? {
|
||||
pq_params.num_bits = num_bits;
|
||||
}
|
||||
|
||||
obj.get_opt_usize(cx, "max_iters")?
|
||||
.map(|s| pq_params.max_iters = s);
|
||||
if let Some(max_iters) = obj.get_opt_usize(cx, "max_iters")? {
|
||||
pq_params.max_iters = max_iters;
|
||||
}
|
||||
|
||||
obj.get_opt_usize(cx, "max_opq_iters")?
|
||||
.map(|s| pq_params.max_opq_iters = s);
|
||||
if let Some(max_opq_iters) = obj.get_opt_usize(cx, "max_opq_iters")? {
|
||||
pq_params.max_opq_iters = max_opq_iters;
|
||||
}
|
||||
|
||||
obj.get_opt::<JsBoolean, _, _>(cx, "replace")?
|
||||
.map(|s| index_builder.replace(s.value(cx)));
|
||||
if let Some(replace) = obj.get_opt::<JsBoolean, _, _>(cx, "replace")? {
|
||||
index_builder.replace(replace.value(cx));
|
||||
}
|
||||
|
||||
Ok(index_builder)
|
||||
}
|
||||
|
||||
@@ -47,15 +47,15 @@ fn f64_to_u32_safe(n: f64, key: &str) -> Result<u32> {
|
||||
use conv::*;
|
||||
|
||||
n.approx_as::<u32>().map_err(|e| match e {
|
||||
FloatError::NegOverflow(_) => Error::RangeError {
|
||||
FloatError::NegOverflow(_) => Error::OutOfRange {
|
||||
name: key.into(),
|
||||
message: "must be > 0".to_string(),
|
||||
},
|
||||
FloatError::PosOverflow(_) => Error::RangeError {
|
||||
FloatError::PosOverflow(_) => Error::OutOfRange {
|
||||
name: key.into(),
|
||||
message: format!("must be < {}", u32::MAX),
|
||||
},
|
||||
FloatError::NotANumber(_) => Error::RangeError {
|
||||
FloatError::NotANumber(_) => Error::OutOfRange {
|
||||
name: key.into(),
|
||||
message: "not a valid number".to_string(),
|
||||
},
|
||||
@@ -66,15 +66,15 @@ fn f64_to_usize_safe(n: f64, key: &str) -> Result<usize> {
|
||||
use conv::*;
|
||||
|
||||
n.approx_as::<usize>().map_err(|e| match e {
|
||||
FloatError::NegOverflow(_) => Error::RangeError {
|
||||
FloatError::NegOverflow(_) => Error::OutOfRange {
|
||||
name: key.into(),
|
||||
message: "must be > 0".to_string(),
|
||||
},
|
||||
FloatError::PosOverflow(_) => Error::RangeError {
|
||||
FloatError::PosOverflow(_) => Error::OutOfRange {
|
||||
name: key.into(),
|
||||
message: format!("must be < {}", usize::MAX),
|
||||
},
|
||||
FloatError::NotANumber(_) => Error::RangeError {
|
||||
FloatError::NotANumber(_) => Error::OutOfRange {
|
||||
name: key.into(),
|
||||
message: "not a valid number".to_string(),
|
||||
},
|
||||
|
||||
@@ -25,11 +25,11 @@ impl JsQuery {
|
||||
let limit = query_obj
|
||||
.get_opt::<JsNumber, _, _>(&mut cx, "_limit")?
|
||||
.map(|value| {
|
||||
let limit = value.value(&mut cx) as u64;
|
||||
if limit <= 0 {
|
||||
let limit = value.value(&mut cx);
|
||||
if limit <= 0.0 {
|
||||
panic!("Limit must be a positive integer");
|
||||
}
|
||||
limit
|
||||
limit as u64
|
||||
});
|
||||
let select = query_obj
|
||||
.get_opt::<JsArray, _, _>(&mut cx, "_select")?
|
||||
@@ -73,7 +73,7 @@ impl JsQuery {
|
||||
|
||||
rt.spawn(async move {
|
||||
let mut builder = table
|
||||
.search(query.map(|q| Float32Array::from(q)))
|
||||
.search(query.map(Float32Array::from))
|
||||
.refine_factor(refine_factor)
|
||||
.nprobes(nprobes)
|
||||
.filter(filter)
|
||||
|
||||
@@ -45,7 +45,7 @@ impl JsTable {
|
||||
let table_name = cx.argument::<JsString>(0)?.value(&mut cx);
|
||||
let buffer = cx.argument::<JsBuffer>(1)?;
|
||||
let (batches, schema) =
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&cx)).or_throw(&mut cx)?;
|
||||
|
||||
// Write mode
|
||||
let mode = match cx.argument::<JsString>(2)?.value(&mut cx).as_str() {
|
||||
@@ -93,7 +93,7 @@ impl JsTable {
|
||||
let buffer = cx.argument::<JsBuffer>(0)?;
|
||||
let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
|
||||
let (batches, schema) =
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&mut cx)).or_throw(&mut cx)?;
|
||||
arrow_buffer_to_record_batch(buffer.as_slice(&cx)).or_throw(&mut cx)?;
|
||||
let rt = runtime(&mut cx)?;
|
||||
let channel = cx.channel();
|
||||
let mut table = js_table.table.clone();
|
||||
@@ -186,7 +186,7 @@ impl JsTable {
|
||||
.downcast_or_throw::<JsString, _>(&mut cx)?;
|
||||
|
||||
let value = updates_arg
|
||||
.get_value(&mut cx, property.clone())?
|
||||
.get_value(&mut cx, property)?
|
||||
.downcast_or_throw::<JsString, _>(&mut cx)?;
|
||||
|
||||
let property = property.value(&mut cx);
|
||||
@@ -216,7 +216,7 @@ impl JsTable {
|
||||
.map(|(k, v)| (k.as_str(), v.as_str()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let predicate = predicate.as_ref().map(|s| s.as_str());
|
||||
let predicate = predicate.as_deref();
|
||||
|
||||
let update_result = table.update(predicate, updates_arg).await;
|
||||
deferred.settle_with(&channel, move |mut cx| {
|
||||
|
||||
@@ -26,7 +26,7 @@ use futures::{stream::BoxStream, FutureExt, StreamExt};
|
||||
use lance::io::object_store::WrappingObjectStore;
|
||||
use object_store::{
|
||||
path::Path, Error, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore,
|
||||
Result,
|
||||
PutOptions, PutResult, Result,
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
@@ -72,13 +72,28 @@ impl PrimaryOnly for Path {
|
||||
/// Note: this object store does not mirror writes to *.manifest files
|
||||
#[async_trait]
|
||||
impl ObjectStore for MirroringObjectStore {
|
||||
async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> {
|
||||
async fn put(&self, location: &Path, bytes: Bytes) -> Result<PutResult> {
|
||||
if location.primary_only() {
|
||||
self.primary.put(location, bytes).await
|
||||
} else {
|
||||
self.secondary.put(location, bytes.clone()).await?;
|
||||
self.primary.put(location, bytes).await?;
|
||||
Ok(())
|
||||
self.primary.put(location, bytes).await
|
||||
}
|
||||
}
|
||||
|
||||
async fn put_opts(
|
||||
&self,
|
||||
location: &Path,
|
||||
bytes: Bytes,
|
||||
options: PutOptions,
|
||||
) -> Result<PutResult> {
|
||||
if location.primary_only() {
|
||||
self.primary.put_opts(location, bytes, options).await
|
||||
} else {
|
||||
self.secondary
|
||||
.put_opts(location, bytes.clone(), options.clone())
|
||||
.await?;
|
||||
self.primary.put_opts(location, bytes, options).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,8 +144,8 @@ impl ObjectStore for MirroringObjectStore {
|
||||
self.primary.delete(location).await
|
||||
}
|
||||
|
||||
async fn list(&self, prefix: Option<&Path>) -> Result<BoxStream<'_, Result<ObjectMeta>>> {
|
||||
self.primary.list(prefix).await
|
||||
fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result<ObjectMeta>> {
|
||||
self.primary.list(prefix)
|
||||
}
|
||||
|
||||
async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result<ListResult> {
|
||||
|
||||
Reference in New Issue
Block a user