Compare commits

...

10 Commits

Author SHA1 Message Date
Lance Release
245786fed7 [python] Bump version: 0.5.7 → 0.6.0 2024-02-29 16:03:01 +00:00
BubbleCal
edd9a043f8 chore: enable test for dropping table (#1038)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2024-02-29 15:00:24 +08:00
natcharacter
38c09fc294 A simple base image that installs the dependencies necessary to use FT… (#1036)
A simple base image that installs the dependencies necessary to use FTS
and hybrid search

---------

Co-authored-by: Nat Roth <natroth@Nats-MacBook-Pro.local>
Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
2024-02-28 09:38:05 -08:00
Rob Meng
ebaa2dede5 chore: upgrade to lance 0.10.1 (#1034)
upgrade to lance 0.10.1 and update docstrings to reflect the dynamic
projection options
2024-02-28 11:06:46 -05:00
BubbleCal
ba7618a026 chore(rust): report the TableNotFound error while dropping a non-existent table (#1022)
this will work after upgrading lance, once
https://github.com/lancedb/lance/pull/1995 is merged;
see #884 for details

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2024-02-28 04:46:39 -08:00
Weston Pace
a6bcbd007b feat: add a basic async python client starting point (#1014)
This changes `lancedb` from a "pure Python" setuptools project to a
maturin project and adds a Rust lancedb dependency.

The async python client is extremely minimal (only `connect` and
`Connection.table_names` are supported). The purpose of this PR is to
get the infrastructure in place for building out the rest of the async
client.

Although this is not technically a breaking change (no APIs are
changing), it is still a considerable change in the way the wheels are
built, because they now include the native shared library.
2024-02-27 04:52:02 -08:00
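
A minimal sketch of the async surface this commit introduces, based on the `connect_async` and `AsyncConnection.table_names` definitions in the Python diff below; the database path is hypothetical:

```python
import asyncio

import lancedb


async def main():
    # connect_async wraps the new Rust-backed client (hypothetical local path)
    db = await lancedb.connect_async("/tmp/lancedb-demo")
    # table_names is the only table operation exposed so far
    print(await db.table_names())


asyncio.run(main())
```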
Will Jones
5af74b5aca feat: {add|alter|drop}_columns APIs (#1015)
Initial work for #959. This exposes the basic functionality for each in
all of the APIs. Will add user guide documentation in a later PR.
2024-02-26 11:04:53 -08:00
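
A rough sketch of the new Python column APIs, based on the `LanceTable.add_columns`, `alter_columns`, and `drop_columns` methods in the diff below; the table and column names are illustrative:

```python
import lancedb

db = lancedb.connect("/tmp/lancedb-demo")  # hypothetical path
table = db.create_table("items", [{"vector": [0.1, 0.2], "price": 10.0}])

# Add a column whose values come from a SQL expression over existing columns
table.add_columns({"double_price": "price * 2"})

# Each alteration is a dict with a "path" plus "rename" and/or "nullable"
table.alter_columns({"path": "double_price", "rename": "price2"})

# Dropping is metadata-only; a later compaction rewrites the data files
table.drop_columns(["price2"])
```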
Weston Pace
8a52619bc0 refactor: change arrow from a direct dependency to a peer dependency (#984)
BREAKING CHANGE: users will now need to npm install `apache-arrow` and
`@apache-arrow/ts` themselves.
2024-02-23 14:08:39 -08:00
Lance Release
314d4c93e5 Updating package-lock.json 2024-02-23 05:11:22 +00:00
Lance Release
c5471ee694 Updating package-lock.json 2024-02-23 03:57:39 +00:00
99 changed files with 1887 additions and 217 deletions

View File

@@ -0,0 +1,58 @@
# We create a composite action to be re-used both for testing and for releasing
name: build-linux-wheel
description: "Build a manylinux wheel for lance"
inputs:
python-minor-version:
description: "8, 9, 10, 11, 12"
required: true
args:
description: "--release"
required: false
default: ""
arm-build:
description: "Build for arm64 instead of x86_64"
# Note: this does *not* mean the host is arm64, since we might be cross-compiling.
required: false
default: "false"
runs:
using: "composite"
steps:
- name: CONFIRM ARM BUILD
shell: bash
run: |
echo "ARM BUILD: ${{ inputs.arm-build }}"
- name: Build x86_64 Manylinux wheel
if: ${{ inputs.arm-build == 'false' }}
uses: PyO3/maturin-action@v1
with:
command: build
working-directory: python
target: x86_64-unknown-linux-gnu
manylinux: "2_17"
args: ${{ inputs.args }}
before-script-linux: |
set -e
yum install -y openssl-devel \
&& curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$(uname -m).zip > /tmp/protoc.zip \
&& unzip /tmp/protoc.zip -d /usr/local \
&& rm /tmp/protoc.zip
- name: Build Arm Manylinux Wheel
if: ${{ inputs.arm-build == 'true' }}
uses: PyO3/maturin-action@v1
with:
command: build
working-directory: python
target: aarch64-unknown-linux-gnu
manylinux: "2_24"
args: ${{ inputs.args }}
before-script-linux: |
set -e
apt install -y unzip
if [ $(uname -m) = "x86_64" ]; then
PROTOC_ARCH="x86_64"
else
PROTOC_ARCH="aarch_64"
fi
curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$PROTOC_ARCH.zip > /tmp/protoc.zip \
&& unzip /tmp/protoc.zip -d /usr/local \
&& rm /tmp/protoc.zip

View File

@@ -0,0 +1,25 @@
# We create a composite action to be re-used both for testing and for releasing
name: build_wheel
description: "Build a lance wheel"
inputs:
python-minor-version:
description: "8, 9, 10, 11"
required: true
args:
description: "--release"
required: false
default: ""
runs:
using: "composite"
steps:
- name: Install macos dependency
shell: bash
run: |
brew install protobuf
- name: Build wheel
uses: PyO3/maturin-action@v1
with:
command: build
args: ${{ inputs.args }}
working-directory: python
interpreter: 3.${{ inputs.python-minor-version }}

View File

@@ -0,0 +1,33 @@
# We create a composite action to be re-used both for testing and for releasing
name: build_wheel
description: "Build a lance wheel"
inputs:
python-minor-version:
description: "8, 9, 10, 11"
required: true
args:
description: "--release"
required: false
default: ""
runs:
using: "composite"
steps:
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Build wheel
uses: PyO3/maturin-action@v1
with:
command: build
args: ${{ inputs.args }}
working-directory: python
- uses: actions/upload-artifact@v3
with:
name: windows-wheels
path: python\target\wheels

View File

@@ -2,30 +2,91 @@ name: PyPI Publish
on:
release:
types: [ published ]
types: [published]
jobs:
publish:
runs-on: ubuntu-latest
# Only runs on tags that matches the python-make-release action
if: startsWith(github.ref, 'refs/tags/python-v')
defaults:
run:
shell: bash
working-directory: python
linux:
timeout-minutes: 60
strategy:
matrix:
python-minor-version: ["8"]
platform:
- x86_64
- aarch64
runs-on: "ubuntu-22.04"
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: "3.8"
- name: Build distribution
run: |
ls -la
pip install wheel setuptools --upgrade
python setup.py sdist bdist_wheel
- name: Publish
uses: pypa/gh-action-pypi-publish@v1.8.5
python-version: 3.${{ matrix.python-minor-version }}
- uses: ./.github/workflows/build_linux_wheel
with:
password: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
packages-dir: python/dist
python-minor-version: ${{ matrix.python-minor-version }}
args: "--release --strip"
arm-build: ${{ matrix.platform == 'aarch64' }}
- uses: ./.github/workflows/upload_wheel
with:
token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
repo: "pypi"
mac:
timeout-minutes: 60
runs-on: ${{ matrix.config.runner }}
strategy:
matrix:
python-minor-version: ["8"]
config:
- target: x86_64-apple-darwin
runner: macos-13
- target: aarch64-apple-darwin
runner: macos-14
env:
MACOSX_DEPLOYMENT_TARGET: 10.15
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.12
- uses: ./.github/workflows/build_mac_wheel
with:
python-minor-version: ${{ matrix.python-minor-version }}
args: "--release --strip --target ${{ matrix.config.target }}"
- uses: ./.github/workflows/upload_wheel
with:
python-minor-version: ${{ matrix.python-minor-version }}
token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
repo: "pypi"
windows:
timeout-minutes: 60
runs-on: windows-latest
strategy:
matrix:
python-minor-version: ["8"]
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.${{ matrix.python-minor-version }}
- uses: ./.github/workflows/build_windows_wheel
with:
python-minor-version: ${{ matrix.python-minor-version }}
args: "--release --strip"
vcpkg_token: ${{ secrets.VCPKG_GITHUB_PACKAGES }}
- uses: ./.github/workflows/upload_wheel
with:
python-minor-version: ${{ matrix.python-minor-version }}
token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
repo: "pypi"

View File

@@ -14,49 +14,133 @@ concurrency:
cancel-in-progress: true
jobs:
linux:
lint:
name: "Lint"
timeout-minutes: 30
strategy:
matrix:
python-minor-version: [ "8", "11" ]
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.${{ matrix.python-minor-version }}
- name: Install lancedb
run: |
pip install -e .[tests]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest pytest-mock ruff
- name: Format check
run: ruff format --check .
- name: Lint
run: ruff .
- name: Run tests
run: pytest -m "not slow" -x -v --durations=30 tests
- name: doctest
run: pytest --doctest-modules lancedb
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install ruff
run: |
pip install ruff
- name: Format check
run: ruff format --check .
- name: Lint
run: ruff .
doctest:
name: "Doctest"
timeout-minutes: 30
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip"
- name: Install protobuf
run: |
sudo apt update
sudo apt install -y protobuf-compiler
- uses: Swatinem/rust-cache@v2
with:
workspaces: python
- name: Install
run: |
pip install -e .[tests,dev,embeddings]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install mlx
- name: Doctest
run: pytest --doctest-modules python/lancedb
linux:
name: "Linux: python-3.${{ matrix.python-minor-version }}"
timeout-minutes: 30
strategy:
matrix:
python-minor-version: ["8", "11"]
runs-on: "ubuntu-22.04"
defaults:
run:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Install protobuf
run: |
sudo apt update
sudo apt install -y protobuf-compiler
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.${{ matrix.python-minor-version }}
- uses: Swatinem/rust-cache@v2
with:
workspaces: python
- uses: ./.github/workflows/build_linux_wheel
- uses: ./.github/workflows/run_tests
# Make sure wheels are not included in the Rust cache
- name: Delete wheels
run: rm -rf target/wheels
platform:
name: "Platform: ${{ matrix.config.name }}"
name: "Mac: ${{ matrix.config.name }}"
timeout-minutes: 30
strategy:
matrix:
config:
- name: x86 Mac
- name: x86
runner: macos-13
- name: Arm Mac
- name: Arm
runner: macos-14
- name: x86 Windows
runs-on: "${{ matrix.config.runner }}"
defaults:
run:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- uses: Swatinem/rust-cache@v2
with:
workspaces: python
- uses: ./.github/workflows/build_mac_wheel
- uses: ./.github/workflows/run_tests
# Make sure wheels are not included in the Rust cache
- name: Delete wheels
run: rm -rf target/wheels
windows:
name: "Windows: ${{ matrix.config.name }}"
timeout-minutes: 30
strategy:
matrix:
config:
- name: x86
runner: windows-latest
runs-on: "${{ matrix.config.runner }}"
defaults:
@@ -64,21 +148,22 @@ jobs:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install lancedb
run: |
pip install -e .[tests]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest pytest-mock
- name: Run tests
run: pytest -m "not slow" -x -v --durations=30 tests
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- uses: Swatinem/rust-cache@v2
with:
workspaces: python
- uses: ./.github/workflows/build_windows_wheel
- uses: ./.github/workflows/run_tests
# Make sure wheels are not included in the Rust cache
- name: Delete wheels
run: rm -rf target/wheels
pydantic1x:
timeout-minutes: 30
runs-on: "ubuntu-22.04"
@@ -87,21 +172,22 @@ jobs:
shell: bash
working-directory: python
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Install lancedb
run: |
pip install "pydantic<2"
pip install -e .[tests]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
pip install pytest pytest-mock
- name: Run tests
run: pytest -m "not slow" -x -v --durations=30 tests
- name: doctest
run: pytest --doctest-modules lancedb
- uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Install lancedb
run: |
pip install "pydantic<2"
pip install -e .[tests]
pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985
- name: Run tests
run: pytest -m "not slow" -x -v --durations=30 python/tests

17 .github/workflows/run_tests/action.yml vendored Normal file
View File

@@ -0,0 +1,17 @@
name: run-tests
description: "Install lance wheel and run unit tests"
inputs:
python-minor-version:
required: true
description: "8 9 10 11 12"
runs:
using: "composite"
steps:
- name: Install lancedb
shell: bash
run: |
pip3 install $(ls target/wheels/lancedb-*.whl)[tests,dev,embeddings]
- name: pytest
shell: bash
run: pytest -m "not slow" -x -v --durations=30 python/python/tests

View File

@@ -0,0 +1,29 @@
name: upload-wheel
description: "Upload wheels to Pypi"
inputs:
os:
required: true
description: "ubuntu-22.04 or macos-13"
repo:
required: false
description: "pypi or testpypi"
default: "pypi"
token:
required: true
description: "release token for the repo"
runs:
using: "composite"
steps:
- name: Install dependencies
shell: bash
run: |
python -m pip install --upgrade pip
pip install twine
- name: Publish wheel
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ inputs.token }}
shell: bash
run: twine upload --repository ${{ inputs.repo }} target/wheels/lancedb-*.whl

5 .gitignore vendored
View File

@@ -22,6 +22,11 @@ python/dist
**/.hypothesis
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
## Javascript
*.node
**/node_modules

View File

@@ -1,5 +1,5 @@
[workspace]
members = ["rust/ffi/node", "rust/lancedb", "nodejs"]
members = ["rust/ffi/node", "rust/lancedb", "nodejs", "python"]
# Python package needs to be built by maturin.
exclude = ["python"]
resolver = "2"
@@ -14,10 +14,10 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
categories = ["database-implementations"]
[workspace.dependencies]
lance = { "version" = "=0.9.18", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.9.18" }
lance-linalg = { "version" = "=0.9.18" }
lance-testing = { "version" = "=0.9.18" }
lance = { "version" = "=0.10.1", "features" = ["dynamodb"] }
lance-index = { "version" = "=0.10.1" }
lance-linalg = { "version" = "=0.10.1" }
lance-testing = { "version" = "=0.10.1" }
# Note that this one does not include pyarrow
arrow = { version = "50.0", optional = false }
arrow-array = "50.0"

27 dockerfiles/Dockerfile Normal file
View File

@@ -0,0 +1,27 @@
# Simple base Dockerfile that provides the basic dependencies required to run lance with FTS and Hybrid Search
# Usage: docker build -t lancedb:latest -f Dockerfile .
FROM python:3.10-slim-buster
# Install Rust
RUN apt-get update && apt-get install -y curl build-essential && \
curl https://sh.rustup.rs -sSf | sh -s -- -y
# Set the environment variable for Rust
ENV PATH="/root/.cargo/bin:${PATH}"
# Install protobuf compiler
RUN apt-get install -y protobuf-compiler && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN apt-get -y update &&\
apt-get -y upgrade && \
apt-get -y install git
# Verify installations
RUN python --version && \
rustc --version && \
protoc --version
RUN pip install tantivy lancedb

44 node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.4.10",
"version": "0.4.11",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.4.10",
"version": "0.4.11",
"cpu": [
"x64",
"arm64"
@@ -53,11 +53,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.4.10",
"@lancedb/vectordb-darwin-x64": "0.4.10",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.10",
"@lancedb/vectordb-linux-x64-gnu": "0.4.10",
"@lancedb/vectordb-win32-x64-msvc": "0.4.10"
"@lancedb/vectordb-darwin-arm64": "0.4.11",
"@lancedb/vectordb-darwin-x64": "0.4.11",
"@lancedb/vectordb-linux-arm64-gnu": "0.4.11",
"@lancedb/vectordb-linux-x64-gnu": "0.4.11",
"@lancedb/vectordb-win32-x64-msvc": "0.4.11"
}
},
"node_modules/@75lb/deep-merge": {
@@ -329,9 +329,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.4.10",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.10.tgz",
"integrity": "sha512-y/uHOGb0g15pvqv5tdTyZ6oN+0QVpBmZDzKFWW6pPbuSZjB2uPqcs+ti0RB+AUdmS21kavVQqaNsw/HLKEGrHA==",
"version": "0.4.11",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.11.tgz",
"integrity": "sha512-JDOKmFnuJPFkA7ZmrzBJolROwSjWr7yMvAbi40uLBc25YbbVezodd30u2EFtIwWwtk1GqNYRZ49FZOElKYeC/Q==",
"cpu": [
"arm64"
],
@@ -341,9 +341,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.4.10",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.10.tgz",
"integrity": "sha512-XbfR58OkQpAe0xMSTrwJh9ZjGSzG9EZ7zwO6HfYem8PxcLYAcC6eWRWoSG/T0uObyrPTcYYyvHsp0eNQWYBFAQ==",
"version": "0.4.11",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.11.tgz",
"integrity": "sha512-iy6r+8tp2v1EFgJV52jusXtxgO6NY6SkpOdX41xPqN2mQWMkfUAR9Xtks1mgknjPOIKH4MRc8ZS0jcW/UWmilQ==",
"cpu": [
"x64"
],
@@ -353,9 +353,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.4.10",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.10.tgz",
"integrity": "sha512-x40WKH9b+KxorRmKr9G7fv8p5mMj8QJQvRMA0v6v+nbZHr2FLlAZV+9mvhHOnm4AGIkPP5335cUgv6Qz6hgwkQ==",
"version": "0.4.11",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.11.tgz",
"integrity": "sha512-5K6IVcTMuH0SZBjlqB5Gg39WC889FpTwIWKufxzQMMXrzxo5J3lKUHVoR28RRlNhDF2d9kZXBEyCpIfDFsV9iQ==",
"cpu": [
"arm64"
],
@@ -365,9 +365,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.4.10",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.10.tgz",
"integrity": "sha512-CTGPpuzlqq2nVjUxI9gAJOT1oBANIovtIaFsOmBSnEAHgX7oeAxKy2b6L/kJzsgqSzvR5vfLwYcWFrr6ZmBxSA==",
"version": "0.4.11",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.11.tgz",
"integrity": "sha512-hF9ZChsdqKqqnivOzd9mE7lC3PmhZadXtwThi2RrsPiOLoEaGDfmr6Ni3amVQnB3bR8YEJtTxdQxe0NC4uW/8g==",
"cpu": [
"x64"
],
@@ -377,9 +377,9 @@
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.4.10",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.10.tgz",
"integrity": "sha512-Fd7r74coZyrKzkfXg4WthqOL+uKyJyPTia6imcrMNqKOlTGdKmHf02Qi2QxWZrFaabkRYo4Tpn5FeRJ3yYX8CA==",
"version": "0.4.11",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.11.tgz",
"integrity": "sha512-0+9ut1ccKoqIyGxsVixwx3771Z+DXpl5WfSmOeA8kf3v3jlOg2H+0YUahiXLDid2ju+yeLPrAUYm7A1gKHVhew==",
"cpu": [
"x64"
],

View File

@@ -61,11 +61,13 @@
"uuid": "^9.0.0"
},
"dependencies": {
"@apache-arrow/ts": "^14.0.2",
"@neon-rs/load": "^0.0.74",
"apache-arrow": "^14.0.2",
"axios": "^1.4.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
"apache-arrow": "^14.0.2"
},
"os": [
"darwin",
"linux",

View File

@@ -42,7 +42,10 @@ const {
tableCompactFiles,
tableListIndices,
tableIndexStats,
tableSchema
tableSchema,
tableAddColumns,
tableAlterColumns,
tableDropColumns
// eslint-disable-next-line @typescript-eslint/no-var-requires
} = require('../native.js')
@@ -500,6 +503,59 @@ export interface Table<T = number[]> {
filter(value: string): Query<T>
schema: Promise<Schema>
// TODO: Support BatchUDF
/**
* Add new columns with defined values.
*
* @param newColumnTransforms pairs of column names and the SQL expression to use
* to calculate the value of the new column. These
* expressions will be evaluated for each row in the
* table, and can reference existing columns in the table.
*/
addColumns(newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void>
/**
* Alter the name or nullability of columns.
*
* @param columnAlterations One or more alterations to apply to columns.
*/
alterColumns(columnAlterations: ColumnAlteration[]): Promise<void>
/**
* Drop one or more columns from the dataset
*
* This is a metadata-only operation and does not remove the data from the
* underlying storage. In order to remove the data, you must subsequently
* call ``compact_files`` to rewrite the data without the removed columns and
* then call ``cleanup_files`` to remove the old files.
*
* @param columnNames The names of the columns to drop. These can be nested
* column references (e.g. "a.b.c") or top-level column
* names (e.g. "a").
*/
dropColumns(columnNames: string[]): Promise<void>
}
/**
* A definition of a column alteration. The alteration changes the column at
* `path` to have the new name `name`, to be nullable if `nullable` is true,
* and to have the data type `data_type`. At least one of `rename` or `nullable`
* must be provided.
*/
export interface ColumnAlteration {
/**
* The path to the column to alter. This is a dot-separated path to the column.
* If it is a top-level column then it is just the name of the column. If it is
* a nested column then it is the path to the column, e.g. "a.b.c" for a column
* `c` nested inside a column `b` nested inside a column `a`.
*/
path: string
rename?: string
/**
* Set the new nullability. Note that a nullable column cannot be made non-nullable.
*/
nullable?: boolean
}
export interface UpdateArgs {
@@ -1028,6 +1084,18 @@ export class LocalTable<T = number[]> implements Table<T> {
return false
}
}
async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
return tableAddColumns.call(this._tbl, newColumnTransforms)
}
async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
return tableAlterColumns.call(this._tbl, columnAlterations)
}
async dropColumns (columnNames: string[]): Promise<void> {
return tableDropColumns.call(this._tbl, columnNames)
}
}
export interface CleanupStats {

View File

@@ -25,7 +25,8 @@ import {
type UpdateArgs,
type UpdateSqlArgs,
makeArrowTable,
type MergeInsertArgs
type MergeInsertArgs,
type ColumnAlteration
} from '../index'
import { Query } from '../query'
@@ -474,4 +475,16 @@ export class RemoteTable<T = number[]> implements Table<T> {
numUnindexedRows: results.data.num_unindexed_rows
}
}
async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
throw new Error('Add columns is not yet supported in LanceDB Cloud.')
}
async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
throw new Error('Alter columns is not yet supported in LanceDB Cloud.')
}
async dropColumns (columnNames: string[]): Promise<void> {
throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
}
}

View File

@@ -37,8 +37,10 @@ import {
Utf8,
Table as ArrowTable,
vectorFromArray,
Float64,
Float32,
Float16
Float16,
Int64
} from 'apache-arrow'
const expect = chai.expect
@@ -196,7 +198,7 @@ describe('LanceDB client', function () {
const table = await con.openTable('vectors')
const results = await table
.search([0.1, 0.1])
.select(['is_active'])
.select(['is_active', 'vector'])
.execute()
assert.equal(results.length, 2)
// vector and _distance are always returned
@@ -1057,3 +1059,63 @@ describe('Compact and cleanup', function () {
assert.equal(await table.countRows(), 3)
})
})
describe('schema evolution', function () {
// Create a new sample table
it('can add a new column to the schema', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const table = await con.createTable('vectors', [
{ id: 1n, vector: [0.1, 0.2] }
])
await table.addColumns([{ name: 'price', valueSql: 'cast(10.0 as float)' }])
const expectedSchema = new Schema([
new Field('id', new Int64()),
new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true))),
new Field('price', new Float32())
])
expect(await table.schema).to.deep.equal(expectedSchema)
})
it('can alter the columns in the schema', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const schema = new Schema([
new Field('id', new Int64(), false),
new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true))),
new Field('price', new Float64(), false)
])
const table = await con.createTable('vectors', [
{ id: 1n, vector: [0.1, 0.2], price: 10.0 }
])
expect(await table.schema).to.deep.equal(schema)
await table.alterColumns([
{ path: 'id', rename: 'new_id' },
{ path: 'price', nullable: true }
])
const expectedSchema = new Schema([
new Field('new_id', new Int64(), false),
new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true))),
new Field('price', new Float64(), true)
])
expect(await table.schema).to.deep.equal(expectedSchema)
})
it('can drop a column from the schema', async function () {
const dir = await track().mkdir('lancejs')
const con = await lancedb.connect(dir)
const table = await con.createTable('vectors', [
{ id: 1n, vector: [0.1, 0.2] }
])
await table.dropColumns(['vector'])
const expectedSchema = new Schema([
new Field('id', new Int64(), false)
])
expect(await table.schema).to.deep.equal(expectedSchema)
})
})

View File

@@ -17,7 +17,7 @@ import * as path from "path";
import * as fs from "fs";
import { connect } from "../dist";
import { Schema, Field, Float32, Int32, FixedSizeList } from "apache-arrow";
import { Schema, Field, Float32, Int32, FixedSizeList, Int64, Float64 } from "apache-arrow";
import { makeArrowTable } from "../dist/arrow";
describe("Test creating index", () => {
@@ -214,4 +214,69 @@ describe("Read consistency interval", () => {
expect(await table2.countRows()).toEqual(2n);
}
});
});
describe('schema evolution', function () {
let tmpDir: string;
beforeEach(() => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "schema-evolution-"));
});
// Create a new sample table
it('can add a new column to the schema', async function () {
const con = await connect(tmpDir)
const table = await con.createTable('vectors', [
{ id: 1n, vector: [0.1, 0.2] }
])
await table.addColumns([{ name: 'price', valueSql: 'cast(10.0 as float)' }])
const expectedSchema = new Schema([
new Field('id', new Int64(), true),
new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true)), true),
new Field('price', new Float32(), false)
])
expect(await table.schema()).toEqual(expectedSchema)
});
it('can alter the columns in the schema', async function () {
const con = await connect(tmpDir)
const schema = new Schema([
new Field('id', new Int64(), true),
new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true)), true),
new Field('price', new Float64(), false)
])
const table = await con.createTable('vectors', [
{ id: 1n, vector: [0.1, 0.2] }
])
// Can create a non-nullable column only through addColumns at the moment.
await table.addColumns([{ name: 'price', valueSql: 'cast(10.0 as double)' }])
expect(await table.schema()).toEqual(schema)
await table.alterColumns([
{ path: 'id', rename: 'new_id' },
{ path: 'price', nullable: true }
])
const expectedSchema = new Schema([
new Field('new_id', new Int64(), true),
new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true)), true),
new Field('price', new Float64(), true)
])
expect(await table.schema()).toEqual(expectedSchema)
});
it('can drop a column from the schema', async function () {
const con = await connect(tmpDir)
const table = await con.createTable('vectors', [
{ id: 1n, vector: [0.1, 0.2] }
])
await table.dropColumns(['vector'])
const expectedSchema = new Schema([
new Field('id', new Int64(), true)
])
expect(await table.schema()).toEqual(expectedSchema)
});
});

View File

@@ -12,6 +12,38 @@ export const enum MetricType {
Cosine = 1,
Dot = 2
}
/**
* A definition of a column alteration. The alteration changes the column at
* `path` to have the new name `name`, to be nullable if `nullable` is true,
* and to have the data type `data_type`. At least one of `rename` or `nullable`
* must be provided.
*/
export interface ColumnAlteration {
/**
* The path to the column to alter. This is a dot-separated path to the column.
* If it is a top-level column then it is just the name of the column. If it is
* a nested column then it is the path to the column, e.g. "a.b.c" for a column
* `c` nested inside a column `b` nested inside a column `a`.
*/
path: string
/**
* The new name of the column. If not provided then the name will not be changed.
* This must be distinct from the names of all other columns in the table.
*/
rename?: string
/** Set the new nullability. Note that a nullable column cannot be made non-nullable. */
nullable?: boolean
}
/** A definition of a new column to add to a table. */
export interface AddColumnsSql {
/** The name of the new column. */
name: string
/**
* The values to populate the new column with, as a SQL expression.
* The expression can reference other columns in the table.
*/
valueSql: string
}
export interface ConnectionOptions {
uri: string
apiKey?: string
@@ -89,4 +121,7 @@ export class Table {
delete(predicate: string): Promise<void>
createIndex(): IndexBuilder
query(): Query
addColumns(transforms: Array<AddColumnsSql>): Promise<void>
alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
dropColumns(columns: Array<string>): Promise<void>
}

View File

@@ -13,7 +13,7 @@
// limitations under the License.
import { Schema, tableFromIPC } from "apache-arrow";
import { Table as _NativeTable } from "./native";
import { AddColumnsSql, ColumnAlteration, Table as _NativeTable } from "./native";
import { toBuffer, Data } from "./arrow";
import { Query } from "./query";
import { IndexBuilder } from "./indexer";
@@ -150,4 +150,42 @@ export class Table {
}
return q;
}
// TODO: Support BatchUDF
/**
* Add new columns with defined values.
*
* @param newColumnTransforms pairs of column names and the SQL expression to use
* to calculate the value of the new column. These
* expressions will be evaluated for each row in the
* table, and can reference existing columns in the table.
*/
async addColumns(newColumnTransforms: AddColumnsSql[]): Promise<void> {
await this.inner.addColumns(newColumnTransforms);
}
/**
* Alter the name or nullability of columns.
*
* @param columnAlterations One or more alterations to apply to columns.
*/
async alterColumns(columnAlterations: ColumnAlteration[]): Promise<void> {
await this.inner.alterColumns(columnAlterations);
}
/**
* Drop one or more columns from the dataset
*
* This is a metadata-only operation and does not remove the data from the
* underlying storage. In order to remove the data, you must subsequently
* call ``compact_files`` to rewrite the data without the removed columns and
* then call ``cleanup_files`` to remove the old files.
*
* @param columnNames The names of the columns to drop. These can be nested
* column references (e.g. "a.b.c") or top-level column
* names (e.g. "a").
*/
async dropColumns(columnNames: string[]): Promise<void> {
await this.inner.dropColumns(columnNames);
}
}

View File

@@ -62,7 +62,7 @@
"lancedb-linux-arm64-gnu": "0.4.3",
"lancedb-linux-x64-gnu": "0.4.3"
},
"dependencies": {
"peerDependencies": {
"apache-arrow": "^15.0.0"
}
}

View File

@@ -13,8 +13,11 @@
// limitations under the License.
use arrow_ipc::writer::FileWriter;
use lancedb::table::AddDataOptions;
use lancedb::{ipc::ipc_file_to_batches, table::TableRef};
use lance::dataset::ColumnAlteration as LanceColumnAlteration;
use lancedb::{
ipc::ipc_file_to_batches,
table::{AddDataOptions, TableRef},
};
use napi::bindgen_prelude::*;
use napi_derive::napi;
@@ -93,4 +96,106 @@ impl Table {
pub fn query(&self) -> Query {
Query::new(self)
}
#[napi]
pub async fn add_columns(&self, transforms: Vec<AddColumnsSql>) -> napi::Result<()> {
let transforms = transforms
.into_iter()
.map(|sql| (sql.name, sql.value_sql))
.collect::<Vec<_>>();
let transforms = lance::dataset::NewColumnTransform::SqlExpressions(transforms);
self.table
.add_columns(transforms, None)
.await
.map_err(|err| {
napi::Error::from_reason(format!(
"Failed to add columns to table {}: {}",
self.table, err
))
})?;
Ok(())
}
#[napi]
pub async fn alter_columns(&self, alterations: Vec<ColumnAlteration>) -> napi::Result<()> {
for alteration in &alterations {
if alteration.rename.is_none() && alteration.nullable.is_none() {
return Err(napi::Error::from_reason(
"Alteration must have a 'rename' or 'nullable' field.",
));
}
}
let alterations = alterations
.into_iter()
.map(LanceColumnAlteration::from)
.collect::<Vec<_>>();
self.table
.alter_columns(&alterations)
.await
.map_err(|err| {
napi::Error::from_reason(format!(
"Failed to alter columns in table {}: {}",
self.table, err
))
})?;
Ok(())
}
#[napi]
pub async fn drop_columns(&self, columns: Vec<String>) -> napi::Result<()> {
let col_refs = columns.iter().map(String::as_str).collect::<Vec<_>>();
self.table.drop_columns(&col_refs).await.map_err(|err| {
napi::Error::from_reason(format!(
"Failed to drop columns from table {}: {}",
self.table, err
))
})?;
Ok(())
}
}
/// A definition of a column alteration. The alteration changes the column at
/// `path` to have the new name `name`, to be nullable if `nullable` is true,
/// and to have the data type `data_type`. At least one of `rename` or `nullable`
/// must be provided.
#[napi(object)]
pub struct ColumnAlteration {
/// The path to the column to alter. This is a dot-separated path to the column.
/// If it is a top-level column then it is just the name of the column. If it is
/// a nested column then it is the path to the column, e.g. "a.b.c" for a column
/// `c` nested inside a column `b` nested inside a column `a`.
pub path: String,
/// The new name of the column. If not provided then the name will not be changed.
/// This must be distinct from the names of all other columns in the table.
pub rename: Option<String>,
/// Set the new nullability. Note that a nullable column cannot be made non-nullable.
pub nullable: Option<bool>,
}
impl From<ColumnAlteration> for LanceColumnAlteration {
fn from(js: ColumnAlteration) -> Self {
let ColumnAlteration {
path,
rename,
nullable,
} = js;
Self {
path,
rename,
nullable,
// TODO: wire up this field
data_type: None,
}
}
}
/// A definition of a new column to add to a table.
#[napi(object)]
pub struct AddColumnsSql {
/// The name of the new column.
pub name: String,
/// The values to populate the new column with, as a SQL expression.
/// The expression can reference other columns in the table.
pub value_sql: String,
}

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.5.7
current_version = 0.6.0
commit = True
message = [python] Bump version: {current_version} → {new_version}
tag = True

26 python/Cargo.toml Normal file
View File

@@ -0,0 +1,26 @@
[package]
name = "lancedb-python"
version = "0.4.10"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true
repository.workspace = true
keywords.workspace = true
categories.workspace = true
[lib]
name = "_lancedb"
crate-type = ["cdylib"]
[dependencies]
lancedb = { path = "../rust/lancedb" }
env_logger = "0.10"
pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }
[build-dependencies]
pyo3-build-config = { version = "0.20.3", features = ["extension-module", "abi3-py38"] }

View File

@@ -20,10 +20,10 @@ results = table.search([0.1, 0.3]).limit(20).to_list()
print(results)
```
## Development
Create a virtual environment and activate it:
LanceDB is based on the Rust crate `lancedb` and is built with maturin. In order to build with maturin,
you will need either a conda environment or a virtual environment (venv).
```bash
python -m venv venv
@@ -33,7 +33,15 @@ python -m venv venv
Install the necessary packages:
```bash
python -m pip install .
python -m pip install .[tests,dev]
```
To build the python package you can use maturin:
```bash
# This will build the rust bindings and place them in the appropriate place
# in your venv or conda environment
maturin develop
```
To run the unit tests:
@@ -45,7 +53,7 @@ pytest
To run the doc tests:
```bash
pytest --doctest-modules lancedb
pytest --doctest-modules python/lancedb
```
To run linter and automatically fix all errors:
@@ -61,31 +69,27 @@ If any packages are missing, install them with:
pip install <PACKAGE_NAME>
```
___
For **Windows** users, there may be errors when installing packages, so these commands may be helpful:
Activate the virtual environment:
```bash
. .\venv\Scripts\activate
```
You may need to run the installs separately:
```bash
pip install -e .[tests]
pip install -e .[dev]
```
`tantivy` requires Rust to be installed, so install Rust with `conda`, as `pip` does not support installing it on Windows:
```bash
pip install wheel
pip install cargo
conda install rust
pip install tantivy
```
To run the unit tests:
```bash
pytest
```

3 python/build.rs Normal file
View File

@@ -0,0 +1,3 @@
fn main() {
pyo3_build_config::add_extension_module_link_args();
}

View File

@@ -1,9 +1,9 @@
[project]
name = "lancedb"
version = "0.5.7"
version = "0.6.0"
dependencies = [
"deprecation",
"pylance==0.9.18",
"pylance==0.10.1",
"ratelimiter~=1.0",
"retry>=0.9.2",
"tqdm>=4.27.0",
@@ -14,7 +14,7 @@ dependencies = [
"pyyaml>=6.0",
"click>=8.1.7",
"requests>=2.31.0",
"overrides>=0.7"
"overrides>=0.7",
]
description = "lancedb"
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
@@ -26,7 +26,7 @@ keywords = [
"data-science",
"machine-learning",
"arrow",
"data-analytics"
"data-analytics",
]
classifiers = [
"Development Status :: 3 - Alpha",
@@ -48,21 +48,53 @@ classifiers = [
repository = "https://github.com/lancedb/lancedb"
[project.optional-dependencies]
tests = ["aiohttp", "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "duckdb", "pytz", "polars>=0.19"]
tests = [
"aiohttp",
"pandas>=1.4",
"pytest",
"pytest-mock",
"pytest-asyncio",
"duckdb",
"pytz",
"polars>=0.19",
]
dev = ["ruff", "pre-commit"]
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]", "mkdocs-ultralytics-plugin==0.0.44"]
docs = [
"mkdocs",
"mkdocs-jupyter",
"mkdocs-material",
"mkdocstrings[python]",
"mkdocs-ultralytics-plugin==0.0.44",
]
clip = ["torch", "pillow", "open-clip"]
embeddings = ["openai>=1.6.1", "sentence-transformers", "torch", "pillow", "open-clip-torch", "cohere", "huggingface_hub",
"InstructorEmbedding", "google.generativeai", "boto3>=1.28.57", "awscli>=1.29.57", "botocore>=1.31.57"]
embeddings = [
"openai>=1.6.1",
"sentence-transformers",
"torch",
"pillow",
"open-clip-torch",
"cohere",
"huggingface_hub",
"InstructorEmbedding",
"google.generativeai",
"boto3>=1.28.57",
"awscli>=1.29.57",
"botocore>=1.31.57",
]
[tool.maturin]
python-source = "python"
module-name = "lancedb._lancedb"
[project.scripts]
lancedb = "lancedb.cli.cli:cli"
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
requires = ["maturin>=1.4"]
build-backend = "maturin"
[tool.ruff]
[tool.ruff.lint]
select = ["F", "E", "W", "I", "G", "TCH", "PERF"]
[tool.pytest.ini_options]
@@ -70,5 +102,5 @@ addopts = "--strict-markers --ignore-glob=lancedb/embeddings/*.py"
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"asyncio"
"asyncio",
]

View File

@@ -19,8 +19,9 @@ from typing import Optional, Union
__version__ = importlib.metadata.version("lancedb")
from .common import URI
from .db import DBConnection, LanceDBConnection
from ._lancedb import connect as lancedb_connect
from .common import URI, sanitize_uri
from .db import AsyncConnection, AsyncLanceDBConnection, DBConnection, LanceDBConnection
from .remote.db import RemoteDBConnection
from .schema import vector # noqa: F401
from .utils import sentry_log # noqa: F401
@@ -101,3 +102,74 @@ def connect(
uri, api_key, region, host_override, request_thread_pool=request_thread_pool
)
return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval)
async def connect_async(
uri: URI,
*,
api_key: Optional[str] = None,
region: str = "us-east-1",
host_override: Optional[str] = None,
read_consistency_interval: Optional[timedelta] = None,
request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None,
) -> AsyncConnection:
"""Connect to a LanceDB database.
Parameters
----------
uri: str or Path
The uri of the database.
api_key: str, optional
If present, connect to LanceDB cloud.
Otherwise, connect to a database on file system or cloud storage.
Can be set via environment variable `LANCEDB_API_KEY`.
region: str, default "us-east-1"
The region to use for LanceDB Cloud.
host_override: str, optional
The override url for LanceDB Cloud.
read_consistency_interval: timedelta, default None
(For LanceDB OSS only)
The interval at which to check for updates to the table from other
processes. If None, then consistency is not checked. For performance
reasons, this is the default. For strong consistency, set this to
zero seconds. Then every read will check for updates from other
processes. As a compromise, you can set this to a non-zero timedelta
for eventual consistency. If more than that interval has passed since
the last check, then the table will be checked for updates. Note: this
consistency only applies to read operations. Write operations are
always consistent.
request_thread_pool: int or ThreadPoolExecutor, optional
The thread pool to use for making batch requests to the LanceDB Cloud API.
If an integer, then a ThreadPoolExecutor will be created with that
number of threads. If None, then a ThreadPoolExecutor will be created
with the default number of threads. If a ThreadPoolExecutor, then that
executor will be used for making requests. This is for LanceDB Cloud
only and is only used when making batch requests (i.e., passing in
multiple queries to the search method at once).
Examples
--------
For a local directory, provide a path for the database:
>>> import lancedb
>>> db = lancedb.connect("~/.lancedb")
For object storage, use a URI prefix:
>>> db = lancedb.connect("s3://my-bucket/lancedb")
Connect to LanceDB Cloud:
>>> db = lancedb.connect("db://my_database", api_key="ldb_...")
Returns
-------
conn : DBConnection
A connection to a LanceDB database.
"""
return AsyncLanceDBConnection(
await lancedb_connect(
sanitize_uri(uri), api_key, region, host_override, read_consistency_interval
)
)
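
As a sketch of the `read_consistency_interval` semantics described in the docstring above (the same parameter exists on the synchronous `connect`), with a hypothetical path:

```python
from datetime import timedelta

import lancedb

# Strong consistency: every read checks for updates from other processes
db = lancedb.connect("/tmp/lancedb-demo", read_consistency_interval=timedelta(0))

# Eventual consistency: re-check for updates at most every five seconds
db = lancedb.connect("/tmp/lancedb-demo", read_consistency_interval=timedelta(seconds=5))
```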

View File

@@ -0,0 +1,12 @@
from typing import Optional
class Connection(object):
async def table_names(self) -> list[str]: ...
async def connect(
uri: str,
api_key: Optional[str],
region: Optional[str],
host_override: Optional[str],
read_consistency_interval: Optional[float],
) -> Connection: ...

View File

@@ -34,3 +34,7 @@ class Credential(str):
def __str__(self) -> str:
return "********"
def sanitize_uri(uri: URI) -> str:
return str(uri)

View File

@@ -28,6 +28,7 @@ from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri
if TYPE_CHECKING:
from datetime import timedelta
from ._lancedb import Connection as LanceDbConnection
from .common import DATA, URI
from .embeddings import EmbeddingFunctionConfig
from .pydantic import LanceModel
@@ -40,14 +41,21 @@ class DBConnection(EnforceOverrides):
def table_names(
self, page_token: Optional[str] = None, limit: int = 10
) -> Iterable[str]:
"""List all table in this database
"""List all tables in this database, in sorted order
Parameters
----------
page_token: str, optional
The token to use for pagination. If not present, start from the beginning.
Typically, this token is the last table name from the previous page.
Only supported by LanceDB Cloud.
limit: int, default 10
The size of the page to return.
Only supported by LanceDB Cloud.
Returns
-------
Iterable of str
"""
pass
@@ -412,3 +420,254 @@ class LanceDBConnection(DBConnection):
def drop_database(self):
filesystem, path = fs_from_uri(self.uri)
filesystem.delete_dir(path)
class AsyncConnection(EnforceOverrides):
"""An active LanceDB connection interface."""
@abstractmethod
async def table_names(
self, *, page_token: Optional[str] = None, limit: int = 10
) -> Iterable[str]:
"""List all tables in this database, in sorted order
Parameters
----------
page_token: str, optional
The token to use for pagination. If not present, start from the beginning.
Typically, this token is the last table name from the previous page.
Only supported by LanceDB Cloud.
limit: int, default 10
The size of the page to return.
Only supported by LanceDB Cloud.
Returns
-------
Iterable of str
"""
pass
@abstractmethod
async def create_table(
self,
name: str,
data: Optional[DATA] = None,
schema: Optional[Union[pa.Schema, LanceModel]] = None,
mode: str = "create",
exist_ok: bool = False,
on_bad_vectors: str = "error",
fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> Table:
"""Create a [Table][lancedb.table.Table] in the database.
Parameters
----------
name: str
The name of the table.
data: The data to initialize the table, *optional*
User must provide at least one of `data` or `schema`.
Acceptable types are:
- dict or list-of-dict
- pandas.DataFrame
- pyarrow.Table or pyarrow.RecordBatch
schema: The schema of the table, *optional*
Acceptable types are:
- pyarrow.Schema
- [LanceModel][lancedb.pydantic.LanceModel]
mode: str; default "create"
The mode to use when creating the table.
Can be either "create" or "overwrite".
By default, if the table already exists, an exception is raised.
If you want to overwrite the table, use mode="overwrite".
exist_ok: bool, default False
If a table by the same name already exists, then raise an exception
if exist_ok=False. If exist_ok=True, then open the existing table;
it will not add the provided data but will validate against any
schema that's specified.
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Returns
-------
LanceTable
A reference to the newly created table.
!!! note
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Examples
--------
Can create with list of tuples or dictionaries:
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
>>> db.create_table("my_table", data)
LanceTable(connection=..., name="my_table")
>>> db["my_table"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: double
long: double
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
You can also pass a pandas DataFrame:
>>> import pandas as pd
>>> data = pd.DataFrame({
... "vector": [[1.1, 1.2], [0.2, 1.8]],
... "lat": [45.5, 40.1],
... "long": [-122.7, -74.1]
... })
>>> db.create_table("table2", data)
LanceTable(connection=..., name="table2")
>>> db["table2"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: double
long: double
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
Data is converted to Arrow before being written to disk. For maximum
control over how data is saved, either provide the PyArrow schema to
convert to or else provide a [PyArrow Table](pyarrow.Table) directly.
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("lat", pa.float32()),
... pa.field("long", pa.float32())
... ])
>>> db.create_table("table3", data, schema = custom_schema)
LanceTable(connection=..., name="table3")
>>> db["table3"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: float
long: float
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
It is also possible to create a table from an `Iterable[pa.RecordBatch]`:
>>> import pyarrow as pa
>>> def make_batches():
... for i in range(5):
... yield pa.RecordBatch.from_arrays(
... [
... pa.array([[3.1, 4.1], [5.9, 26.5]],
... pa.list_(pa.float32(), 2)),
... pa.array(["foo", "bar"]),
... pa.array([10.0, 20.0]),
... ],
... ["vector", "item", "price"],
... )
>>> schema=pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("item", pa.utf8()),
... pa.field("price", pa.float32()),
... ])
>>> db.create_table("table4", make_batches(), schema=schema)
LanceTable(connection=..., name="table4")
"""
raise NotImplementedError
async def open_table(self, name: str) -> Table:
"""Open a Lance Table in the database.
Parameters
----------
name: str
The name of the table.
Returns
-------
A LanceTable object representing the table.
"""
raise NotImplementedError
async def drop_table(self, name: str):
"""Drop a table from the database.
Parameters
----------
name: str
The name of the table.
"""
raise NotImplementedError
async def drop_database(self):
"""
Drop database
This is the same thing as dropping all the tables
"""
raise NotImplementedError
class AsyncLanceDBConnection(AsyncConnection):
def __init__(self, connection: LanceDbConnection):
self._inner = connection
async def __repr__(self) -> str:
pass
@override
async def table_names(
self,
*,
page_token=None,
limit=None,
) -> Iterable[str]:
return await self._inner.table_names()
@override
async def create_table(
self,
name: str,
data: Optional[DATA] = None,
schema: Optional[Union[pa.Schema, LanceModel]] = None,
mode: str = "create",
exist_ok: bool = False,
on_bad_vectors: str = "error",
fill_value: float = 0.0,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> LanceTable:
raise NotImplementedError
@override
async def open_table(self, name: str) -> LanceTable:
raise NotImplementedError
@override
async def drop_table(self, name: str, ignore_missing: bool = False):
raise NotImplementedError
@override
async def drop_database(self):
raise NotImplementedError

View File

@@ -16,7 +16,7 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Type, Union
import deprecation
import numpy as np
@@ -93,7 +93,7 @@ class Query(pydantic.BaseModel):
metric: str = "L2"
# which columns to return in the results
columns: Optional[List[str]] = None
columns: Optional[Union[List[str], Dict[str, str]]] = None
# optional query parameters for tuning the results,
# e.g. `{"nprobes": "10", "refine_factor": "10"}`
@@ -321,20 +321,27 @@ class LanceQueryBuilder(ABC):
self._limit = limit
return self
def select(self, columns: list) -> LanceQueryBuilder:
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
"""Set the columns to return.
Parameters
----------
columns: list
The columns to return.
columns: list of str, or dict of str to str, default None
List of column names to be fetched.
Or a dictionary of column names to SQL expressions.
All columns are fetched if None or unspecified.
Returns
-------
LanceQueryBuilder
The LanceQueryBuilder object.
"""
self._columns = columns
if isinstance(columns, list):
self._columns = columns
elif isinstance(columns, dict):
self._columns = list(columns.items())
else:
raise ValueError("columns must be a list or a dictionary")
return self
def where(self, where: str, prefilter: bool = False) -> LanceQueryBuilder:
@@ -392,7 +399,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
>>> (table.search([0.4, 0.4])
... .metric("cosine")
... .where("b < 10")
... .select(["b"])
... .select(["b", "vector"])
... .limit(2)
... .to_pandas())
b vector _distance

View File

@@ -15,7 +15,7 @@ import logging
import uuid
from concurrent.futures import Future
from functools import cached_property
from typing import Dict, Optional, Union
from typing import Dict, Iterable, Optional, Union
import pyarrow as pa
from lance import json_to_schema
@@ -473,6 +473,21 @@ class RemoteTable(Table):
"count_rows() is not yet supported on the LanceDB cloud"
)
def add_columns(self, transforms: Dict[str, str]):
raise NotImplementedError(
"add_columns() is not yet supported on the LanceDB cloud"
)
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
raise NotImplementedError(
"alter_columns() is not yet supported on the LanceDB cloud"
)
def drop_columns(self, columns: Iterable[str]):
raise NotImplementedError(
"drop_columns() is not yet supported on the LanceDB cloud"
)
def add_index(tbl: pa.Table, i: int) -> pa.Table:
return tbl.add_column(

View File

@@ -160,7 +160,7 @@ class Table(ABC):
Can query the table with [Table.search][lancedb.table.Table.search].
>>> table.search([0.4, 0.4]).select(["b"]).to_pandas()
>>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas()
b vector _distance
0 4 [0.5, 1.3] 0.82
1 2 [1.1, 1.2] 1.13
@@ -436,7 +436,7 @@ class Table(ABC):
>>> query = [0.4, 1.4, 2.4]
>>> (table.search(query)
... .where("original_width > 1000", prefilter=True)
... .select(["caption", "original_width"])
... .select(["caption", "original_width", "vector"])
... .limit(2)
... .to_pandas())
caption original_width vector _distance
@@ -660,6 +660,56 @@ class Table(ABC):
For most cases, the default should be fine.
"""
@abstractmethod
def add_columns(self, transforms: Dict[str, str]):
"""
Add new columns with defined values.
This is not yet available in LanceDB Cloud.
Parameters
----------
transforms: Dict[str, str]
A map of column name to a SQL expression to use to calculate the
value of the new column. These expressions will be evaluated for
each row in the table, and can reference existing columns.
"""
@abstractmethod
def alter_columns(self, alterations: Iterable[Dict[str, str]]):
"""
Alter column names and nullability.
This is not yet available in LanceDB Cloud.
Parameters
----------
alterations : Iterable[Dict[str, Any]]
A sequence of dictionaries, each with the following keys:
- "path": str
The column path to alter. For a top-level column, this is the name.
For a nested column, this is the dot-separated path, e.g. "a.b.c".
- "name": str, optional
The new name of the column. If not specified, the column name is
not changed.
- "nullable": bool, optional
Whether the column should be nullable. If not specified, the column
nullability is not changed. Only non-nullable columns can be changed
to nullable. Currently, you cannot change a nullable column to
non-nullable.
"""
@abstractmethod
def drop_columns(self, columns: Iterable[str]):
"""
Drop columns from the table.
This is not yet available in LanceDB Cloud.
Parameters
----------
columns : Iterable[str]
The names of the columns to drop.
"""
class _LanceDatasetRef(ABC):
@property
@@ -1219,7 +1269,7 @@ class LanceTable(Table):
>>> query = [0.4, 1.4, 2.4]
>>> (table.search(query)
... .where("original_width > 1000", prefilter=True)
... .select(["caption", "original_width"])
... .select(["caption", "original_width", "vector"])
... .limit(2)
... .to_pandas())
caption original_width vector _distance
@@ -1536,6 +1586,22 @@ class LanceTable(Table):
"""
return self.to_lance().optimize.compact_files(*args, **kwargs)
def add_columns(self, transforms: Dict[str, str]):
self._dataset_mut.add_columns(transforms)
def alter_columns(self, *alterations: Iterable[Dict[str, str]]):
modified = []
# I called this field `name` in pylance, but I think I regret that now, so we
# allow both `name` and `rename`.
for alter in alterations:
if "rename" in alter:
alter["name"] = alter.pop("rename")
modified.append(alter)
self._dataset_mut.alter_columns(*modified)
def drop_columns(self, columns: Iterable[str]):
self._dataset_mut.drop_columns(columns)
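# Sketch: thanks to the shim in alter_columns above, both spellings of a
# rename are equivalent:
#     table.alter_columns({"path": "id", "rename": "new_id"})
#     table.alter_columns({"path": "id", "name": "new_id"})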
def _sanitize_schema(
data: pa.Table,

View File

@@ -1,5 +1,4 @@
from click.testing import CliRunner
from lancedb.cli.cli import cli
from lancedb.utils import CONFIG

View File

@@ -13,7 +13,6 @@
import pandas as pd
import pytest
from lancedb.context import contextualize

View File

@@ -11,12 +11,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import lancedb
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
import lancedb
from lancedb.pydantic import LanceModel, Vector
@@ -166,6 +165,24 @@ def test_table_names(tmp_path):
assert db.table_names() == ["test1", "test2", "test3"]
@pytest.mark.asyncio
async def test_table_names_async(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(
{
"vector": [[3.1, 4.1], [5.9, 26.5]],
"item": ["foo", "bar"],
"price": [10.0, 20.0],
}
)
db.create_table("test2", data=data)
db.create_table("test1", data=data)
db.create_table("test3", data=data)
db = await lancedb.connect_async(tmp_path)
assert await db.table_names() == ["test1", "test2", "test3"]
def test_create_mode(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(

View File

@@ -13,7 +13,6 @@
import numpy as np
import pytest
from lancedb import LanceDBConnection
# TODO: set up integration test mark and script

View File

@@ -13,11 +13,10 @@
import sys
import lance
import lancedb
import numpy as np
import pyarrow as pa
import pytest
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.embeddings import (
EmbeddingFunctionConfig,

View File

@@ -14,12 +14,11 @@ import importlib
import io
import os
import lancedb
import numpy as np
import pandas as pd
import pytest
import requests
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
@@ -185,10 +184,9 @@ def test_imagebind(tmp_path):
import shutil
import tempfile
import lancedb.embeddings.imagebind
import pandas as pd
import requests
import lancedb.embeddings.imagebind
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

View File

@@ -14,13 +14,13 @@ import os
import random
from unittest import mock
import lancedb as ldb
import numpy as np
import pandas as pd
import pytest
import tantivy
import lancedb as ldb
import lancedb.fts
pytest.importorskip("lancedb.fts")
tantivy = pytest.importorskip("tantivy")
@pytest.fixture

View File

@@ -13,9 +13,8 @@
import os
import pytest
import lancedb
import pytest
# You need to set up AWS credentials and a base path to run this test. Example:
# AWS_PROFILE=default TEST_S3_BASE_URL=s3://my_bucket/dataset pytest tests/test_io.py

View File

@@ -20,9 +20,8 @@ from typing import List, Optional, Tuple
import pyarrow as pa
import pydantic
import pytest
from pydantic import Field
from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema
from pydantic import Field
@pytest.mark.skipif(

View File

@@ -18,7 +18,6 @@ import numpy as np
import pandas.testing as tm
import pyarrow as pa
import pytest
from lancedb.db import LanceDBConnection
from lancedb.pydantic import LanceModel, Vector
from lancedb.query import LanceVectorQueryBuilder, Query
@@ -88,7 +87,7 @@ def test_query_builder(table):
rs = (
LanceVectorQueryBuilder(table, [0, 0], "vector")
.limit(1)
.select(["id"])
.select(["id", "vector"])
.to_list()
)
assert rs[0]["id"] == 1

View File

@@ -17,7 +17,6 @@ import pandas as pd
import pyarrow as pa
import pytest
from aiohttp import web
from lancedb.remote.client import RestfulLanceDBClient, VectorQuery

View File

@@ -11,9 +11,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pyarrow as pa
import lancedb
import pyarrow as pa
from lancedb.remote.client import VectorQuery, VectorQueryResult

View File

@@ -1,9 +1,8 @@
import os
import lancedb
import numpy as np
import pytest
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction # noqa
from lancedb.embeddings import EmbeddingFunctionRegistry
from lancedb.pydantic import LanceModel, Vector
@@ -15,6 +14,9 @@ from lancedb.rerankers import (
)
from lancedb.table import LanceTable
# Tests rely on FTS index
pytest.importorskip("lancedb.fts")
def get_test_table(tmp_path):
db = lancedb.connect(tmp_path)

View File

@@ -20,19 +20,18 @@ from typing import List
from unittest.mock import PropertyMock, patch
import lance
import lancedb
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pytest
from pydantic import BaseModel
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.db import LanceDBConnection
from lancedb.embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry
from lancedb.pydantic import LanceModel, Vector
from lancedb.table import LanceTable
from pydantic import BaseModel
class MockDB:
@@ -804,6 +803,9 @@ def test_count_rows(db):
def test_hybrid_search(db, tmp_path):
# This test uses an FTS index
pytest.importorskip("lancedb.fts")
db = MockDB(str(tmp_path))
# Create a LanceDB table schema with a vector and a text column
emb = EmbeddingFunctionRegistry.get_instance().get("test")()
@@ -898,3 +900,29 @@ def test_restore_consistency(tmp_path):
table.add([{"id": 2}])
assert table_fixed.version == table.version - 1
assert table_ref_latest.version == table.version
# Schema evolution
def test_add_columns(tmp_path):
db = lancedb.connect(tmp_path)
data = pa.table({"id": [0, 1]})
table = LanceTable.create(db, "my_table", data=data)
table.add_columns({"new_col": "id + 2"})
assert table.to_arrow().column_names == ["id", "new_col"]
assert table.to_arrow()["new_col"].to_pylist() == [2, 3]
def test_alter_columns(tmp_path):
db = lancedb.connect(tmp_path)
data = pa.table({"id": [0, 1]})
table = LanceTable.create(db, "my_table", data=data)
table.alter_columns({"path": "id", "rename": "new_id"})
assert table.to_arrow().column_names == ["new_id"]
def test_drop_columns(tmp_path):
db = lancedb.connect(tmp_path)
data = pa.table({"id": [0, 1], "category": ["a", "b"]})
table = LanceTable.create(db, "my_table", data=data)
table.drop_columns(["category"])
assert table.to_arrow().column_names == ["id"]

View File

@@ -1,8 +1,7 @@
import json
import pytest
import lancedb
import pytest
from lancedb.utils.events import _Events

View File

@@ -15,7 +15,6 @@ import os
import pathlib
import pytest
from lancedb.util import get_uri_scheme, join_uri

View File

@@ -1,17 +0,0 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import setuptools
if __name__ == "__main__":
setuptools.setup()

66
python/src/connection.rs Normal file
View File

@@ -0,0 +1,66 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Duration;
use lancedb::connection::Connection as LanceConnection;
use pyo3::{pyclass, pyfunction, pymethods, PyAny, PyRef, PyResult, Python};
use pyo3_asyncio::tokio::future_into_py;
use crate::error::PythonErrorExt;
#[pyclass]
pub struct Connection {
inner: LanceConnection,
}
#[pymethods]
impl Connection {
pub fn table_names(self_: PyRef<'_, Self>) -> PyResult<&PyAny> {
let inner = self_.inner.clone();
future_into_py(self_.py(), async move {
inner.table_names().await.infer_error()
})
}
}
#[pyfunction]
pub fn connect(
py: Python,
uri: String,
api_key: Option<String>,
region: Option<String>,
host_override: Option<String>,
read_consistency_interval: Option<f64>,
) -> PyResult<&PyAny> {
future_into_py(py, async move {
let mut builder = lancedb::connect(&uri);
if let Some(api_key) = api_key {
builder = builder.api_key(&api_key);
}
if let Some(region) = region {
builder = builder.region(&region);
}
if let Some(host_override) = host_override {
builder = builder.host_override(&host_override);
}
if let Some(read_consistency_interval) = read_consistency_interval {
let read_consistency_interval = Duration::from_secs_f64(read_consistency_interval);
builder = builder.read_consistency_interval(read_consistency_interval);
}
Ok(Connection {
inner: builder.execute().await.infer_error()?,
})
})
}
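// Hedged Python-side sketch of what these bindings back (per the commit
// message, only `connect` and `Connection.table_names` are wired up so far;
// the exact wrapper signature is an assumption):
//
//     import asyncio
//     import lancedb
//
//     async def main():
//         db = await lancedb.connect_async(
//             "data/sample-lancedb",
//             read_consistency_interval=5.0,  # seconds; becomes a Duration above
//         )
//         print(await db.table_names())
//
//     asyncio.run(main())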

61
python/src/error.rs Normal file
View File

@@ -0,0 +1,61 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use pyo3::{
exceptions::{PyOSError, PyRuntimeError, PyValueError},
PyResult,
};
use lancedb::error::Error as LanceError;
pub trait PythonErrorExt<T> {
/// Convert to a python error based on the Lance error type
fn infer_error(self) -> PyResult<T>;
/// Convert to OSError
fn os_error(self) -> PyResult<T>;
/// Convert to RuntimeError
fn runtime_error(self) -> PyResult<T>;
/// Convert to ValueError
fn value_error(self) -> PyResult<T>;
}
impl<T> PythonErrorExt<T> for std::result::Result<T, LanceError> {
fn infer_error(self) -> PyResult<T> {
match &self {
Ok(_) => Ok(self.unwrap()),
Err(err) => match err {
LanceError::InvalidTableName { .. } => self.value_error(),
LanceError::TableNotFound { .. } => self.value_error(),
LanceError::TableAlreadyExists { .. } => self.runtime_error(),
LanceError::CreateDir { .. } => self.os_error(),
LanceError::Store { .. } => self.runtime_error(),
LanceError::Lance { .. } => self.runtime_error(),
LanceError::Schema { .. } => self.value_error(),
LanceError::Runtime { .. } => self.runtime_error(),
},
}
}
fn os_error(self) -> PyResult<T> {
self.map_err(|err| PyOSError::new_err(err.to_string()))
}
fn runtime_error(self) -> PyResult<T> {
self.map_err(|err| PyRuntimeError::new_err(err.to_string()))
}
fn value_error(self) -> PyResult<T> {
self.map_err(|err| PyValueError::new_err(err.to_string()))
}
}
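// Hedged Python-side view of the mapping `infer_error` implements above
// (illustrative only; not part of the bindings):
//
//     ERROR_TO_EXCEPTION = {
//         "InvalidTableName": ValueError,
//         "TableNotFound": ValueError,
//         "TableAlreadyExists": RuntimeError,
//         "CreateDir": OSError,
//         "Store": RuntimeError,
//         "Lance": RuntimeError,
//         "Schema": ValueError,
//         "Runtime": RuntimeError,
//     }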

32
python/src/lib.rs Normal file
View File

@@ -0,0 +1,32 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use connection::{connect, Connection};
use env_logger::Env;
use pyo3::{pymodule, types::PyModule, wrap_pyfunction, PyResult, Python};
pub mod connection;
pub(crate) mod error;
#[pymodule]
pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> {
let env = Env::new()
.filter_or("LANCEDB_LOG", "warn")
.write_style("LANCEDB_LOG_STYLE");
env_logger::init_from_env(env);
m.add_class::<Connection>()?;
m.add_function(wrap_pyfunction!(connect, m)?)?;
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
Ok(())
}
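// Hedged usage sketch: env_logger reads these variables once, when the native
// module is first imported, so set them beforehand (Python; values illustrative):
//
//     import os
//     os.environ["LANCEDB_LOG"] = "debug"
//     os.environ["LANCEDB_LOG_STYLE"] = "always"
//     import lancedb  # logging is initialized during this import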

View File

@@ -286,5 +286,8 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
index::vector::table_create_vector_index,
)?;
cx.export_function("tableSchema", JsTable::js_schema)?;
cx.export_function("tableAddColumns", JsTable::js_add_columns)?;
cx.export_function("tableAlterColumns", JsTable::js_alter_columns)?;
cx.export_function("tableDropColumns", JsTable::js_drop_columns)?;
Ok(())
}

View File

@@ -16,7 +16,7 @@ use std::ops::Deref;
use arrow_array::{RecordBatch, RecordBatchIterator};
use lance::dataset::optimize::CompactionOptions;
use lance::dataset::{WriteMode, WriteParams};
use lance::dataset::{ColumnAlteration, NewColumnTransform, WriteMode, WriteParams};
use lance::io::ObjectStoreParams;
use lancedb::table::{AddDataOptions, OptimizeAction, WriteOptions};
@@ -544,4 +544,118 @@ impl JsTable {
});
Ok(promise)
}
pub(crate) fn js_add_columns(mut cx: FunctionContext) -> JsResult<JsPromise> {
let expressions = cx
.argument::<JsArray>(0)?
.to_vec(&mut cx)?
.into_iter()
.map(|val| {
let obj = val.downcast_or_throw::<JsObject, _>(&mut cx)?;
let name = obj.get::<JsString, _, _>(&mut cx, "name")?.value(&mut cx);
let sql = obj
.get::<JsString, _, _>(&mut cx, "valueSql")?
.value(&mut cx);
Ok((name, sql))
})
.collect::<NeonResult<Vec<(String, String)>>>()?;
let transforms = NewColumnTransform::SqlExpressions(expressions);
let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let result = table.add_columns(transforms, None).await;
deferred.settle_with(&channel, move |mut cx| {
result.or_throw(&mut cx)?;
Ok(cx.undefined())
})
});
Ok(promise)
}
pub(crate) fn js_alter_columns(mut cx: FunctionContext) -> JsResult<JsPromise> {
let alterations = cx
.argument::<JsArray>(0)?
.to_vec(&mut cx)?
.into_iter()
.map(|val| {
let obj = val.downcast_or_throw::<JsObject, _>(&mut cx)?;
let path = obj.get::<JsString, _, _>(&mut cx, "path")?.value(&mut cx);
let rename = obj
.get_opt::<JsString, _, _>(&mut cx, "rename")?
.map(|val| val.value(&mut cx));
let nullable = obj
.get_opt::<JsBoolean, _, _>(&mut cx, "nullable")?
.map(|val| val.value(&mut cx));
// TODO: support data type here. Will need to do some serialization/deserialization
if rename.is_none() && nullable.is_none() {
return cx.throw_error("At least one of 'rename' or 'nullable' must be provided");
}
Ok(ColumnAlteration {
path,
rename,
nullable,
// TODO: wire up this field
data_type: None,
})
})
.collect::<NeonResult<Vec<ColumnAlteration>>>()?;
let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let result = table.alter_columns(&alterations).await;
deferred.settle_with(&channel, move |mut cx| {
result.or_throw(&mut cx)?;
Ok(cx.undefined())
})
});
Ok(promise)
}
pub(crate) fn js_drop_columns(mut cx: FunctionContext) -> JsResult<JsPromise> {
let columns = cx
.argument::<JsArray>(0)?
.to_vec(&mut cx)?
.into_iter()
.map(|val| {
Ok(val
.downcast_or_throw::<JsString, _>(&mut cx)?
.value(&mut cx))
})
.collect::<NeonResult<Vec<String>>>()?;
let js_table = cx.this().downcast_or_throw::<JsBox<Self>, _>(&mut cx)?;
let rt = runtime(&mut cx)?;
let (deferred, promise) = cx.promise();
let channel = cx.channel();
let table = js_table.table.clone();
rt.spawn(async move {
let col_refs = columns.iter().map(|s| s.as_str()).collect::<Vec<_>>();
let result = table.drop_columns(&col_refs).await;
deferred.settle_with(&channel, move |mut cx| {
result.or_throw(&mut cx)?;
Ok(cx.undefined())
})
});
Ok(promise)
}
}

View File

@@ -623,7 +623,17 @@ impl ConnectionInternal for Database {
async fn drop_table(&self, name: &str) -> Result<()> {
let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
let full_path = self.base_path.child(dir_name.clone());
self.object_store.remove_dir_all(full_path).await?;
self.object_store
.remove_dir_all(full_path)
.await
.map_err(|err| match err {
// This error is not lance::Error::DatasetNotFound because `remove_dir_all`
// may be used to remove paths that are not datasets.
lance::Error::NotFound { .. } => Error::TableNotFound {
name: name.to_owned(),
},
_ => Error::from(err),
})?;
Ok(())
}
@@ -634,8 +644,6 @@ impl ConnectionInternal for Database {
#[cfg(test)]
mod tests {
use std::fs::create_dir_all;
use arrow_schema::{DataType, Field, Schema};
use tempfile::tempdir;
@@ -691,13 +699,46 @@ mod tests {
// let db = Database::connect("s3://bucket/path/to/database").await.unwrap();
}
#[tokio::test]
#[ignore = "this can't pass due to https://github.com/lancedb/lancedb/issues/1019, enable it after the bug fixed"]
async fn test_open_table() {
let tmp_dir = tempdir().unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = connect(uri).execute().await.unwrap();
assert_eq!(db.table_names().await.unwrap().len(), 0);
// open a non-existent table
assert!(matches!(
db.open_table("invalid_table").execute().await,
Err(crate::Error::TableNotFound { .. })
));
assert_eq!(db.table_names().await.unwrap().len(), 0);
let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
db.create_empty_table("table1", schema)
.execute()
.await
.unwrap();
db.open_table("table1").execute().await.unwrap();
let tables = db.table_names().await.unwrap();
assert_eq!(tables, vec!["table1".to_owned()]);
}
#[tokio::test]
async fn drop_table() {
let tmp_dir = tempdir().unwrap();
create_dir_all(tmp_dir.path().join("table1.lance")).unwrap();
let uri = tmp_dir.path().to_str().unwrap();
let db = connect(uri).execute().await.unwrap();
// drop a non-existent table
assert!(matches!(
db.drop_table("invalid_table").await,
Err(crate::Error::TableNotFound { .. }),
));
create_dir_all(tmp_dir.path().join("table1.lance")).unwrap();
db.drop_table("table1").await.unwrap();
let tables = db.table_names().await.unwrap();

View File

@@ -24,6 +24,13 @@ use crate::Error;
const DEFAULT_TOP_K: usize = 10;
#[derive(Debug, Clone)]
pub enum Select {
/// Return all columns (no projection pushdown).
All,
/// Return only the named columns.
Simple(Vec<String>),
/// Dynamic projection: (output column name, SQL expression) pairs.
Projection(Vec<(String, String)>),
}
/// A builder for nearest neighbor queries for LanceDB.
#[derive(Clone)]
pub struct Query {
@@ -44,7 +51,7 @@ pub struct Query {
/// Apply filter to the returned rows.
filter: Option<String>,
/// Select column projection.
select: Option<Vec<String>>,
select: Select,
/// Default is true. Set to false to enforce a brute force search.
use_index: bool,
@@ -70,7 +77,7 @@ impl Query {
metric_type: None,
use_index: true,
filter: None,
select: None,
select: Select::All,
prefilter: false,
}
}
@@ -115,7 +122,16 @@ impl Query {
scanner.use_index(self.use_index);
scanner.prefilter(self.prefilter);
self.select.as_ref().map(|p| scanner.project(p.as_slice()));
match &self.select {
Select::Simple(select) => {
scanner.project(select.as_slice())?;
}
Select::Projection(select_with_transform) => {
scanner.project_with_transform(select_with_transform.as_slice())?;
}
Select::All => { /* Do nothing */ }
}
self.filter.as_ref().map(|f| scanner.filter(f));
self.refine_factor.map(|rf| scanner.refine(rf));
self.metric_type.map(|mt| scanner.distance_metric(mt));
@@ -206,7 +222,23 @@ impl Query {
///
/// Only select the specified columns. If not specified, all columns will be returned.
pub fn select(mut self, columns: &[impl AsRef<str>]) -> Self {
self.select = Some(columns.iter().map(|c| c.as_ref().to_string()).collect());
self.select = Select::Simple(columns.iter().map(|c| c.as_ref().to_string()).collect());
self
}
/// Return only the specified columns, computed from SQL expressions.
///
/// Each entry is an (output column name, SQL expression) pair. If not
/// specified, all columns will be returned.
pub fn select_with_projection(
mut self,
columns: &[(impl AsRef<str>, impl AsRef<str>)],
) -> Self {
self.select = Select::Projection(
columns
.iter()
.map(|(c, t)| (c.as_ref().to_string(), t.as_ref().to_string()))
.collect(),
);
self
}
@@ -226,7 +258,7 @@ mod tests {
RecordBatchReader,
};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use futures::StreamExt;
use futures::{StreamExt, TryStreamExt};
use lance::dataset::Dataset;
use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector};
use tempfile::tempdir;
@@ -294,6 +326,38 @@ mod tests {
}
}
#[tokio::test]
async fn test_select_with_transform() {
let batches = make_non_empty_batches();
let ds = Dataset::write(batches, "memory://foo", None).await.unwrap();
let ds = DatasetConsistencyWrapper::new_latest(ds, None);
let query = Query::new(ds)
.limit(10)
.select_with_projection(&[("id2", "id * 2"), ("id", "id")]);
let result = query.execute_stream().await;
let mut batches = result
.expect("should have result")
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(batches.len(), 1);
let batch = batches.pop().unwrap();
// two columns: id and id2
assert_eq!(batch.num_columns(), 2);
let id: &Int32Array = batch.column_by_name("id").unwrap().as_primitive();
let id2: &Int32Array = batch.column_by_name("id2").unwrap().as_primitive();
id.iter().zip(id2.iter()).for_each(|(id, id2)| {
let id = id.unwrap();
let id2 = id2.unwrap();
assert_eq!(id * 2, id2);
});
}
#[tokio::test]
async fn test_execute_no_vector() {
// test that it's ok to not specify a query vector (just filter / limit)

View File

@@ -27,7 +27,10 @@ use lance::dataset::optimize::{
compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
};
pub use lance::dataset::ReadParams;
use lance::dataset::{Dataset, UpdateBuilder, WhenMatched, WriteMode, WriteParams};
use lance::dataset::{
ColumnAlteration, Dataset, NewColumnTransform, UpdateBuilder, WhenMatched, WriteMode,
WriteParams,
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::io::WrappingObjectStore;
use lance_index::{optimize::OptimizeOptions, DatasetIndexExt};
@@ -376,6 +379,19 @@ pub trait Table: std::fmt::Display + Send + Sync {
/// Modeled after ``VACUUM`` in PostgreSQL.
/// Not all implementations support explicit optimization.
async fn optimize(&self, action: OptimizeAction) -> Result<OptimizeStats>;
/// Add new columns to the table, providing values to fill in.
async fn add_columns(
&self,
transforms: NewColumnTransform,
read_columns: Option<Vec<String>>,
) -> Result<()>;
/// Change a column's name or nullability.
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()>;
/// Remove columns from the table.
async fn drop_columns(&self, columns: &[&str]) -> Result<()>;
}
/// Reference to a Table pointer.
@@ -902,6 +918,33 @@ impl Table for NativeTable {
}
Ok(stats)
}
async fn add_columns(
&self,
transforms: NewColumnTransform,
read_columns: Option<Vec<String>>,
) -> Result<()> {
self.dataset
.get_mut()
.await?
.add_columns(transforms, read_columns)
.await?;
Ok(())
}
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()> {
self.dataset
.get_mut()
.await?
.alter_columns(alterations)
.await?;
Ok(())
}
async fn drop_columns(&self, columns: &[&str]) -> Result<()> {
self.dataset.get_mut().await?.drop_columns(columns).await?;
Ok(())
}
}
#[cfg(test)]