diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 428bef69..f2e68df7 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.10 +current_version = 0.4.11 commit = True message = Bump version: {current_version} → {new_version} tag = True @@ -9,4 +9,4 @@ tag_name = v{new_version} [bumpversion:file:rust/ffi/node/Cargo.toml] -[bumpversion:file:rust/vectordb/Cargo.toml] +[bumpversion:file:rust/lancedb/Cargo.toml] diff --git a/.github/workflows/build_linux_wheel/action.yml b/.github/workflows/build_linux_wheel/action.yml new file mode 100644 index 00000000..8c3e02f3 --- /dev/null +++ b/.github/workflows/build_linux_wheel/action.yml @@ -0,0 +1,58 @@ +# We create a composite action to be re-used both for testing and for releasing +name: build-linux-wheel +description: "Build a manylinux wheel for lance" +inputs: + python-minor-version: + description: "8, 9, 10, 11, 12" + required: true + args: + description: "--release" + required: false + default: "" + arm-build: + description: "Build for arm64 instead of x86_64" + # Note: this does *not* mean the host is arm64, since we might be cross-compiling. + required: false + default: "false" +runs: + using: "composite" + steps: + - name: CONFIRM ARM BUILD + shell: bash + run: | + echo "ARM BUILD: ${{ inputs.arm-build }}" + - name: Build x86_64 Manylinux wheel + if: ${{ inputs.arm-build == 'false' }} + uses: PyO3/maturin-action@v1 + with: + command: build + working-directory: python + target: x86_64-unknown-linux-gnu + manylinux: "2_17" + args: ${{ inputs.args }} + before-script-linux: | + set -e + yum install -y openssl-devel \ + && curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$(uname -m).zip > /tmp/protoc.zip \ + && unzip /tmp/protoc.zip -d /usr/local \ + && rm /tmp/protoc.zip + - name: Build Arm Manylinux Wheel + if: ${{ inputs.arm-build == 'true' }} + uses: PyO3/maturin-action@v1 + with: + command: build + working-directory: python + target: aarch64-unknown-linux-gnu + manylinux: "2_24" + args: ${{ inputs.args }} + before-script-linux: | + set -e + apt install -y unzip + if [ $(uname -m) = "x86_64" ]; then + PROTOC_ARCH="x86_64" + else + PROTOC_ARCH="aarch_64" + fi + curl -L https://github.com/protocolbuffers/protobuf/releases/download/v24.4/protoc-24.4-linux-$PROTOC_ARCH.zip > /tmp/protoc.zip \ + && unzip /tmp/protoc.zip -d /usr/local \ + && rm /tmp/protoc.zip diff --git a/.github/workflows/build_mac_wheel/action.yml b/.github/workflows/build_mac_wheel/action.yml new file mode 100644 index 00000000..1932262d --- /dev/null +++ b/.github/workflows/build_mac_wheel/action.yml @@ -0,0 +1,25 @@ +# We create a composite action to be re-used both for testing and for releasing +name: build_wheel +description: "Build a lance wheel" +inputs: + python-minor-version: + description: "8, 9, 10, 11" + required: true + args: + description: "--release" + required: false + default: "" +runs: + using: "composite" + steps: + - name: Install macos dependency + shell: bash + run: | + brew install protobuf + - name: Build wheel + uses: PyO3/maturin-action@v1 + with: + command: build + args: ${{ inputs.args }} + working-directory: python + interpreter: 3.${{ inputs.python-minor-version }} diff --git a/.github/workflows/build_windows_wheel/action.yml b/.github/workflows/build_windows_wheel/action.yml new file mode 100644 index 00000000..756334f8 --- /dev/null +++ b/.github/workflows/build_windows_wheel/action.yml @@ -0,0 +1,33 @@ +# We create a composite action to be re-used 
both for testing and for releasing +name: build_wheel +description: "Build a lance wheel" +inputs: + python-minor-version: + description: "8, 9, 10, 11" + required: true + args: + description: "--release" + required: false + default: "" +runs: + using: "composite" + steps: + - name: Install Protoc v21.12 + working-directory: C:\ + run: | + New-Item -Path 'C:\protoc' -ItemType Directory + Set-Location C:\protoc + Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip + 7z x protoc.zip + Add-Content $env:GITHUB_PATH "C:\protoc\bin" + shell: powershell + - name: Build wheel + uses: PyO3/maturin-action@v1 + with: + command: build + args: ${{ inputs.args }} + working-directory: python + - uses: actions/upload-artifact@v3 + with: + name: windows-wheels + path: python\target\wheels diff --git a/.github/workflows/cargo-publish.yml b/.github/workflows/cargo-publish.yml index 6bbcdd57..9e4dfd35 100644 --- a/.github/workflows/cargo-publish.yml +++ b/.github/workflows/cargo-publish.yml @@ -26,4 +26,4 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Publish the package run: | - cargo publish -p vectordb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }} + cargo publish -p lancedb --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }} diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 27833506..66605cc9 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -2,30 +2,91 @@ name: PyPI Publish on: release: - types: [ published ] + types: [published] jobs: - publish: - runs-on: ubuntu-latest - # Only runs on tags that matches the python-make-release action - if: startsWith(github.ref, 'refs/tags/python-v') - defaults: - run: - shell: bash - working-directory: python + linux: + timeout-minutes: 60 + strategy: + matrix: + python-minor-version: ["8"] + platform: + - x86_64 + - aarch64 + runs-on: "ubuntu-22.04" steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v4 with: - python-version: "3.8" - - name: Build distribution - run: | - ls -la - pip install wheel setuptools --upgrade - python setup.py sdist bdist_wheel - - name: Publish - uses: pypa/gh-action-pypi-publish@v1.8.5 + python-version: 3.${{ matrix.python-minor-version }} + - uses: ./.github/workflows/build_linux_wheel with: - password: ${{ secrets.LANCEDB_PYPI_API_TOKEN }} - packages-dir: python/dist + python-minor-version: ${{ matrix.python-minor-version }} + args: "--release --strip" + arm-build: ${{ matrix.platform == 'aarch64' }} + - uses: ./.github/workflows/upload_wheel + with: + token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }} + repo: "pypi" + mac: + timeout-minutes: 60 + runs-on: ${{ matrix.config.runner }} + strategy: + matrix: + python-minor-version: ["8"] + config: + - target: x86_64-apple-darwin + runner: macos-13 + - target: aarch64-apple-darwin + runner: macos-14 + env: + MACOSX_DEPLOYMENT_TARGET: 10.15 + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.12 + - uses: ./.github/workflows/build_mac_wheel + with: + python-minor-version: ${{ matrix.python-minor-version }} + args: "--release --strip --target ${{ matrix.config.target }}" + - uses: ./.github/workflows/upload_wheel + with: + python-minor-version: ${{ 
matrix.python-minor-version }} + token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }} + repo: "pypi" + windows: + timeout-minutes: 60 + runs-on: windows-latest + strategy: + matrix: + python-minor-version: ["8"] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.${{ matrix.python-minor-version }} + - uses: ./.github/workflows/build_windows_wheel + with: + python-minor-version: ${{ matrix.python-minor-version }} + args: "--release --strip" + vcpkg_token: ${{ secrets.VCPKG_GITHUB_PACKAGES }} + - uses: ./.github/workflows/upload_wheel + with: + python-minor-version: ${{ matrix.python-minor-version }} + token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }} + repo: "pypi" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 846b8497..e4a95569 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -14,49 +14,133 @@ concurrency: cancel-in-progress: true jobs: - linux: + lint: + name: "Lint" timeout-minutes: 30 - strategy: - matrix: - python-minor-version: [ "8", "11" ] runs-on: "ubuntu-22.04" defaults: run: shell: bash working-directory: python steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - lfs: true - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.${{ matrix.python-minor-version }} - - name: Install lancedb - run: | - pip install -e .[tests] - pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 - pip install pytest pytest-mock ruff - - name: Format check - run: ruff format --check . - - name: Lint - run: ruff . - - name: Run tests - run: pytest -m "not slow" -x -v --durations=30 tests - - name: doctest - run: pytest --doctest-modules lancedb + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install ruff + run: | + pip install ruff==0.2.2 + - name: Format check + run: ruff format --check . + - name: Lint + run: ruff . 
+ doctest: + name: "Doctest" + timeout-minutes: 30 + runs-on: "ubuntu-22.04" + defaults: + run: + shell: bash + working-directory: python + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + - name: Install protobuf + run: | + sudo apt update + sudo apt install -y protobuf-compiler + - uses: Swatinem/rust-cache@v2 + with: + workspaces: python + - name: Install + run: | + pip install -e .[tests,dev,embeddings] + pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 + pip install mlx + - name: Doctest + run: pytest --doctest-modules python/lancedb + linux: + name: "Linux: python-3.${{ matrix.python-minor-version }}" + timeout-minutes: 30 + strategy: + matrix: + python-minor-version: ["8", "11"] + runs-on: "ubuntu-22.04" + defaults: + run: + shell: bash + working-directory: python + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Install protobuf + run: | + sudo apt update + sudo apt install -y protobuf-compiler + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.${{ matrix.python-minor-version }} + - uses: Swatinem/rust-cache@v2 + with: + workspaces: python + - uses: ./.github/workflows/build_linux_wheel + - uses: ./.github/workflows/run_tests + # Make sure wheels are not included in the Rust cache + - name: Delete wheels + run: rm -rf target/wheels platform: - name: "Platform: ${{ matrix.config.name }}" + name: "Mac: ${{ matrix.config.name }}" timeout-minutes: 30 strategy: matrix: config: - - name: x86 Mac + - name: x86 runner: macos-13 - - name: Arm Mac + - name: Arm runner: macos-14 - - name: x86 Windows + runs-on: "${{ matrix.config.runner }}" + defaults: + run: + shell: bash + working-directory: python + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - uses: Swatinem/rust-cache@v2 + with: + workspaces: python + - uses: ./.github/workflows/build_mac_wheel + - uses: ./.github/workflows/run_tests + # Make sure wheels are not included in the Rust cache + - name: Delete wheels + run: rm -rf target/wheels + windows: + name: "Windows: ${{ matrix.config.name }}" + timeout-minutes: 30 + strategy: + matrix: + config: + - name: x86 runner: windows-latest runs-on: "${{ matrix.config.runner }}" defaults: @@ -64,21 +148,22 @@ jobs: shell: bash working-directory: python steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - lfs: true - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install lancedb - run: | - pip install -e .[tests] - pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 - pip install pytest pytest-mock - - name: Run tests - run: pytest -m "not slow" -x -v --durations=30 tests + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - uses: Swatinem/rust-cache@v2 + with: + workspaces: python + - uses: ./.github/workflows/build_windows_wheel + - uses: ./.github/workflows/run_tests + # Make sure wheels are not included in the Rust cache + - name: Delete wheels + run: rm -rf target/wheels pydantic1x: timeout-minutes: 30 runs-on: "ubuntu-22.04" @@ -87,21 +172,22 @@ jobs: shell: bash working-directory: python steps: - 
- uses: actions/checkout@v4 - with: - fetch-depth: 0 - lfs: true - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.9 - - name: Install lancedb - run: | - pip install "pydantic<2" - pip install -e .[tests] - pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 - pip install pytest pytest-mock - - name: Run tests - run: pytest -m "not slow" -x -v --durations=30 tests - - name: doctest - run: pytest --doctest-modules lancedb + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y protobuf-compiler + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + - name: Install lancedb + run: | + pip install "pydantic<2" + pip install -e .[tests] + pip install tantivy@git+https://github.com/quickwit-oss/tantivy-py#164adc87e1a033117001cf70e38c82a53014d985 + - name: Run tests + run: pytest -m "not slow" -x -v --durations=30 python/tests diff --git a/.github/workflows/remote-integration.yml b/.github/workflows/remote-integration.yml new file mode 100644 index 00000000..68862ebf --- /dev/null +++ b/.github/workflows/remote-integration.yml @@ -0,0 +1,37 @@ +name: LanceDb Cloud Integration Test + +on: + workflow_run: + workflows: [Rust] + types: + - completed + +env: + LANCEDB_PROJECT: ${{ secrets.LANCEDB_PROJECT }} + LANCEDB_API_KEY: ${{ secrets.LANCEDB_API_KEY }} + LANCEDB_REGION: ${{ secrets.LANCEDB_REGION }} + +jobs: + test: + timeout-minutes: 30 + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash + working-directory: rust + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - uses: Swatinem/rust-cache@v2 + with: + workspaces: rust + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y protobuf-compiler libssl-dev + - name: Build + run: cargo build --all-features + - name: Run Integration test + run: cargo test --tests -- --ignored diff --git a/.github/workflows/run_tests/action.yml b/.github/workflows/run_tests/action.yml new file mode 100644 index 00000000..6140cb5c --- /dev/null +++ b/.github/workflows/run_tests/action.yml @@ -0,0 +1,17 @@ +name: run-tests + +description: "Install lance wheel and run unit tests" +inputs: + python-minor-version: + required: true + description: "8 9 10 11 12" +runs: + using: "composite" + steps: + - name: Install lancedb + shell: bash + run: | + pip3 install $(ls target/wheels/lancedb-*.whl)[tests,dev] + - name: pytest + shell: bash + run: pytest -m "not slow" -x -v --durations=30 python/python/tests diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c43a5d4f..d9c5358f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -119,3 +119,4 @@ jobs: $env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT cargo build cargo test + \ No newline at end of file diff --git a/.github/workflows/upload_wheel/action.yml b/.github/workflows/upload_wheel/action.yml new file mode 100644 index 00000000..8494e01e --- /dev/null +++ b/.github/workflows/upload_wheel/action.yml @@ -0,0 +1,29 @@ +name: upload-wheel + +description: "Upload wheels to Pypi" +inputs: + os: + required: true + description: "ubuntu-22.04 or macos-13" + repo: + required: false + description: "pypi or testpypi" + default: "pypi" + token: + required: true + description: "release token for the repo" + +runs: + using: "composite" + steps: + - name: Install dependencies + shell: bash + run: | + python -m pip install --upgrade 
pip + pip install twine + - name: Publish wheel + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ inputs.token }} + shell: bash + run: twine upload --repository ${{ inputs.repo }} target/wheels/lancedb-*.whl diff --git a/.gitignore b/.gitignore index c065383d..46e13e7b 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,11 @@ python/dist **/.hypothesis +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + ## Javascript *.node **/node_modules @@ -34,4 +39,6 @@ dist ## Rust target +**/sccache.log + Cargo.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b8f409a0..93801fcd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,17 +5,8 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace -- repo: https://github.com/psf/black - rev: 22.12.0 - hooks: - - id: black - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.0.277 + rev: v0.2.2 hooks: - id: ruff -- repo: https://github.com/pycqa/isort - rev: 5.12.0 - hooks: - - id: isort - name: isort (python) \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 90f00896..e3eb2f92 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["rust/ffi/node", "rust/vectordb", "nodejs"] +members = ["rust/ffi/node", "rust/lancedb", "nodejs", "python"] # Python package needs to be built by maturin. exclude = ["python"] resolver = "2" @@ -14,10 +14,10 @@ keywords = ["lancedb", "lance", "database", "vector", "search"] categories = ["database-implementations"] [workspace.dependencies] -lance = { "version" = "=0.9.18", "features" = ["dynamodb"] } -lance-index = { "version" = "=0.9.18" } -lance-linalg = { "version" = "=0.9.18" } -lance-testing = { "version" = "=0.9.18" } +lance = { "version" = "=0.10.1", "features" = ["dynamodb"] } +lance-index = { "version" = "=0.10.1" } +lance-linalg = { "version" = "=0.10.1" } +lance-testing = { "version" = "=0.10.1" } # Note that this one does not include pyarrow arrow = { version = "50.0", optional = false } arrow-array = "50.0" diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile new file mode 100644 index 00000000..b314ff64 --- /dev/null +++ b/dockerfiles/Dockerfile @@ -0,0 +1,27 @@ +#Simple base dockerfile that supports basic dependencies required to run lance with FTS and Hybrid Search +#Usage docker build -t lancedb:latest -f Dockerfile . 
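+# Once the image is built it can be tried out with, e.g. (illustrative invocation;
+# assumes the lancedb:latest tag used above):
+#   docker run --rm -it lancedb:latest python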
+FROM python:3.10-slim-buster + +# Install Rust +RUN apt-get update && apt-get install -y curl build-essential && \ + curl https://sh.rustup.rs -sSf | sh -s -- -y + +# Set the environment variable for Rust +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install protobuf compiler +RUN apt-get install -y protobuf-compiler && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN apt-get -y update &&\ + apt-get -y upgrade && \ + apt-get -y install git + + +# Verify installations +RUN python --version && \ + rustc --version && \ + protoc --version + +RUN pip install tantivy lancedb diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index fd2be5e7..0deaaeb4 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -57,6 +57,16 @@ plugins: - https://arrow.apache.org/docs/objects.inv - https://pandas.pydata.org/docs/objects.inv - mkdocs-jupyter +- ultralytics: + verbose: True + enabled: True + default_image: "assets/lancedb_and_lance.png" # Default image for all pages + add_image: True # Automatically add meta image + add_keywords: True # Add page keywords in the header tag + add_share_buttons: True # Add social share buttons + add_authors: False # Display page authors + add_desc: False + add_dates: False markdown_extensions: - admonition @@ -206,7 +216,6 @@ extra_css: extra_javascript: - "extra_js/init_ask_ai_widget.js" - - "extra_js/meta_tag.js" extra: analytics: diff --git a/docs/requirements.txt b/docs/requirements.txt index e5b8bbd3..7f34591e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,4 +2,5 @@ mkdocs==1.5.3 mkdocs-jupyter==0.24.1 mkdocs-material==9.5.3 mkdocstrings[python]==0.20.0 -pydantic \ No newline at end of file +pydantic +mkdocs-ultralytics-plugin==0.0.44 \ No newline at end of file diff --git a/docs/src/extra_js/meta_tag.js b/docs/src/extra_js/meta_tag.js deleted file mode 100644 index b1349980..00000000 --- a/docs/src/extra_js/meta_tag.js +++ /dev/null @@ -1,6 +0,0 @@ -window.addEventListener('load', function() { - var meta = document.createElement('meta'); - meta.setAttribute('property', 'og:image'); - meta.setAttribute('content', '/assets/lancedb_and_lance.png'); - document.head.appendChild(meta); - }); \ No newline at end of file diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md index 4692aef0..72b71804 100644 --- a/docs/src/guides/tables.md +++ b/docs/src/guides/tables.md @@ -636,6 +636,70 @@ The `values` parameter is used to provide the new values for the columns as lite When rows are updated, they are moved out of the index. The row will still show up in ANN queries, but the query will not be as fast as it would be if the row was in the index. If you update a large proportion of rows, consider rebuilding the index afterwards. +## Consistency + +In LanceDB OSS, users can set the `read_consistency_interval` parameter on connections to achieve different levels of read consistency. This parameter determines how frequently the database synchronizes with the underlying storage system to check for updates made by other processes. If another process updates a table, the database will not see the changes until the next synchronization. + +There are three possible settings for `read_consistency_interval`: + +1. **Unset (default)**: The database does not check for updates to tables made by other processes. This provides the best query performance, but means that clients may not see the most up-to-date data. This setting is suitable for applications where the data does not change during the lifetime of the table reference. +2. 
**Zero seconds (Strong consistency)**: The database checks for updates on every read. This provides the strongest consistency guarantees, ensuring that all clients see the latest committed data. However, it has the most overhead. This setting is suitable when consistency matters more than having high QPS.
+3. **Custom interval (Eventual consistency)**: The database checks for updates at a custom interval, such as every 5 seconds. This provides eventual consistency, allowing for some lag between write and read operations. Performance-wise, this is a middle ground between strong consistency and no consistency check. This setting is suitable for applications where immediate consistency is not critical, but clients should see updated data eventually.
+
+!!! tip "Consistency in LanceDB Cloud"
+
+    This is only tunable in LanceDB OSS. In LanceDB Cloud, readers are always eventually consistent.
+
+=== "Python"
+
+    To set strong consistency, use `timedelta(0)`:
+
+    ```python
+    from datetime import timedelta
+    db = lancedb.connect("./.lancedb", read_consistency_interval=timedelta(0))
+    table = db.open_table("my_table")
+    ```
+
+    For eventual consistency, use a custom `timedelta`:
+
+    ```python
+    from datetime import timedelta
+    db = lancedb.connect("./.lancedb", read_consistency_interval=timedelta(seconds=5))
+    table = db.open_table("my_table")
+    ```
+
+    By default, a `Table` will never check for updates from other writers. To manually check for updates you can use `checkout_latest`:
+
+    ```python
+    db = lancedb.connect("./.lancedb")
+    table = db.open_table("my_table")
+
+    # (Other writes happen to my_table from another process)
+
+    # Check for updates
+    table.checkout_latest()
+    ```
+
+=== "JavaScript/Typescript"
+
+    To set strong consistency, use `0`:
+
+    ```javascript
+    const db = await lancedb.connect({ uri: "./.lancedb", readConsistencyInterval: 0 });
+    const table = await db.openTable("my_table");
+    ```
+
+    For eventual consistency, specify the update interval as seconds:
+
+    ```javascript
+    const db = await lancedb.connect({ uri: "./.lancedb", readConsistencyInterval: 5 });
+    const table = await db.openTable("my_table");
+    ```
+
+
 ## What's next?
 
 Learn the best practices on creating an ANN index and getting the most out of it.
\ No newline at end of file diff --git a/node/package-lock.json b/node/package-lock.json index 91981c24..06adef3a 100644 --- a/node/package-lock.json +++ b/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "vectordb", - "version": "0.4.10", + "version": "0.4.11", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "vectordb", - "version": "0.4.10", + "version": "0.4.11", "cpu": [ "x64", "arm64" @@ -53,11 +53,11 @@ "uuid": "^9.0.0" }, "optionalDependencies": { - "@lancedb/vectordb-darwin-arm64": "0.4.10", - "@lancedb/vectordb-darwin-x64": "0.4.10", - "@lancedb/vectordb-linux-arm64-gnu": "0.4.10", - "@lancedb/vectordb-linux-x64-gnu": "0.4.10", - "@lancedb/vectordb-win32-x64-msvc": "0.4.10" + "@lancedb/vectordb-darwin-arm64": "0.4.11", + "@lancedb/vectordb-darwin-x64": "0.4.11", + "@lancedb/vectordb-linux-arm64-gnu": "0.4.11", + "@lancedb/vectordb-linux-x64-gnu": "0.4.11", + "@lancedb/vectordb-win32-x64-msvc": "0.4.11" } }, "node_modules/@75lb/deep-merge": { @@ -329,9 +329,9 @@ } }, "node_modules/@lancedb/vectordb-darwin-arm64": { - "version": "0.4.10", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.10.tgz", - "integrity": "sha512-y/uHOGb0g15pvqv5tdTyZ6oN+0QVpBmZDzKFWW6pPbuSZjB2uPqcs+ti0RB+AUdmS21kavVQqaNsw/HLKEGrHA==", + "version": "0.4.11", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.4.11.tgz", + "integrity": "sha512-JDOKmFnuJPFkA7ZmrzBJolROwSjWr7yMvAbi40uLBc25YbbVezodd30u2EFtIwWwtk1GqNYRZ49FZOElKYeC/Q==", "cpu": [ "arm64" ], @@ -341,9 +341,9 @@ ] }, "node_modules/@lancedb/vectordb-darwin-x64": { - "version": "0.4.10", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.10.tgz", - "integrity": "sha512-XbfR58OkQpAe0xMSTrwJh9ZjGSzG9EZ7zwO6HfYem8PxcLYAcC6eWRWoSG/T0uObyrPTcYYyvHsp0eNQWYBFAQ==", + "version": "0.4.11", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.4.11.tgz", + "integrity": "sha512-iy6r+8tp2v1EFgJV52jusXtxgO6NY6SkpOdX41xPqN2mQWMkfUAR9Xtks1mgknjPOIKH4MRc8ZS0jcW/UWmilQ==", "cpu": [ "x64" ], @@ -353,9 +353,9 @@ ] }, "node_modules/@lancedb/vectordb-linux-arm64-gnu": { - "version": "0.4.10", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.10.tgz", - "integrity": "sha512-x40WKH9b+KxorRmKr9G7fv8p5mMj8QJQvRMA0v6v+nbZHr2FLlAZV+9mvhHOnm4AGIkPP5335cUgv6Qz6hgwkQ==", + "version": "0.4.11", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.4.11.tgz", + "integrity": "sha512-5K6IVcTMuH0SZBjlqB5Gg39WC889FpTwIWKufxzQMMXrzxo5J3lKUHVoR28RRlNhDF2d9kZXBEyCpIfDFsV9iQ==", "cpu": [ "arm64" ], @@ -365,9 +365,9 @@ ] }, "node_modules/@lancedb/vectordb-linux-x64-gnu": { - "version": "0.4.10", - "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.10.tgz", - "integrity": "sha512-CTGPpuzlqq2nVjUxI9gAJOT1oBANIovtIaFsOmBSnEAHgX7oeAxKy2b6L/kJzsgqSzvR5vfLwYcWFrr6ZmBxSA==", + "version": "0.4.11", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.4.11.tgz", + "integrity": "sha512-hF9ZChsdqKqqnivOzd9mE7lC3PmhZadXtwThi2RrsPiOLoEaGDfmr6Ni3amVQnB3bR8YEJtTxdQxe0NC4uW/8g==", "cpu": [ "x64" ], @@ -377,9 +377,9 @@ ] }, "node_modules/@lancedb/vectordb-win32-x64-msvc": { - "version": "0.4.10", - "resolved": 
"https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.10.tgz", - "integrity": "sha512-Fd7r74coZyrKzkfXg4WthqOL+uKyJyPTia6imcrMNqKOlTGdKmHf02Qi2QxWZrFaabkRYo4Tpn5FeRJ3yYX8CA==", + "version": "0.4.11", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.4.11.tgz", + "integrity": "sha512-0+9ut1ccKoqIyGxsVixwx3771Z+DXpl5WfSmOeA8kf3v3jlOg2H+0YUahiXLDid2ju+yeLPrAUYm7A1gKHVhew==", "cpu": [ "x64" ], diff --git a/node/package.json b/node/package.json index d5dcb744..ec18d629 100644 --- a/node/package.json +++ b/node/package.json @@ -1,12 +1,12 @@ { "name": "vectordb", - "version": "0.4.10", + "version": "0.4.11", "description": " Serverless, low-latency vector database for AI applications", "main": "dist/index.js", "types": "dist/index.d.ts", "scripts": { "tsc": "tsc -b", - "build": "npm run tsc && cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json", + "build": "npm run tsc && cargo-cp-artifact --artifact cdylib lancedb-node index.node -- cargo build --message-format=json", "build-release": "npm run build -- --release", "test": "npm run tsc && mocha -recursive dist/test", "integration-test": "npm run tsc && mocha -recursive dist/integration_test", @@ -61,11 +61,13 @@ "uuid": "^9.0.0" }, "dependencies": { - "@apache-arrow/ts": "^14.0.2", "@neon-rs/load": "^0.0.74", - "apache-arrow": "^14.0.2", "axios": "^1.4.0" }, + "peerDependencies": { + "@apache-arrow/ts": "^14.0.2", + "apache-arrow": "^14.0.2" + }, "os": [ "darwin", "linux", @@ -85,10 +87,10 @@ } }, "optionalDependencies": { - "@lancedb/vectordb-darwin-arm64": "0.4.10", - "@lancedb/vectordb-darwin-x64": "0.4.10", - "@lancedb/vectordb-linux-arm64-gnu": "0.4.10", - "@lancedb/vectordb-linux-x64-gnu": "0.4.10", - "@lancedb/vectordb-win32-x64-msvc": "0.4.10" + "@lancedb/vectordb-darwin-arm64": "0.4.11", + "@lancedb/vectordb-darwin-x64": "0.4.11", + "@lancedb/vectordb-linux-arm64-gnu": "0.4.11", + "@lancedb/vectordb-linux-x64-gnu": "0.4.11", + "@lancedb/vectordb-win32-x64-msvc": "0.4.11" } -} +} \ No newline at end of file diff --git a/node/src/index.ts b/node/src/index.ts index 73626eb5..69aea280 100644 --- a/node/src/index.ts +++ b/node/src/index.ts @@ -42,7 +42,10 @@ const { tableCompactFiles, tableListIndices, tableIndexStats, - tableSchema + tableSchema, + tableAddColumns, + tableAlterColumns, + tableDropColumns // eslint-disable-next-line @typescript-eslint/no-var-requires } = require('../native.js') @@ -96,6 +99,19 @@ export interface ConnectionOptions { * This is useful for local testing. */ hostOverride?: string + + /** + * (For LanceDB OSS only): The interval, in seconds, at which to check for + * updates to the table from other processes. If None, then consistency is not + * checked. For performance reasons, this is the default. For strong + * consistency, set this to zero seconds. Then every read will check for + * updates from other processes. As a compromise, you can set this to a + * non-zero value for eventual consistency. If more than that interval + * has passed since the last check, then the table will be checked for updates. + * Note: this consistency only applies to read operations. Write operations are + * always consistent. 
+   */
+  readConsistencyInterval?: number
 }
 
 function getAwsArgs (opts: ConnectionOptions): any[] {
@@ -181,7 +197,8 @@ export async function connect (
       opts.awsCredentials?.accessKeyId,
       opts.awsCredentials?.secretKey,
       opts.awsCredentials?.sessionToken,
-      opts.awsRegion
+      opts.awsRegion,
+      opts.readConsistencyInterval
     )
     return new LocalConnection(db, opts)
   }
@@ -324,6 +341,7 @@ export interface Table {
    *
    * @param column The column to index
    * @param replace If false, fail if an index already exists on the column.
+   *                For remote connections, this is always treated as true.
    *
    * Scalar indices, like vector indices, can be used to speed up scans. A scalar
    * index can speed up scans that contain filter expressions on the indexed column.
@@ -367,7 +385,7 @@ export interface Table {
    * await table.createScalarIndex('my_col')
    * ```
    */
-  createScalarIndex: (column: string, replace: boolean) => Promise<void>
+  createScalarIndex: (column: string, replace?: boolean) => Promise<void>
 
   /**
    * Returns the number of rows in this table.
@@ -486,6 +504,59 @@ export interface Table {
   filter(value: string): Query
 
   schema: Promise<Schema>
 
+  // TODO: Support BatchUDF
+  /**
+   * Add new columns with defined values.
+   *
+   * @param newColumnTransforms pairs of column names and the SQL expression to use
+   *                            to calculate the value of the new column. These
+   *                            expressions will be evaluated for each row in the
+   *                            table, and can reference existing columns in the table.
+   */
+  addColumns(newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void>
+
+  /**
+   * Alter the name or nullability of columns.
+   *
+   * @param columnAlterations One or more alterations to apply to columns.
+   */
+  alterColumns(columnAlterations: ColumnAlteration[]): Promise<void>
+
+  /**
+   * Drop one or more columns from the dataset.
+   *
+   * This is a metadata-only operation and does not remove the data from the
+   * underlying storage. In order to remove the data, you must subsequently
+   * call ``compact_files`` to rewrite the data without the removed columns and
+   * then call ``cleanup_files`` to remove the old files.
+   *
+   * @param columnNames The names of the columns to drop. These can be nested
+   *                    column references (e.g. "a.b.c") or top-level column
+   *                    names (e.g. "a").
+   */
+  dropColumns(columnNames: string[]): Promise<void>
+}
+
+/**
+ * A definition of a column alteration. The alteration changes the column at
+ * `path` to have the new name `rename` and to be nullable if `nullable` is
+ * true. At least one of `rename` or `nullable` must be provided.
+ */
+export interface ColumnAlteration {
+  /**
+   * The path to the column to alter. This is a dot-separated path to the column.
+   * If it is a top-level column then it is just the name of the column. If it is
+   * a nested column then it is the path to the column, e.g. "a.b.c" for a column
+   * `c` nested inside a column `b` nested inside a column `a`.
+   */
+  path: string
+  /**
+   * The new name of the column. If not provided then the name will not be changed.
+   * This must be distinct from the names of all other columns in the table.
+   */
+  rename?: string
+  /**
+   * Set the new nullability. Note that a nullable column cannot be made non-nullable.
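+   *
+   * As a usage sketch (column names are illustrative and mirror the test
+   * suite), a rename and a nullability change can be combined in one call:
+   *
+   * ```ts
+   * await table.alterColumns([
+   *   { path: 'id', rename: 'new_id' },
+   *   { path: 'price', nullable: true }
+   * ])
+   * ```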
+   */
+  nullable?: boolean
+}
 
 export interface UpdateArgs {
@@ -844,7 +915,10 @@ export class LocalTable implements Table {
     })
   }
 
-  async createScalarIndex (column: string, replace: boolean): Promise<void> {
+  async createScalarIndex (column: string, replace?: boolean): Promise<void> {
+    if (replace === undefined) {
+      replace = true
+    }
     return tableCreateScalarIndex.call(this._tbl, column, replace)
   }
 
@@ -1014,6 +1088,18 @@ export class LocalTable implements Table {
       return false
     }
   }
+
+  async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
+    return tableAddColumns.call(this._tbl, newColumnTransforms)
+  }
+
+  async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
+    return tableAlterColumns.call(this._tbl, columnAlterations)
+  }
+
+  async dropColumns (columnNames: string[]): Promise<void> {
+    return tableDropColumns.call(this._tbl, columnNames)
+  }
 }
 
 export interface CleanupStats {
diff --git a/node/src/remote/index.ts b/node/src/remote/index.ts
index 9255f31f..6e6e590a 100644
--- a/node/src/remote/index.ts
+++ b/node/src/remote/index.ts
@@ -25,7 +25,8 @@ import {
   type UpdateArgs,
   type UpdateSqlArgs,
   makeArrowTable,
-  type MergeInsertArgs
+  type MergeInsertArgs,
+  type ColumnAlteration
 } from '../index'
 
 import { Query } from '../query'
@@ -396,7 +397,7 @@ export class RemoteTable implements Table {
     }
 
     const column = indexParams.column ?? 'vector'
-    const indexType = 'vector' // only vector index is supported for remote connections
+    const indexType = 'vector'
     const metricType = indexParams.metric_type ?? 'L2'
     const indexCacheSize = indexParams.index_cache_size ?? null
 
@@ -419,8 +420,25 @@
     }
   }
 
-  async createScalarIndex (column: string, replace: boolean): Promise<void> {
-    throw new Error('Not implemented')
+  async createScalarIndex (column: string): Promise<void> {
+    const indexType = 'scalar'
+
+    const data = {
+      column,
+      index_type: indexType,
+      replace: true
+    }
+    const res = await this._client.post(
+      `/v1/table/${this._name}/create_scalar_index/`,
+      data
+    )
+    if (res.status !== 200) {
+      throw new Error(
+        `Server Error, status: ${res.status}, ` +
+        // eslint-disable-next-line @typescript-eslint/restrict-template-expressions
+        `message: ${res.statusText}: ${res.data}`
+      )
+    }
   }
 
   async countRows (): Promise<number> {
@@ -474,4 +492,16 @@
       numUnindexedRows: results.data.num_unindexed_rows
     }
   }
+
+  async addColumns (newColumnTransforms: Array<{ name: string, valueSql: string }>): Promise<void> {
+    throw new Error('Add columns is not yet supported in LanceDB Cloud.')
+  }
+
+  async alterColumns (columnAlterations: ColumnAlteration[]): Promise<void> {
+    throw new Error('Alter columns is not yet supported in LanceDB Cloud.')
+  }
+
+  async dropColumns (columnNames: string[]): Promise<void> {
+    throw new Error('Drop columns is not yet supported in LanceDB Cloud.')
+  }
 }
diff --git a/node/src/test/test.ts b/node/src/test/test.ts
index 20b05087..fce5adba 100644
--- a/node/src/test/test.ts
+++ b/node/src/test/test.ts
@@ -37,8 +37,10 @@ import {
   Utf8,
   Table as ArrowTable,
   vectorFromArray,
+  Float64,
   Float32,
-  Float16
+  Float16,
+  Int64
 } from 'apache-arrow'
 
 const expect = chai.expect
@@ -196,7 +198,7 @@ describe('LanceDB client', function () {
     const table = await con.openTable('vectors')
     const results = await table
       .search([0.1, 0.1])
-      .select(['is_active'])
+      .select(['is_active', 'vector'])
       .execute()
     assert.equal(results.length, 2)
     // vector and _distance are always returned
@@ -1057,3 +1059,63 @@ describe('Compact 
and cleanup', function () { assert.equal(await table.countRows(), 3) }) }) + +describe('schema evolution', function () { + // Create a new sample table + it('can add a new column to the schema', async function () { + const dir = await track().mkdir('lancejs') + const con = await lancedb.connect(dir) + const table = await con.createTable('vectors', [ + { id: 1n, vector: [0.1, 0.2] } + ]) + + await table.addColumns([{ name: 'price', valueSql: 'cast(10.0 as float)' }]) + + const expectedSchema = new Schema([ + new Field('id', new Int64()), + new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true))), + new Field('price', new Float32()) + ]) + expect(await table.schema).to.deep.equal(expectedSchema) + }) + + it('can alter the columns in the schema', async function () { + const dir = await track().mkdir('lancejs') + const con = await lancedb.connect(dir) + const schema = new Schema([ + new Field('id', new Int64(), false), + new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true))), + new Field('price', new Float64(), false) + ]) + const table = await con.createTable('vectors', [ + { id: 1n, vector: [0.1, 0.2], price: 10.0 } + ]) + expect(await table.schema).to.deep.equal(schema) + + await table.alterColumns([ + { path: 'id', rename: 'new_id' }, + { path: 'price', nullable: true } + ]) + + const expectedSchema = new Schema([ + new Field('new_id', new Int64(), false), + new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true))), + new Field('price', new Float64(), true) + ]) + expect(await table.schema).to.deep.equal(expectedSchema) + }) + + it('can drop a column from the schema', async function () { + const dir = await track().mkdir('lancejs') + const con = await lancedb.connect(dir) + const table = await con.createTable('vectors', [ + { id: 1n, vector: [0.1, 0.2] } + ]) + await table.dropColumns(['vector']) + + const expectedSchema = new Schema([ + new Field('id', new Int64(), false) + ]) + expect(await table.schema).to.deep.equal(expectedSchema) + }) +}) diff --git a/nodejs/.eslintrc.js b/nodejs/.eslintrc.js index ecf09807..cb47e56f 100644 --- a/nodejs/.eslintrc.js +++ b/nodejs/.eslintrc.js @@ -18,5 +18,5 @@ module.exports = { "@typescript-eslint/method-signature-style": "off", "@typescript-eslint/no-explicit-any": "off", }, - ignorePatterns: ["node_modules/", "dist/", "build/", "vectordb/native.*"], + ignorePatterns: ["node_modules/", "dist/", "build/", "lancedb/native.*"], }; diff --git a/nodejs/Cargo.toml b/nodejs/Cargo.toml index 740e34ef..0d734209 100644 --- a/nodejs/Cargo.toml +++ b/nodejs/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "vectordb-nodejs" +name = "lancedb-nodejs" edition.workspace = true version = "0.0.0" license.workspace = true @@ -16,7 +16,7 @@ arrow-ipc.workspace = true futures.workspace = true lance-linalg.workspace = true lance.workspace = true -vectordb = { path = "../rust/vectordb" } +lancedb = { path = "../rust/lancedb" } napi = { version = "2.15", default-features = false, features = [ "napi7", "async" diff --git a/nodejs/__test__/arrow.test.ts b/nodejs/__test__/arrow.test.ts index 907e25b1..cb4a300f 100644 --- a/nodejs/__test__/arrow.test.ts +++ b/nodejs/__test__/arrow.test.ts @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-import { makeArrowTable, toBuffer } from "../vectordb/arrow"; +import { makeArrowTable, toBuffer } from "../lancedb/arrow"; import { Int64, Field, diff --git a/nodejs/__test__/connection.test.ts b/nodejs/__test__/connection.test.ts new file mode 100644 index 00000000..4ffcb906 --- /dev/null +++ b/nodejs/__test__/connection.test.ts @@ -0,0 +1,34 @@ +// Copyright 2024 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import * as os from "os"; +import * as path from "path"; +import * as fs from "fs"; + +import { connect } from "../dist/index.js"; + +describe("when working with a connection", () => { + + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "test-connection")); + + it("should fail if creating table twice, unless overwrite is true", async() => { + const db = await connect(tmpDir); + let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]); + await expect(tbl.countRows()).resolves.toBe(2); + await expect(db.createTable("test", [{ id: 1 }, { id: 2 }])).rejects.toThrow(); + tbl = await db.createTable("test", [{ id: 3 }], { mode: "overwrite" }); + await expect(tbl.countRows()).resolves.toBe(1); + }) + +}); diff --git a/nodejs/__test__/index.test.ts b/nodejs/__test__/index.test.ts index 4a7c8305..dd7266ec 100644 --- a/nodejs/__test__/index.test.ts +++ b/nodejs/__test__/index.test.ts @@ -29,6 +29,6 @@ test("open database", async () => { const tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]); expect(await db.tableNames()).toStrictEqual(["test"]); - const schema = tbl.schema; + const schema = await tbl.schema(); expect(schema).toEqual(new Schema([new Field("id", new Float64(), true)])); }); diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index a8ccf989..6de039ad 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -17,7 +17,7 @@ import * as path from "path"; import * as fs from "fs"; import { connect } from "../dist"; -import { Schema, Field, Float32, Int32, FixedSizeList } from "apache-arrow"; +import { Schema, Field, Float32, Int32, FixedSizeList, Int64, Float64 } from "apache-arrow"; import { makeArrowTable } from "../dist/arrow"; describe("Test creating index", () => { @@ -181,3 +181,102 @@ describe("Test creating index", () => { // TODO: check index type. 
}); }); + +describe("Read consistency interval", () => { + let tmpDir: string; + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "read-consistency-")); + }); + + // const intervals = [undefined, 0, 0.1]; + const intervals = [0]; + test.each(intervals)("read consistency interval %p", async (interval) => { + const db = await connect({ uri: tmpDir }); + const table = await db.createTable("my_table", [{ id: 1 }]); + + const db2 = await connect({ uri: tmpDir, readConsistencyInterval: interval }); + const table2 = await db2.openTable("my_table"); + expect(await table2.countRows()).toEqual(await table.countRows()); + + await table.add([{ id: 2 }]); + + if (interval === undefined) { + expect(await table2.countRows()).toEqual(1); + // TODO: once we implement time travel we can uncomment this part of the test. + // await table2.checkout_latest(); + // expect(await table2.countRows()).toEqual(2); + } else if (interval === 0) { + expect(await table2.countRows()).toEqual(2); + } else { + // interval == 0.1 + expect(await table2.countRows()).toEqual(1); + await new Promise(r => setTimeout(r, 100)); + expect(await table2.countRows()).toEqual(2); + } + }); +}); + + +describe('schema evolution', function () { + let tmpDir: string; + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "schema-evolution-")); + }); + + // Create a new sample table + it('can add a new column to the schema', async function () { + const con = await connect(tmpDir) + const table = await con.createTable('vectors', [ + { id: 1n, vector: [0.1, 0.2] } + ]) + + await table.addColumns([{ name: 'price', valueSql: 'cast(10.0 as float)' }]) + + const expectedSchema = new Schema([ + new Field('id', new Int64(), true), + new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true)), true), + new Field('price', new Float32(), false) + ]) + expect(await table.schema()).toEqual(expectedSchema) + }); + + it('can alter the columns in the schema', async function () { + const con = await connect(tmpDir) + const schema = new Schema([ + new Field('id', new Int64(), true), + new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true)), true), + new Field('price', new Float64(), false) + ]) + const table = await con.createTable('vectors', [ + { id: 1n, vector: [0.1, 0.2] } + ]) + // Can create a non-nullable column only through addColumns at the moment. 
+    await table.addColumns([{ name: 'price', valueSql: 'cast(10.0 as double)' }])
+    expect(await table.schema()).toEqual(schema)
+
+    await table.alterColumns([
+      { path: 'id', rename: 'new_id' },
+      { path: 'price', nullable: true }
+    ])
+
+    const expectedSchema = new Schema([
+      new Field('new_id', new Int64(), true),
+      new Field('vector', new FixedSizeList(2, new Field('item', new Float32(), true)), true),
+      new Field('price', new Float64(), true)
+    ])
+    expect(await table.schema()).toEqual(expectedSchema)
+  });
+
+  it('can drop a column from the schema', async function () {
+    const con = await connect(tmpDir)
+    const table = await con.createTable('vectors', [
+      { id: 1n, vector: [0.1, 0.2] }
+    ])
+    await table.dropColumns(['vector'])
+
+    const expectedSchema = new Schema([
+      new Field('id', new Int64(), true)
+    ])
+    expect(await table.schema()).toEqual(expectedSchema)
+  });
+});
\ No newline at end of file
diff --git a/nodejs/__test__/tsconfig.json b/nodejs/__test__/tsconfig.json
new file mode 100644
index 00000000..f127268c
--- /dev/null
+++ b/nodejs/__test__/tsconfig.json
@@ -0,0 +1,15 @@
+{
+  "extends": "../tsconfig.json",
+  "compilerOptions": {
+    "outDir": "./dist/spec",
+    "module": "commonjs",
+    "target": "es2022",
+    "types": [
+      "jest",
+      "node"
+    ]
+  },
+  "include": [
+    "**/*",
+  ]
+}
diff --git a/nodejs/vectordb/arrow.ts b/nodejs/lancedb/arrow.ts
similarity index 100%
rename from nodejs/vectordb/arrow.ts
rename to nodejs/lancedb/arrow.ts
diff --git a/nodejs/vectordb/connection.ts b/nodejs/lancedb/connection.ts
similarity index 69%
rename from nodejs/vectordb/connection.ts
rename to nodejs/lancedb/connection.ts
index 723d8c85..46b1109f 100644
--- a/nodejs/vectordb/connection.ts
+++ b/nodejs/lancedb/connection.ts
@@ -17,6 +17,24 @@ import { Connection as _NativeConnection } from "./native";
 import { Table } from "./table";
 import { Table as ArrowTable } from "apache-arrow";
 
+export interface CreateTableOptions {
+  /**
+   * The mode to use when creating the table.
+   *
+   * If this is set to "create" and the table already exists then either
+   * an error will be thrown or, if existOk is true, then nothing will
+   * happen. Any provided data will be ignored.
+   *
+   * If this is set to "overwrite" then any existing table will be replaced.
+   */
+  mode: "create" | "overwrite";
+  /**
+   * If this is true and the table already exists and the mode is "create"
+   * then no error will be raised.
+   */
+  existOk: boolean;
+}
+
 /**
  * A LanceDB Connection that allows you to open tables and create new ones.
  *
@@ -53,10 +71,18 @@
   */
  async createTable(
    name: string,
-    data: Record<string, unknown>[] | ArrowTable
+    data: Record<string, unknown>[] | ArrowTable,
+    options?: Partial<CreateTableOptions>
  ): Promise<Table> {
+    let mode: string = options?.mode ?? "create";
+    const existOk = options?.existOk ?? 
false; + + if (mode === "create" && existOk) { + mode = "exist_ok"; + } + const buf = toBuffer(data); - const innerTable = await this.inner.createTable(name, buf); + const innerTable = await this.inner.createTable(name, buf, mode); return new Table(innerTable); } diff --git a/nodejs/vectordb/index.ts b/nodejs/lancedb/index.ts similarity index 94% rename from nodejs/vectordb/index.ts rename to nodejs/lancedb/index.ts index 3e3cafaf..fab396c4 100644 --- a/nodejs/vectordb/index.ts +++ b/nodejs/lancedb/index.ts @@ -53,12 +53,12 @@ export async function connect( opts = Object.assign( { uri: "", - apiKey: "", - hostOverride: "", + apiKey: undefined, + hostOverride: undefined, }, args ); } - const nativeConn = await NativeConnection.new(opts.uri); + const nativeConn = await NativeConnection.new(opts); return new Connection(nativeConn); } diff --git a/nodejs/vectordb/indexer.ts b/nodejs/lancedb/indexer.ts similarity index 100% rename from nodejs/vectordb/indexer.ts rename to nodejs/lancedb/indexer.ts diff --git a/nodejs/lancedb/native.d.ts b/nodejs/lancedb/native.d.ts new file mode 100644 index 00000000..e72b54cb --- /dev/null +++ b/nodejs/lancedb/native.d.ts @@ -0,0 +1,127 @@ +/* tslint:disable */ +/* eslint-disable */ + +/* auto-generated by NAPI-RS */ + +export const enum IndexType { + Scalar = 0, + IvfPq = 1 +} +export const enum MetricType { + L2 = 0, + Cosine = 1, + Dot = 2 +} +/** + * A definition of a column alteration. The alteration changes the column at + * `path` to have the new name `name`, to be nullable if `nullable` is true, + * and to have the data type `data_type`. At least one of `rename` or `nullable` + * must be provided. + */ +export interface ColumnAlteration { + /** + * The path to the column to alter. This is a dot-separated path to the column. + * If it is a top-level column then it is just the name of the column. If it is + * a nested column then it is the path to the column, e.g. "a.b.c" for a column + * `c` nested inside a column `b` nested inside a column `a`. + */ + path: string + /** + * The new name of the column. If not provided then the name will not be changed. + * This must be distinct from the names of all other columns in the table. + */ + rename?: string + /** Set the new nullability. Note that a nullable column cannot be made non-nullable. */ + nullable?: boolean +} +/** A definition of a new column to add to a table. */ +export interface AddColumnsSql { + /** The name of the new column. */ + name: string + /** + * The values to populate the new column with, as a SQL expression. + * The expression can reference other columns in the table. + */ + valueSql: string +} +export interface ConnectionOptions { + uri: string + apiKey?: string + hostOverride?: string + /** + * (For LanceDB OSS only): The interval, in seconds, at which to check for + * updates to the table from other processes. If None, then consistency is not + * checked. For performance reasons, this is the default. For strong + * consistency, set this to zero seconds. Then every read will check for + * updates from other processes. As a compromise, you can set this to a + * non-zero value for eventual consistency. If more than that interval + * has passed since the last check, then the table will be checked for updates. + * Note: this consistency only applies to read operations. Write operations are + * always consistent. + */ + readConsistencyInterval?: number +} +/** Write mode for writing a table. 
*/
+export const enum WriteMode {
+  Create = 'Create',
+  Append = 'Append',
+  Overwrite = 'Overwrite'
+}
+/** Write options when creating a Table. */
+export interface WriteOptions {
+  mode?: WriteMode
+}
+export function connect(options: ConnectionOptions): Promise<Connection>
+export class Connection {
+  /** Create a new Connection instance from the given URI. */
+  static new(options: ConnectionOptions): Promise<Connection>
+  /** List all tables in the dataset. */
+  tableNames(): Promise<Array<string>>
+  /**
+   * Create table from an Apache Arrow IPC (file) buffer.
+   *
+   * Parameters:
+   * - name: The name of the table.
+   * - buf: The buffer containing the IPC file.
+   * - mode: The write mode: "create", "overwrite" or "exist_ok".
+   *
+   */
+  createTable(name: string, buf: Buffer, mode: string): Promise<Table>
+  
openTable(name: string): Promise<Table>
+  
/** Drop the table with the given name, or raise an error if it does not exist. */
+  dropTable(name: string): Promise<void>
+}
+export class IndexBuilder {
+  replace(v: boolean): void
+  column(c: string): void
+  name(name: string): void
+  ivfPq(metricType?: MetricType | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): void
+  scalar(): void
+  build(): Promise<void>
+}
+/** Typescript-style Async Iterator over RecordBatches */
+export class RecordBatchIterator {
+  next(): Promise<Buffer | null>
+}
+export class Query {
+  column(column: string): void
+  filter(filter: string): void
+  select(columns: Array<string>): void
+  limit(limit: number): void
+  prefilter(prefilter: boolean): void
+  nearestTo(vector: Float32Array): void
+  refineFactor(refineFactor: number): void
+  nprobes(nprobe: number): void
+  executeStream(): Promise<RecordBatchIterator>
+}
+export class Table {
+  /** Return Schema as empty Arrow IPC file. */
+  schema(): Promise<Buffer>
+  add(buf: Buffer): Promise<void>
+  countRows(filter?: string | undefined | null): Promise<number>
+  delete(predicate: string): Promise<void>
+  createIndex(): IndexBuilder
+  query(): Query
+  addColumns(transforms: Array<AddColumnsSql>): Promise<void>
+  alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
+  dropColumns(columns: Array<string>): Promise<void>
+}
diff --git a/nodejs/vectordb/native.js b/nodejs/lancedb/native.js
similarity index 60%
rename from nodejs/vectordb/native.js
rename to nodejs/lancedb/native.js
index 4abf5eb5..a4dedff7 100644
--- a/nodejs/vectordb/native.js
+++ b/nodejs/lancedb/native.js
@@ -32,24 +32,24 @@ switch (platform) {
   case 'android':
     switch (arch) {
       case 'arm64':
-        localFileExisted = existsSync(join(__dirname, 'vectordb-nodejs.android-arm64.node'))
+        localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.android-arm64.node'))
         try {
           if (localFileExisted) {
-            nativeBinding = require('./vectordb-nodejs.android-arm64.node')
+            nativeBinding = require('./lancedb-nodejs.android-arm64.node')
           } else {
-            nativeBinding = require('vectordb-android-arm64')
+            nativeBinding = require('lancedb-android-arm64')
           }
         } catch (e) {
           loadError = e
         }
         break
       case 'arm':
-        localFileExisted = existsSync(join(__dirname, 'vectordb-nodejs.android-arm-eabi.node'))
+        localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.android-arm-eabi.node'))
         try {
           if (localFileExisted) {
-            nativeBinding = require('./vectordb-nodejs.android-arm-eabi.node')
+            nativeBinding = require('./lancedb-nodejs.android-arm-eabi.node')
          } else {
-            nativeBinding = require('vectordb-android-arm-eabi')
+            nativeBinding = require('lancedb-android-arm-eabi')
           }
         } catch (e) {
           loadError = e
@@ -63,13 +63,13 @@
     switch (arch) {
       case 'x64':
         localFileExisted = existsSync(
-          join(__dirname, 'vectordb-nodejs.win32-x64-msvc.node')
+          join(__dirname, 'lancedb-nodejs.win32-x64-msvc.node')
         )
         try {
           if (localFileExisted) {
-            nativeBinding = require('./vectordb-nodejs.win32-x64-msvc.node')
+            nativeBinding = require('./lancedb-nodejs.win32-x64-msvc.node')
           } else {
-            nativeBinding = require('vectordb-win32-x64-msvc')
+            nativeBinding = require('lancedb-win32-x64-msvc')
           }
         } catch (e) {
           loadError = e
         }
         break
       case 'ia32':
         localFileExisted = existsSync(
-          join(__dirname, 'vectordb-nodejs.win32-ia32-msvc.node')
+          join(__dirname, 'lancedb-nodejs.win32-ia32-msvc.node')
         )
         try {
           if (localFileExisted) {
-            nativeBinding = require('./vectordb-nodejs.win32-ia32-msvc.node')
+            nativeBinding = 
require('./lancedb-nodejs.win32-ia32-msvc.node') } else { - nativeBinding = require('vectordb-win32-ia32-msvc') + nativeBinding = require('lancedb-win32-ia32-msvc') } } catch (e) { loadError = e @@ -91,13 +91,13 @@ switch (platform) { break case 'arm64': localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.win32-arm64-msvc.node') + join(__dirname, 'lancedb-nodejs.win32-arm64-msvc.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.win32-arm64-msvc.node') + nativeBinding = require('./lancedb-nodejs.win32-arm64-msvc.node') } else { - nativeBinding = require('vectordb-win32-arm64-msvc') + nativeBinding = require('lancedb-win32-arm64-msvc') } } catch (e) { loadError = e @@ -108,23 +108,23 @@ switch (platform) { } break case 'darwin': - localFileExisted = existsSync(join(__dirname, 'vectordb-nodejs.darwin-universal.node')) + localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.darwin-universal.node')) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.darwin-universal.node') + nativeBinding = require('./lancedb-nodejs.darwin-universal.node') } else { - nativeBinding = require('vectordb-darwin-universal') + nativeBinding = require('lancedb-darwin-universal') } break } catch {} switch (arch) { case 'x64': - localFileExisted = existsSync(join(__dirname, 'vectordb-nodejs.darwin-x64.node')) + localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.darwin-x64.node')) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.darwin-x64.node') + nativeBinding = require('./lancedb-nodejs.darwin-x64.node') } else { - nativeBinding = require('vectordb-darwin-x64') + nativeBinding = require('lancedb-darwin-x64') } } catch (e) { loadError = e @@ -132,13 +132,13 @@ switch (platform) { break case 'arm64': localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.darwin-arm64.node') + join(__dirname, 'lancedb-nodejs.darwin-arm64.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.darwin-arm64.node') + nativeBinding = require('./lancedb-nodejs.darwin-arm64.node') } else { - nativeBinding = require('vectordb-darwin-arm64') + nativeBinding = require('lancedb-darwin-arm64') } } catch (e) { loadError = e @@ -152,12 +152,12 @@ switch (platform) { if (arch !== 'x64') { throw new Error(`Unsupported architecture on FreeBSD: ${arch}`) } - localFileExisted = existsSync(join(__dirname, 'vectordb-nodejs.freebsd-x64.node')) + localFileExisted = existsSync(join(__dirname, 'lancedb-nodejs.freebsd-x64.node')) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.freebsd-x64.node') + nativeBinding = require('./lancedb-nodejs.freebsd-x64.node') } else { - nativeBinding = require('vectordb-freebsd-x64') + nativeBinding = require('lancedb-freebsd-x64') } } catch (e) { loadError = e @@ -168,26 +168,26 @@ switch (platform) { case 'x64': if (isMusl()) { localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-x64-musl.node') + join(__dirname, 'lancedb-nodejs.linux-x64-musl.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-x64-musl.node') + nativeBinding = require('./lancedb-nodejs.linux-x64-musl.node') } else { - nativeBinding = require('vectordb-linux-x64-musl') + nativeBinding = require('lancedb-linux-x64-musl') } } catch (e) { loadError = e } } else { localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-x64-gnu.node') + join(__dirname, 'lancedb-nodejs.linux-x64-gnu.node') ) try { if 
(localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-x64-gnu.node') + nativeBinding = require('./lancedb-nodejs.linux-x64-gnu.node') } else { - nativeBinding = require('vectordb-linux-x64-gnu') + nativeBinding = require('lancedb-linux-x64-gnu') } } catch (e) { loadError = e @@ -197,26 +197,26 @@ switch (platform) { case 'arm64': if (isMusl()) { localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-arm64-musl.node') + join(__dirname, 'lancedb-nodejs.linux-arm64-musl.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-arm64-musl.node') + nativeBinding = require('./lancedb-nodejs.linux-arm64-musl.node') } else { - nativeBinding = require('vectordb-linux-arm64-musl') + nativeBinding = require('lancedb-linux-arm64-musl') } } catch (e) { loadError = e } } else { localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-arm64-gnu.node') + join(__dirname, 'lancedb-nodejs.linux-arm64-gnu.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-arm64-gnu.node') + nativeBinding = require('./lancedb-nodejs.linux-arm64-gnu.node') } else { - nativeBinding = require('vectordb-linux-arm64-gnu') + nativeBinding = require('lancedb-linux-arm64-gnu') } } catch (e) { loadError = e @@ -225,13 +225,13 @@ switch (platform) { break case 'arm': localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-arm-gnueabihf.node') + join(__dirname, 'lancedb-nodejs.linux-arm-gnueabihf.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-arm-gnueabihf.node') + nativeBinding = require('./lancedb-nodejs.linux-arm-gnueabihf.node') } else { - nativeBinding = require('vectordb-linux-arm-gnueabihf') + nativeBinding = require('lancedb-linux-arm-gnueabihf') } } catch (e) { loadError = e @@ -240,26 +240,26 @@ switch (platform) { case 'riscv64': if (isMusl()) { localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-riscv64-musl.node') + join(__dirname, 'lancedb-nodejs.linux-riscv64-musl.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-riscv64-musl.node') + nativeBinding = require('./lancedb-nodejs.linux-riscv64-musl.node') } else { - nativeBinding = require('vectordb-linux-riscv64-musl') + nativeBinding = require('lancedb-linux-riscv64-musl') } } catch (e) { loadError = e } } else { localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-riscv64-gnu.node') + join(__dirname, 'lancedb-nodejs.linux-riscv64-gnu.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-riscv64-gnu.node') + nativeBinding = require('./lancedb-nodejs.linux-riscv64-gnu.node') } else { - nativeBinding = require('vectordb-linux-riscv64-gnu') + nativeBinding = require('lancedb-linux-riscv64-gnu') } } catch (e) { loadError = e @@ -268,13 +268,13 @@ switch (platform) { break case 's390x': localFileExisted = existsSync( - join(__dirname, 'vectordb-nodejs.linux-s390x-gnu.node') + join(__dirname, 'lancedb-nodejs.linux-s390x-gnu.node') ) try { if (localFileExisted) { - nativeBinding = require('./vectordb-nodejs.linux-s390x-gnu.node') + nativeBinding = require('./lancedb-nodejs.linux-s390x-gnu.node') } else { - nativeBinding = require('vectordb-linux-s390x-gnu') + nativeBinding = require('lancedb-linux-s390x-gnu') } } catch (e) { loadError = e diff --git a/nodejs/vectordb/query.ts b/nodejs/lancedb/query.ts similarity index 100% rename from nodejs/vectordb/query.ts rename to nodejs/lancedb/query.ts 
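The regenerated native.d.ts above is the entire binding surface of the renamed package. As a minimal sketch of driving it directly (assuming the type parameters reconstructed above; the search helper and its inputs are hypothetical, and real callers would normally go through the lancedb/*.ts wrappers shown next):

    import { tableFromIPC } from "apache-arrow";
    import { Table } from "./native";

    // Run a nearest-neighbor query and decode each Arrow IPC batch.
    async function search(tbl: Table, vector: Float32Array): Promise<void> {
      const q = tbl.query(); // builder methods mutate the query and return void
      q.nearestTo(vector);
      q.nprobes(20);
      q.limit(10);
      const stream = await q.executeStream();
      // RecordBatchIterator yields IPC-encoded batches; null marks the end.
      for (let buf = await stream.next(); buf != null; buf = await stream.next()) {
        const batch = tableFromIPC(buf);
        console.log(batch.numRows);
      }
    }
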
diff --git a/nodejs/vectordb/table.ts b/nodejs/lancedb/table.ts similarity index 70% rename from nodejs/vectordb/table.ts rename to nodejs/lancedb/table.ts index 2dd3bec1..e2ef723a 100644 --- a/nodejs/vectordb/table.ts +++ b/nodejs/lancedb/table.ts @@ -13,7 +13,7 @@ // limitations under the License. import { Schema, tableFromIPC } from "apache-arrow"; -import { Table as _NativeTable } from "./native"; +import { AddColumnsSql, ColumnAlteration, Table as _NativeTable } from "./native"; import { toBuffer, Data } from "./arrow"; import { Query } from "./query"; import { IndexBuilder } from "./indexer"; @@ -32,8 +32,8 @@ export class Table { } /** Get the schema of the table. */ - get schema(): Schema { - const schemaBuf = this.inner.schema(); + async schema(): Promise<Schema> { + const schemaBuf = await this.inner.schema(); const tbl = tableFromIPC(schemaBuf); return tbl.schema; } @@ -50,7 +50,7 @@ } /** Count the total number of rows in the dataset. */ - async countRows(filter?: string): Promise<number> { + async countRows(filter?: string): Promise<bigint> { return await this.inner.countRows(filter); } @@ -150,4 +150,42 @@ } return q; } + + // TODO: Support BatchUDF + /** + * Add new columns with defined values. + * + * @param newColumnTransforms pairs of column names and the SQL expression to use + * to calculate the value of the new column. These + * expressions will be evaluated for each row in the + * table, and can reference existing columns in the table. + */ + async addColumns(newColumnTransforms: AddColumnsSql[]): Promise<void> { + await this.inner.addColumns(newColumnTransforms); + } + + /** + * Alter the name or nullability of columns. + * + * @param columnAlterations One or more alterations to apply to columns. + */ + async alterColumns(columnAlterations: ColumnAlteration[]): Promise<void> { + await this.inner.alterColumns(columnAlterations); + } + + /** + * Drop one or more columns from the dataset + * + * This is a metadata-only operation and does not remove the data from the + * underlying storage. In order to remove the data, you must subsequently + * call ``compact_files`` to rewrite the data without the removed columns and + * then call ``cleanup_files`` to remove the old files. + * + * @param columnNames The names of the columns to drop. These can be nested + * column references (e.g. "a.b.c") or top-level column + * names (e.g. "a").
+ */ + async dropColumns(columnNames: string[]): Promise<void> { + await this.inner.dropColumns(columnNames); + } } diff --git a/nodejs/npm/darwin-arm64/README.md b/nodejs/npm/darwin-arm64/README.md index ead551dc..b324e37b 100644 --- a/nodejs/npm/darwin-arm64/README.md +++ b/nodejs/npm/darwin-arm64/README.md @@ -1,3 +1,3 @@ -# `vectordb-darwin-arm64` +# `lancedb-darwin-arm64` -This is the **aarch64-apple-darwin** binary for `vectordb` +This is the **aarch64-apple-darwin** binary for `lancedb` diff --git a/nodejs/npm/darwin-arm64/package.json b/nodejs/npm/darwin-arm64/package.json index 546eacea..afae035a 100644 --- a/nodejs/npm/darwin-arm64/package.json +++ b/nodejs/npm/darwin-arm64/package.json @@ -1,5 +1,5 @@ { - "name": "vectordb-darwin-arm64", + "name": "lancedb-darwin-arm64", "version": "0.4.3", "os": [ "darwin" @@ -7,9 +7,9 @@ "cpu": [ "arm64" ], - "main": "vectordb.darwin-arm64.node", + "main": "lancedb.darwin-arm64.node", "files": [ - "vectordb.darwin-arm64.node" + "lancedb.darwin-arm64.node" ], "license": "MIT", "engines": { diff --git a/nodejs/npm/darwin-x64/README.md b/nodejs/npm/darwin-x64/README.md index 3fb8cc8f..dee23695 100644 --- a/nodejs/npm/darwin-x64/README.md +++ b/nodejs/npm/darwin-x64/README.md @@ -1,3 +1,3 @@ -# `vectordb-darwin-x64` +# `lancedb-darwin-x64` -This is the **x86_64-apple-darwin** binary for `vectordb` +This is the **x86_64-apple-darwin** binary for `lancedb` diff --git a/nodejs/npm/darwin-x64/package.json b/nodejs/npm/darwin-x64/package.json index 7617bd05..95353cec 100644 --- a/nodejs/npm/darwin-x64/package.json +++ b/nodejs/npm/darwin-x64/package.json @@ -1,5 +1,5 @@ { - "name": "vectordb-darwin-x64", + "name": "lancedb-darwin-x64", "version": "0.4.3", "os": [ "darwin" @@ -7,9 +7,9 @@ "cpu": [ "x64" ], - "main": "vectordb.darwin-x64.node", + "main": "lancedb.darwin-x64.node", "files": [ - "vectordb.darwin-x64.node" + "lancedb.darwin-x64.node" ], "license": "MIT", "engines": { diff --git a/nodejs/npm/linux-arm64-gnu/README.md b/nodejs/npm/linux-arm64-gnu/README.md index 04c0e44d..b2fda68d 100644 --- a/nodejs/npm/linux-arm64-gnu/README.md +++ b/nodejs/npm/linux-arm64-gnu/README.md @@ -1,3 +1,3 @@ -# `vectordb-linux-arm64-gnu` +# `lancedb-linux-arm64-gnu` -This is the **aarch64-unknown-linux-gnu** binary for `vectordb` +This is the **aarch64-unknown-linux-gnu** binary for `lancedb` diff --git a/nodejs/npm/linux-arm64-gnu/package.json b/nodejs/npm/linux-arm64-gnu/package.json index 0d6f6840..ec668413 100644 --- a/nodejs/npm/linux-arm64-gnu/package.json +++ b/nodejs/npm/linux-arm64-gnu/package.json @@ -1,5 +1,5 @@ { - "name": "vectordb-linux-arm64-gnu", + "name": "lancedb-linux-arm64-gnu", "version": "0.4.3", "os": [ "linux" @@ -7,9 +7,9 @@ "cpu": [ "arm64" ], - "main": "vectordb.linux-arm64-gnu.node", + "main": "lancedb.linux-arm64-gnu.node", "files": [ - "vectordb.linux-arm64-gnu.node" + "lancedb.linux-arm64-gnu.node" ], "license": "MIT", "engines": { diff --git a/nodejs/npm/linux-x64-gnu/README.md b/nodejs/npm/linux-x64-gnu/README.md index 63d5f7be..e4a94cdd 100644 --- a/nodejs/npm/linux-x64-gnu/README.md +++ b/nodejs/npm/linux-x64-gnu/README.md @@ -1,3 +1,3 @@ -# `vectordb-linux-x64-gnu` +# `lancedb-linux-x64-gnu` -This is the **x86_64-unknown-linux-gnu** binary for `vectordb` +This is the **x86_64-unknown-linux-gnu** binary for `lancedb` diff --git a/nodejs/npm/linux-x64-gnu/package.json b/nodejs/npm/linux-x64-gnu/package.json index 0991cde7..7fcfdf1b 100644 --- a/nodejs/npm/linux-x64-gnu/package.json +++ b/nodejs/npm/linux-x64-gnu/package.json @@ -1,5
+1,5 @@ { - "name": "vectordb-linux-x64-gnu", + "name": "lancedb-linux-x64-gnu", "version": "0.4.3", "os": [ "linux" @@ -7,9 +7,9 @@ "cpu": [ "x64" ], - "main": "vectordb.linux-x64-gnu.node", + "main": "lancedb.linux-x64-gnu.node", "files": [ - "vectordb.linux-x64-gnu.node" + "lancedb.linux-x64-gnu.node" ], "license": "MIT", "engines": { diff --git a/nodejs/package-lock.json b/nodejs/package-lock.json index 0e21ad87..b0f33580 100644 --- a/nodejs/package-lock.json +++ b/nodejs/package-lock.json @@ -22,7 +22,7 @@ }, "devDependencies": { "@napi-rs/cli": "^2.18.0", - "@types/jest": "^29.5.11", + "@types/jest": "^29.1.2", "@typescript-eslint/eslint-plugin": "^6.19.0", "@typescript-eslint/parser": "^6.19.0", "eslint": "^8.56.0", @@ -949,37 +949,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/@jest/console/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@jest/console/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/@jest/console/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -1036,46 +1005,6 @@ } } }, - "node_modules/@jest/core/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/@jest/core/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@jest/core/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/@jest/core/node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -1100,18 +1029,6 @@ "node": ">=8" } }, - "node_modules/@jest/core/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": 
"sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/@jest/environment": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/@jest/environment/-/environment-29.7.0.tgz", @@ -1227,46 +1144,6 @@ } } }, - "node_modules/@jest/reporters/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/@jest/reporters/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@jest/reporters/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/@jest/reporters/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -1276,18 +1153,6 @@ "node": ">=8" } }, - "node_modules/@jest/reporters/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/@jest/schemas": { "version": "29.6.3", "resolved": "https://registry.npmjs.org/@jest/schemas/-/schemas-29.6.3.tgz", @@ -1388,37 +1253,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/@jest/transform/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@jest/transform/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/@jest/transform/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -1458,37 +1292,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - 
"node_modules/@jest/types/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@jest/types/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/@jridgewell/gen-mapping": { "version": "0.3.3", "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.3.tgz", @@ -1705,9 +1508,9 @@ } }, "node_modules/@types/jest": { - "version": "29.5.11", - "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.11.tgz", - "integrity": "sha512-S2mHmYIVe13vrm6q4kN6fLYYAka15ALQki/vgDC3mIukEOx8WJlv0kQPM+d4w8Gp6u0uSdKND04IlTXBv0rwnQ==", + "version": "29.5.12", + "resolved": "https://registry.npmjs.org/@types/jest/-/jest-29.5.12.tgz", + "integrity": "sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==", "dev": true, "dependencies": { "expect": "^29.0.0", @@ -2068,12 +1871,35 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/ansi-sequence-parser": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, "node_modules/anymatch": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", @@ -2087,18 +1913,6 @@ "node": ">= 8" } }, - "node_modules/anymatch/node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", - "dev": true, - "engines": { - "node": ">=8.6" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/apache-arrow": { "version": "15.0.0", "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-15.0.0.tgz", @@ -2165,37 +1979,6 @@ "@babel/core": "^7.8.0" } }, - "node_modules/babel-jest/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": 
"https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/babel-jest/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/babel-jest/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -2416,35 +2199,7 @@ } ] }, - "node_modules/chalk-template": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/chalk-template/-/chalk-template-0.4.0.tgz", - "integrity": "sha512-/ghrgmhfY8RaSdeo43hNXxpoHAtxdbskUHjPpfqUWGttFgycUhYPGx3YZBCnUCvOa7Doivn1IZec3DEGFoMgLg==", - "dependencies": { - "chalk": "^4.1.2" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/chalk-template?sponsor=1" - } - }, - "node_modules/chalk-template/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/chalk-template/node_modules/chalk": { + "node_modules/chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", @@ -2459,6 +2214,20 @@ "url": "https://github.com/chalk/chalk?sponsor=1" } }, + "node_modules/chalk-template": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/chalk-template/-/chalk-template-0.4.0.tgz", + "integrity": "sha512-/ghrgmhfY8RaSdeo43hNXxpoHAtxdbskUHjPpfqUWGttFgycUhYPGx3YZBCnUCvOa7Doivn1IZec3DEGFoMgLg==", + "dependencies": { + "chalk": "^4.1.2" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/chalk-template?sponsor=1" + } + }, "node_modules/char-regex": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/char-regex/-/char-regex-1.0.2.tgz", @@ -2488,56 +2257,6 @@ "node": ">=12" } }, - "node_modules/cliui/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/cliui/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "dev": true - }, - "node_modules/cliui/node_modules/is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": 
"https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/cliui/node_modules/string-width": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "dev": true, - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/cliui/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -2647,37 +2366,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/create-jest/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/create-jest/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/cross-spawn": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", @@ -2801,6 +2489,12 @@ "integrity": "sha512-M4+u22ZJGpk4RY7tne6W+APkZhnnhmAH48FNl8iEFK2lEgob+U5rUQsIqQhvAwCXYpfd3H20pHK/ENsCvwTbsA==", "dev": true }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true + }, "node_modules/error-ex": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", @@ -2819,6 +2513,18 @@ "node": ">=6" } }, + "node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/eslint": { "version": "8.56.0", "resolved": "https://registry.npmjs.org/eslint/-/eslint-8.56.0.tgz", @@ -2902,64 +2608,12 @@ "url": "https://opencollective.com/eslint" } }, - "node_modules/eslint/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": 
"https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/eslint/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, "node_modules/eslint/node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", "dev": true }, - "node_modules/eslint/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, - "node_modules/eslint/node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/eslint/node_modules/glob-parent": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", @@ -2984,18 +2638,6 @@ "js-yaml": "bin/js-yaml.js" } }, - "node_modules/eslint/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/espree": { "version": "9.6.1", "resolved": "https://registry.npmjs.org/espree/-/espree-9.6.1.tgz", @@ -3512,6 +3154,15 @@ "node": ">=0.10.0" } }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/is-generator-fn": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/is-generator-fn/-/is-generator-fn-2.1.0.tgz", @@ -3798,37 +3449,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-circus/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": 
"https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-circus/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-circus/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -3871,37 +3491,6 @@ } } }, - "node_modules/jest-cli/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-cli/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-config": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-config/-/jest-config-29.7.0.tgz", @@ -3947,37 +3536,6 @@ } } }, - "node_modules/jest-config/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-config/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-config/node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -4017,37 +3575,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-diff/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-diff/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": 
"sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-docblock": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-docblock/-/jest-docblock-29.7.0.tgz", @@ -4076,37 +3603,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-each/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-each/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-environment-node": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-29.7.0.tgz", @@ -4186,37 +3682,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-matcher-utils/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-matcher-utils/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-message-util": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-message-util/-/jest-message-util-29.7.0.tgz", @@ -4237,37 +3702,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-message-util/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-message-util/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - 
"dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-message-util/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -4350,37 +3784,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-resolve/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-resolve/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-resolve/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -4422,37 +3825,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-runner/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-runner/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-runner/node_modules/emittery": { "version": "0.13.1", "resolved": "https://registry.npmjs.org/emittery/-/emittery-0.13.1.tgz", @@ -4498,37 +3870,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-runtime/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-runtime/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { 
- "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-runtime/node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", @@ -4569,37 +3910,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-snapshot/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-snapshot/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-util": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-util/-/jest-util-29.7.0.tgz", @@ -4617,37 +3927,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-util/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-util/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-util/node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -4663,18 +3942,6 @@ "node": ">=8" } }, - "node_modules/jest-util/node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", - "dev": true, - "engines": { - "node": ">=8.6" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/jest-validate": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-validate/-/jest-validate-29.7.0.tgz", @@ -4692,21 +3959,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-validate/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, 
"node_modules/jest-validate/node_modules/camelcase": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", @@ -4719,22 +3971,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/jest-validate/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-watcher": { "version": "29.7.0", "resolved": "https://registry.npmjs.org/jest-watcher/-/jest-watcher-29.7.0.tgz", @@ -4754,37 +3990,6 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, - "node_modules/jest-watcher/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/jest-watcher/node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/jest-watcher/node_modules/emittery": { "version": "0.13.1", "resolved": "https://registry.npmjs.org/emittery/-/emittery-0.13.1.tgz", @@ -5064,18 +4269,6 @@ "node": ">=8.6" } }, - "node_modules/micromatch/node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", - "dev": true, - "engines": { - "node": ">=8.6" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/minimatch": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", @@ -5273,6 +4466,18 @@ "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==", "dev": true }, + "node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "dev": true, + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, "node_modules/pirates": { "version": "4.0.6", "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz", @@ -5676,16 +4881,21 @@ "node": ">=10" } }, - "node_modules/string-length/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "node_modules/string-width": { + "version": "4.2.3", + "resolved": 
"https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, "engines": { "node": ">=8" } }, - "node_modules/string-length/node_modules/strip-ansi": { + "node_modules/strip-ansi": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", @@ -6121,71 +5331,6 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, - "node_modules/wrap-ansi/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/wrap-ansi/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "dev": true - }, - "node_modules/wrap-ansi/node_modules/is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi/node_modules/string-width": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "dev": true, - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", @@ -6234,56 +5379,6 @@ "node": ">=12" } }, - "node_modules/yargs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/yargs/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": 
"https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "dev": true - }, - "node_modules/yargs/node_modules/is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/yargs/node_modules/string-width": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "dev": true, - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/yargs/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/yocto-queue": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", diff --git a/nodejs/package.json b/nodejs/package.json index b49a5cb0..39473320 100644 --- a/nodejs/package.json +++ b/nodejs/package.json @@ -1,10 +1,10 @@ { - "name": "vectordb", + "name": "lancedb", "version": "0.4.3", "main": "./dist/index.js", "types": "./dist/index.d.ts", "napi": { - "name": "vectordb-nodejs", + "name": "lancedb-nodejs", "triples": { "defaults": false, "additional": [ @@ -18,7 +18,7 @@ "license": "Apache 2.0", "devDependencies": { "@napi-rs/cli": "^2.18.0", - "@types/jest": "^29.5.11", + "@types/jest": "^29.1.2", "@typescript-eslint/eslint-plugin": "^6.19.0", "@typescript-eslint/parser": "^6.19.0", "eslint": "^8.56.0", @@ -45,23 +45,23 @@ ], "scripts": { "artifacts": "napi artifacts", - "build:native": "napi build --platform --release --js vectordb/native.js --dts vectordb/native.d.ts dist/", - "build:debug": "napi build --platform --dts ../vectordb/native.d.ts --js ../vectordb/native.js dist/", + "build:native": "napi build --platform --release --js lancedb/native.js --dts lancedb/native.d.ts dist/", + "build:debug": "napi build --platform --dts ../lancedb/native.d.ts --js ../lancedb/native.js dist/", "build": "npm run build:debug && tsc -b", - "docs": "typedoc --plugin typedoc-plugin-markdown vectordb/index.ts", - "lint": "eslint vectordb --ext .js,.ts", + "docs": "typedoc --plugin typedoc-plugin-markdown lancedb/index.ts", + "lint": "eslint lancedb --ext .js,.ts", "prepublishOnly": "napi prepublish -t npm", - "test": "npm run build && jest", + "test": "npm run build && jest --verbose", "universal": "napi universal", "version": "napi version" }, "optionalDependencies": { - "vectordb-darwin-arm64": "0.4.3", - "vectordb-darwin-x64": "0.4.3", - "vectordb-linux-arm64-gnu": "0.4.3", - "vectordb-linux-x64-gnu": "0.4.3" + "lancedb-darwin-arm64": "0.4.3", + "lancedb-darwin-x64": "0.4.3", + "lancedb-linux-arm64-gnu": "0.4.3", + "lancedb-linux-x64-gnu": "0.4.3" }, - "dependencies": { + "peerDependencies": { "apache-arrow": "^15.0.0" } } diff --git a/nodejs/src/connection.rs b/nodejs/src/connection.rs index 
7f4a82d9..9bef5eec 100644 --- a/nodejs/src/connection.rs +++ b/nodejs/src/connection.rs @@ -12,29 +12,51 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use napi::bindgen_prelude::*; use napi_derive::*; use crate::table::Table; -use vectordb::connection::{Connection as LanceDBConnection, Database}; -use vectordb::ipc::ipc_file_to_batches; +use crate::ConnectionOptions; +use lancedb::connection::{ConnectBuilder, Connection as LanceDBConnection, CreateTableMode}; +use lancedb::ipc::ipc_file_to_batches; #[napi] pub struct Connection { - conn: Arc, + conn: LanceDBConnection, +} + +impl Connection { + fn parse_create_mode_str(mode: &str) -> napi::Result { + match mode { + "create" => Ok(CreateTableMode::Create), + "overwrite" => Ok(CreateTableMode::Overwrite), + "exist_ok" => Ok(CreateTableMode::exist_ok(|builder| builder)), + _ => Err(napi::Error::from_reason(format!("Invalid mode {}", mode))), + } + } } #[napi] impl Connection { /// Create a new Connection instance from the given URI. #[napi(factory)] - pub async fn new(uri: String) -> napi::Result { + pub async fn new(options: ConnectionOptions) -> napi::Result { + let mut builder = ConnectBuilder::new(&options.uri); + if let Some(api_key) = options.api_key { + builder = builder.api_key(&api_key); + } + if let Some(host_override) = options.host_override { + builder = builder.host_override(&host_override); + } + if let Some(interval) = options.read_consistency_interval { + builder = + builder.read_consistency_interval(std::time::Duration::from_secs_f64(interval)); + } Ok(Self { - conn: Arc::new(Database::connect(&uri).await.map_err(|e| { - napi::Error::from_reason(format!("Failed to connect to database: {}", e)) - })?), + conn: builder + .execute() + .await + .map_err(|e| napi::Error::from_reason(format!("{}", e)))?, }) } @@ -54,12 +76,20 @@ impl Connection { /// - buf: The buffer containing the IPC file. /// #[napi] - pub async fn create_table(&self, name: String, buf: Buffer) -> napi::Result
{ + pub async fn create_table( + &self, + name: String, + buf: Buffer, + mode: String, + ) -> napi::Result<Table>
{ let batches = ipc_file_to_batches(buf.to_vec()) .map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?; + let mode = Self::parse_create_mode_str(&mode)?; let tbl = self .conn - .create_table(&name, Box::new(batches), None) + .create_table(&name, Box::new(batches)) + .mode(mode) + .execute() .await .map_err(|e| napi::Error::from_reason(format!("{}", e)))?; Ok(Table::new(tbl)) @@ -70,6 +100,7 @@ impl Connection { let tbl = self .conn .open_table(&name) + .execute() .await .map_err(|e| napi::Error::from_reason(format!("{}", e)))?; Ok(Table::new(tbl)) diff --git a/nodejs/src/index.rs b/nodejs/src/index.rs index c8b06257..91d3a7d6 100644 --- a/nodejs/src/index.rs +++ b/nodejs/src/index.rs @@ -40,12 +40,12 @@ impl From for LanceMetricType { #[napi] pub struct IndexBuilder { - inner: vectordb::index::IndexBuilder, + inner: lancedb::index::IndexBuilder, } #[napi] impl IndexBuilder { - pub fn new(tbl: &dyn vectordb::Table) -> Self { + pub fn new(tbl: &dyn lancedb::Table) -> Self { let inner = tbl.create_index(&[]); Self { inner } } diff --git a/nodejs/src/iterator.rs b/nodejs/src/iterator.rs index 50b3b110..55ee0dca 100644 --- a/nodejs/src/iterator.rs +++ b/nodejs/src/iterator.rs @@ -14,9 +14,9 @@ use futures::StreamExt; use lance::io::RecordBatchStream; +use lancedb::ipc::batches_to_ipc_file; use napi::bindgen_prelude::*; use napi_derive::napi; -use vectordb::ipc::batches_to_ipc_file; /** Typescript-style Async Iterator over RecordBatches */ #[napi] diff --git a/nodejs/src/lib.rs b/nodejs/src/lib.rs index 463ec4ce..8913e1d5 100644 --- a/nodejs/src/lib.rs +++ b/nodejs/src/lib.rs @@ -22,10 +22,21 @@ mod query; mod table; #[napi(object)] +#[derive(Debug)] pub struct ConnectionOptions { pub uri: String, pub api_key: Option, pub host_override: Option, + /// (For LanceDB OSS only): The interval, in seconds, at which to check for + /// updates to the table from other processes. If None, then consistency is not + /// checked. For performance reasons, this is the default. For strong + /// consistency, set this to zero seconds. Then every read will check for + /// updates from other processes. As a compromise, you can set this to a + /// non-zero value for eventual consistency. If more than that interval + /// has passed since the last check, then the table will be checked for updates. + /// Note: this consistency only applies to read operations. Write operations are + /// always consistent. + pub read_consistency_interval: Option, } /// Write mode for writing a table. @@ -44,5 +55,5 @@ pub struct WriteOptions { #[napi] pub async fn connect(options: ConnectionOptions) -> napi::Result { - Connection::new(options.uri.clone()).await + Connection::new(options).await } diff --git a/nodejs/src/query.rs b/nodejs/src/query.rs index 5bea8714..891a6454 100644 --- a/nodejs/src/query.rs +++ b/nodejs/src/query.rs @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use lancedb::query::Query as LanceDBQuery; use napi::bindgen_prelude::*; use napi_derive::napi; -use vectordb::query::Query as LanceDBQuery; use crate::{iterator::RecordBatchIterator, table::Table}; diff --git a/nodejs/src/table.rs b/nodejs/src/table.rs index fdb16b26..6d46e466 100644 --- a/nodejs/src/table.rs +++ b/nodejs/src/table.rs @@ -13,9 +13,13 @@ // limitations under the License. 
use arrow_ipc::writer::FileWriter; +use lance::dataset::ColumnAlteration as LanceColumnAlteration; +use lancedb::{ + ipc::ipc_file_to_batches, + table::{AddDataOptions, TableRef}, +}; use napi::bindgen_prelude::*; use napi_derive::napi; -use vectordb::{ipc::ipc_file_to_batches, table::TableRef}; use crate::index::IndexBuilder; use crate::query::Query; @@ -33,8 +37,12 @@ impl Table { /// Return Schema as empty Arrow IPC file. #[napi] - pub fn schema(&self) -> napi::Result { - let mut writer = FileWriter::try_new(vec![], &self.table.schema()) + pub async fn schema(&self) -> napi::Result { + let schema = + self.table.schema().await.map_err(|e| { + napi::Error::from_reason(format!("Failed to create IPC file: {}", e)) + })?; + let mut writer = FileWriter::try_new(vec![], &schema) .map_err(|e| napi::Error::from_reason(format!("Failed to create IPC file: {}", e)))?; writer .finish() @@ -48,22 +56,29 @@ impl Table { pub async fn add(&self, buf: Buffer) -> napi::Result<()> { let batches = ipc_file_to_batches(buf.to_vec()) .map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?; - self.table.add(Box::new(batches), None).await.map_err(|e| { - napi::Error::from_reason(format!( - "Failed to add batches to table {}: {}", - self.table, e - )) - }) + self.table + .add(Box::new(batches), AddDataOptions::default()) + .await + .map_err(|e| { + napi::Error::from_reason(format!( + "Failed to add batches to table {}: {}", + self.table, e + )) + }) } #[napi] - pub async fn count_rows(&self, filter: Option) -> napi::Result { - self.table.count_rows(filter).await.map_err(|e| { - napi::Error::from_reason(format!( - "Failed to count rows in table {}: {}", - self.table, e - )) - }) + pub async fn count_rows(&self, filter: Option) -> napi::Result { + self.table + .count_rows(filter) + .await + .map(|val| val as i64) + .map_err(|e| { + napi::Error::from_reason(format!( + "Failed to count rows in table {}: {}", + self.table, e + )) + }) } #[napi] @@ -85,4 +100,106 @@ impl Table { pub fn query(&self) -> Query { Query::new(self) } + + #[napi] + pub async fn add_columns(&self, transforms: Vec) -> napi::Result<()> { + let transforms = transforms + .into_iter() + .map(|sql| (sql.name, sql.value_sql)) + .collect::>(); + let transforms = lance::dataset::NewColumnTransform::SqlExpressions(transforms); + self.table + .add_columns(transforms, None) + .await + .map_err(|err| { + napi::Error::from_reason(format!( + "Failed to add columns to table {}: {}", + self.table, err + )) + })?; + Ok(()) + } + + #[napi] + pub async fn alter_columns(&self, alterations: Vec) -> napi::Result<()> { + for alteration in &alterations { + if alteration.rename.is_none() && alteration.nullable.is_none() { + return Err(napi::Error::from_reason( + "Alteration must have a 'rename' or 'nullable' field.", + )); + } + } + let alterations = alterations + .into_iter() + .map(LanceColumnAlteration::from) + .collect::>(); + + self.table + .alter_columns(&alterations) + .await + .map_err(|err| { + napi::Error::from_reason(format!( + "Failed to alter columns in table {}: {}", + self.table, err + )) + })?; + Ok(()) + } + + #[napi] + pub async fn drop_columns(&self, columns: Vec) -> napi::Result<()> { + let col_refs = columns.iter().map(String::as_str).collect::>(); + self.table.drop_columns(&col_refs).await.map_err(|err| { + napi::Error::from_reason(format!( + "Failed to drop columns from table {}: {}", + self.table, err + )) + })?; + Ok(()) + } +} + +/// A definition of a column alteration. 
The alteration changes the column at +/// `path` to have the new name `name`, to be nullable if `nullable` is true, +/// and to have the data type `data_type`. At least one of `rename` or `nullable` +/// must be provided. +#[napi(object)] +pub struct ColumnAlteration { + /// The path to the column to alter. This is a dot-separated path to the column. + /// If it is a top-level column then it is just the name of the column. If it is + /// a nested column then it is the path to the column, e.g. "a.b.c" for a column + /// `c` nested inside a column `b` nested inside a column `a`. + pub path: String, + /// The new name of the column. If not provided then the name will not be changed. + /// This must be distinct from the names of all other columns in the table. + pub rename: Option, + /// Set the new nullability. Note that a nullable column cannot be made non-nullable. + pub nullable: Option, +} + +impl From for LanceColumnAlteration { + fn from(js: ColumnAlteration) -> Self { + let ColumnAlteration { + path, + rename, + nullable, + } = js; + Self { + path, + rename, + nullable, + // TODO: wire up this field + data_type: None, + } + } +} + +/// A definition of a new column to add to a table. +#[napi(object)] +pub struct AddColumnsSql { + /// The name of the new column. + pub name: String, + /// The values to populate the new column with, as a SQL expression. + /// The expression can reference other columns in the table. + pub value_sql: String, } diff --git a/nodejs/tsconfig.json b/nodejs/tsconfig.json index 08943b38..ba4e4a37 100644 --- a/nodejs/tsconfig.json +++ b/nodejs/tsconfig.json @@ -1,8 +1,8 @@ { "include": [ - "vectordb/*.ts", - "vectordb/**/*.ts", - "vectordb/*.js", + "lancedb/*.ts", + "lancedb/**/*.ts", + "lancedb/*.js", ], "compilerOptions": { "target": "es2022", @@ -18,7 +18,7 @@ ], "typedocOptions": { "entryPoints": [ - "vectordb/index.ts" + "lancedb/index.ts" ], "out": "../docs/src/javascript/", "visibilityFilters": { diff --git a/nodejs/vectordb/native.d.ts b/nodejs/vectordb/native.d.ts deleted file mode 100644 index 1fc8c9b2..00000000 --- a/nodejs/vectordb/native.d.ts +++ /dev/null @@ -1,80 +0,0 @@ -/* tslint:disable */ -/* eslint-disable */ - -/* auto-generated by NAPI-RS */ - -export const enum IndexType { - Scalar = 0, - IvfPq = 1 -} -export const enum MetricType { - L2 = 0, - Cosine = 1, - Dot = 2 -} -export interface ConnectionOptions { - uri: string - apiKey?: string - hostOverride?: string -} -/** Write mode for writing a table. */ -export const enum WriteMode { - Create = 'Create', - Append = 'Append', - Overwrite = 'Overwrite' -} -/** Write options when creating a Table. */ -export interface WriteOptions { - mode?: WriteMode -} -export function connect(options: ConnectionOptions): Promise -export class Connection { - /** Create a new Connection instance from the given URI. */ - static new(uri: string): Promise - /** List all tables in the dataset. */ - tableNames(): Promise> - /** - * Create table from a Apache Arrow IPC (file) buffer. - * - * Parameters: - * - name: The name of the table. - * - buf: The buffer containing the IPC file. - * - */ - createTable(name: string, buf: Buffer): Promise
- openTable(name: string): Promise<Table>
- /** Drop table with the name. Or raise an error if the table does not exist. */ - dropTable(name: string): Promise<void> -} -export class IndexBuilder { - replace(v: boolean): void - column(c: string): void - name(name: string): void - ivfPq(metricType?: MetricType | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): void - scalar(): void - build(): Promise<void> -} -/** Typescript-style Async Iterator over RecordBatches */ -export class RecordBatchIterator { - next(): Promise<Buffer | null> -} -export class Query { - column(column: string): void - filter(filter: string): void - select(columns: Array<string>): void - limit(limit: number): void - prefilter(prefilter: boolean): void - nearestTo(vector: Float32Array): void - refineFactor(refineFactor: number): void - nprobes(nprobe: number): void - executeStream(): Promise<RecordBatchIterator> -} -export class Table { - /** Return Schema as empty Arrow IPC file. */ - schema(): Buffer - add(buf: Buffer): Promise<void> - countRows(filter?: string | undefined | null): Promise<number> - delete(predicate: string): Promise<void> - createIndex(): IndexBuilder - query(): Query -} diff --git a/python/.bumpversion.cfg b/python/.bumpversion.cfg index 9b5bee15..7244d485 100644 --- a/python/.bumpversion.cfg +++ b/python/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.5 +current_version = 0.6.1 commit = True message = [python] Bump version: {current_version} → {new_version} tag = True diff --git a/python/ASYNC_MIGRATION.md b/python/ASYNC_MIGRATION.md new file mode 100644 index 00000000..6a9231c4 --- /dev/null +++ b/python/ASYNC_MIGRATION.md @@ -0,0 +1,24 @@ +# Migration from Sync to Async API + +A new asynchronous API has been added to LanceDB. This API is built +on top of the Rust `lancedb` crate (instead of on top of pylance), +which will help keep the various language bindings in sync. There are +some slight differences between the synchronous and asynchronous APIs, +mostly in the Connection and Table classes. This document will help +you migrate. + +## Almost all functions are async + +The most important change is that almost all functions are now async. +These functions return `asyncio` coroutines, so you will need to +`await` them. + +## Connection + +No changes yet. + +## Table + +* Previously `Table.schema` was a property. Now it is an async method. +* The method `Table.__len__` was removed, so `len(table)` no longer + works. Use `Table.count_rows` instead.
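To make the migration concrete, here is a minimal side-by-side sketch of the same workflow in both APIs. This is illustrative only: it assumes the `connect_async` entry point and the async `Table.schema()`/`Table.count_rows()` methods added in this change, and note that some async wrappers (e.g. `open_table`) are still `NotImplementedError` stubs at this point in the PR, so it shows the intended shape rather than guaranteed current behavior.

```python
import asyncio
import lancedb

async def main():
    # Before (sync API):
    #   db = lancedb.connect("./.lancedb")
    #   tbl = db.open_table("my_table")
    #   schema = tbl.schema        # a property
    #   n = len(tbl)               # Table.__len__
    #
    # After (async API):
    db = await lancedb.connect_async("./.lancedb")
    tbl = await db.open_table("my_table")   # still a stub in this change
    schema = await tbl.schema()             # schema is now an async method
    n = await tbl.count_rows()              # count_rows replaces len(table)
    print(schema, n)

asyncio.run(main())
```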
diff --git a/python/Cargo.toml b/python/Cargo.toml new file mode 100644 index 00000000..9d1101ea --- /dev/null +++ b/python/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "lancedb-python" +version = "0.4.10" +edition.workspace = true +description = "Python bindings for LanceDB" +license.workspace = true +repository.workspace = true +keywords.workspace = true +categories.workspace = true + + +[lib] +name = "_lancedb" +crate-type = ["cdylib"] + +[dependencies] +arrow = { version = "50.0.0", features = ["pyarrow"] } +lancedb = { path = "../rust/lancedb" } +env_logger = "0.10" +pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] } +pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] } + +# Prevent dynamic linking of lzma, which comes from datafusion +lzma-sys = { version = "*", features = ["static"] } + +[build-dependencies] +pyo3-build-config = { version = "0.20.3", features = [ + "extension-module", + "abi3-py38", +] } diff --git a/python/README.md b/python/README.md index 94e27d6a..cbcec70a 100644 --- a/python/README.md +++ b/python/README.md @@ -20,10 +20,10 @@ results = table.search([0.1, 0.3]).limit(20).to_list() print(results) ``` - ## Development -Create a virtual environment and activate it: +LanceDB is based on the Rust crate `lancedb` and is built with maturin. To build with maturin, +you will need either a conda environment or a virtual environment (venv). ```bash python -m venv venv @@ -33,7 +33,15 @@ python -m venv venv Install the necessary packages: ```bash -python -m pip install . +python -m pip install .[tests,dev] +``` + +To build the Python package, use maturin: + +```bash +# This will build the Rust bindings and place them in the appropriate place +# in your venv or conda environment +maturin develop ``` To run the unit tests: @@ -45,7 +53,7 @@ pytest To run the doc tests: ```bash -pytest --doctest-modules lancedb +pytest --doctest-modules python/lancedb ``` To run linter and automatically fix all errors: @@ -61,31 +69,27 @@ If any packages are missing, install them with: pip install <package-name> ``` - ___ For **Windows** users, there may be errors when installing packages, so these commands may be helpful: Activate the virtual environment: + ```bash . .\venv\Scripts\activate ``` You may need to run the installs separately: + ```bash pip install -e .[tests] pip install -e .[dev] ``` - `tantivy` requires `rust` to be installed, so install it with `conda`, as it doesn't support Windows installation: + ```bash pip install wheel pip install cargo conda install rust pip install tantivy ``` - -To run the unit tests: -```bash -pytest -``` diff --git a/python/build.rs b/python/build.rs new file mode 100644 index 00000000..dace4a9b --- /dev/null +++ b/python/build.rs @@ -0,0 +1,3 @@ +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} diff --git a/python/lancedb/common.py b/python/lancedb/common.py deleted file mode 100644 index 54c7c9e0..00000000 --- a/python/lancedb/common.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2023 LanceDB Developers -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -from pathlib import Path -from typing import Iterable, List, Union - -import numpy as np -import pyarrow as pa - -from .util import safe_import_pandas - -pd = safe_import_pandas() - -DATA = Union[List[dict], dict, "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]] -VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray] -URI = Union[str, Path] -VECTOR_COLUMN_NAME = "vector" - - -class Credential(str): - """Credential field""" - - def __repr__(self) -> str: - return "********" - - def __str__(self) -> str: - return "********" diff --git a/python/pyproject.toml b/python/pyproject.toml index 8de443dd..cd848e02 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,9 +1,9 @@ [project] name = "lancedb" -version = "0.5.5" +version = "0.6.1" dependencies = [ "deprecation", - "pylance==0.9.16", + "pylance==0.10.1", "ratelimiter~=1.0", "retry>=0.9.2", "tqdm>=4.27.0", @@ -14,7 +14,7 @@ dependencies = [ "pyyaml>=6.0", "click>=8.1.7", "requests>=2.31.0", - "overrides>=0.7" + "overrides>=0.7", ] description = "lancedb" authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }] @@ -26,7 +26,7 @@ keywords = [ "data-science", "machine-learning", "arrow", - "data-analytics" + "data-analytics", ] classifiers = [ "Development Status :: 3 - Alpha", @@ -48,21 +48,53 @@ classifiers = [ repository = "https://github.com/lancedb/lancedb" [project.optional-dependencies] -tests = ["aiohttp", "pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "duckdb", "pytz", "polars>=0.19"] +tests = [ + "aiohttp", + "pandas>=1.4", + "pytest", + "pytest-mock", + "pytest-asyncio", + "duckdb", + "pytz", + "polars>=0.19", +] dev = ["ruff", "pre-commit"] -docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] +docs = [ + "mkdocs", + "mkdocs-jupyter", + "mkdocs-material", + "mkdocstrings[python]", + "mkdocs-ultralytics-plugin==0.0.44", +] clip = ["torch", "pillow", "open-clip"] -embeddings = ["openai>=1.6.1", "sentence-transformers", "torch", "pillow", "open-clip-torch", "cohere", "huggingface_hub", - "InstructorEmbedding", "google.generativeai", "boto3>=1.28.57", "awscli>=1.29.57", "botocore>=1.31.57"] +embeddings = [ + "openai>=1.6.1", + "sentence-transformers", + "torch", + "pillow", + "open-clip-torch", + "cohere", + "huggingface_hub", + "InstructorEmbedding", + "google.generativeai", + "boto3>=1.28.57", + "awscli>=1.29.57", + "botocore>=1.31.57", +] + +[tool.maturin] +python-source = "python" +module-name = "lancedb._lancedb" [project.scripts] lancedb = "lancedb.cli.cli:cli" [build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta" +requires = ["maturin>=1.4"] +build-backend = "maturin" -[tool.ruff] + +[tool.ruff.lint] select = ["F", "E", "W", "I", "G", "TCH", "PERF"] [tool.pytest.ini_options] @@ -70,5 +102,5 @@ addopts = "--strict-markers --ignore-glob=lancedb/embeddings/*.py" markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", - "asyncio" + "asyncio", ] diff --git a/python/lancedb/__init__.py b/python/python/lancedb/__init__.py similarity index 58% rename from python/lancedb/__init__.py rename to python/python/lancedb/__init__.py index 7b58a432..34a19c80 100644 --- a/python/lancedb/__init__.py +++ b/python/python/lancedb/__init__.py @@ -19,8 +19,9 @@ from typing import Optional, Union __version__ = importlib.metadata.version("lancedb") -from .common import URI -from .db import DBConnection, LanceDBConnection +from 
._lancedb import connect as lancedb_connect +from .common import URI, sanitize_uri +from .db import AsyncConnection, AsyncLanceDBConnection, DBConnection, LanceDBConnection from .remote.db import RemoteDBConnection from .schema import vector # noqa: F401 from .utils import sentry_log # noqa: F401 @@ -101,3 +102,74 @@ def connect( uri, api_key, region, host_override, request_thread_pool=request_thread_pool ) return LanceDBConnection(uri, read_consistency_interval=read_consistency_interval) + + +async def connect_async( + uri: URI, + *, + api_key: Optional[str] = None, + region: str = "us-east-1", + host_override: Optional[str] = None, + read_consistency_interval: Optional[timedelta] = None, + request_thread_pool: Optional[Union[int, ThreadPoolExecutor]] = None, +) -> AsyncConnection: + """Connect to a LanceDB database. + + Parameters + ---------- + uri: str or Path + The URI of the database. + api_key: str, optional + If present, connect to LanceDB Cloud. + Otherwise, connect to a database on file system or cloud storage. + Can be set via environment variable `LANCEDB_API_KEY`. + region: str, default "us-east-1" + The region to use for LanceDB Cloud. + host_override: str, optional + The override URL for LanceDB Cloud. + read_consistency_interval: timedelta, default None + (For LanceDB OSS only) + The interval at which to check for updates to the table from other + processes. If None, then consistency is not checked. For performance + reasons, this is the default. For strong consistency, set this to + zero seconds. Then every read will check for updates from other + processes. As a compromise, you can set this to a non-zero timedelta + for eventual consistency. If more than that interval has passed since + the last check, then the table will be checked for updates. Note: this + consistency only applies to read operations. Write operations are + always consistent. + request_thread_pool: int or ThreadPoolExecutor, optional + The thread pool to use for making batch requests to the LanceDB Cloud API. + If an integer, then a ThreadPoolExecutor will be created with that + number of threads. If None, then a ThreadPoolExecutor will be created + with the default number of threads. If a ThreadPoolExecutor, then that + executor will be used for making requests. This is for LanceDB Cloud + only and is only used when making batch requests (i.e., passing in + multiple queries to the search method at once). + + Examples + -------- + + For a local directory, provide a path for the database: + + >>> import lancedb + >>> db = lancedb.connect("~/.lancedb") + + For object storage, use a URI prefix: + + >>> db = lancedb.connect("s3://my-bucket/lancedb") + + Connect to LanceDB Cloud: + + >>> db = lancedb.connect("db://my_database", api_key="ldb_...") + + Returns + ------- + conn : AsyncConnection + A connection to a LanceDB database. + """ + if read_consistency_interval is not None: + read_consistency_interval_secs = read_consistency_interval.total_seconds() + else: + read_consistency_interval_secs = None + return AsyncLanceDBConnection( + await lancedb_connect( + sanitize_uri(uri), api_key, region, host_override, read_consistency_interval_secs + ) + ) diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi new file mode 100644 index 00000000..d1351084 --- /dev/null +++ b/python/python/lancedb/_lancedb.pyi @@ -0,0 +1,24 @@ +from typing import Optional + +import pyarrow as pa + +class Connection(object): + async def table_names(self) -> list[str]: ... + async def create_table( + self, name: str, mode: str, data: pa.RecordBatchReader + ) -> Table: ... + async def create_empty_table( + self, name: str, mode: str, schema: pa.Schema + ) -> Table: ...
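The stubs above describe the surface of the native `_lancedb` module that the async wrapper classes drive. As a rough, hypothetical illustration of that surface (in practice you would go through `connect_async` rather than calling the binding directly; the positional argument order and the `"create"`/`"overwrite"`/`"exist_ok"` mode strings follow the stubs and the Rust code in this PR):

```python
import asyncio
import pyarrow as pa
from lancedb._lancedb import connect

async def main():
    # Positional arguments per the stub:
    # uri, api_key, region, host_override, read_consistency_interval
    conn = await connect("/tmp/demo-db", None, None, None, None)
    schema = pa.schema([pa.field("id", pa.int64())])
    reader = pa.RecordBatchReader.from_batches(
        schema,
        [pa.record_batch([pa.array([1, 2, 3])], schema=schema)],
    )
    tbl = await conn.create_table("demo", "create", reader)
    print(tbl.name(), await conn.table_names())

asyncio.run(main())
```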
+ +class Table(object): + def name(self) -> str: ... + async def schema(self) -> pa.Schema: ... + +async def connect( + uri: str, + api_key: Optional[str], + region: Optional[str], + host_override: Optional[str], + read_consistency_interval: Optional[float], +) -> Connection: ... diff --git a/python/lancedb/cli/__init__.py b/python/python/lancedb/cli/__init__.py similarity index 100% rename from python/lancedb/cli/__init__.py rename to python/python/lancedb/cli/__init__.py diff --git a/python/lancedb/cli/cli.py b/python/python/lancedb/cli/cli.py similarity index 100% rename from python/lancedb/cli/cli.py rename to python/python/lancedb/cli/cli.py diff --git a/python/python/lancedb/common.py b/python/python/lancedb/common.py new file mode 100644 index 00000000..cc894a72 --- /dev/null +++ b/python/python/lancedb/common.py @@ -0,0 +1,136 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path +from typing import Iterable, List, Optional, Union + +import numpy as np +import pyarrow as pa + +from .util import safe_import_pandas + +pd = safe_import_pandas() + +DATA = Union[List[dict], dict, "pd.DataFrame", pa.Table, Iterable[pa.RecordBatch]] +VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray] +URI = Union[str, Path] +VECTOR_COLUMN_NAME = "vector" + + +class Credential(str): + """Credential field""" + + def __repr__(self) -> str: + return "********" + + def __str__(self) -> str: + return "********" + + +def sanitize_uri(uri: URI) -> str: + return str(uri) + + +def _casting_recordbatch_iter( + input_iter: Iterable[pa.RecordBatch], schema: pa.Schema +) -> Iterable[pa.RecordBatch]: + """ + Wrapper around an iterator of record batches. If the batches don't match the + schema, try to cast them to the schema. If that fails, raise an error. + + This is helpful for users who might have written the iterator with default + data types in PyArrow, but specified more specific types in the schema. For + example, PyArrow defaults to float64 for floating point types, but Lance + uses float32 for vectors. + """ + for batch in input_iter: + if not isinstance(batch, pa.RecordBatch): + raise TypeError(f"Expected RecordBatch, got {type(batch)}") + if batch.schema != schema: + try: + # RecordBatch doesn't have a cast method, but table does. 
+ batch = pa.Table.from_batches([batch]).cast(schema).to_batches()[0] + except pa.lib.ArrowInvalid: + raise ValueError( + f"Input RecordBatch iterator yielded a batch with schema that " + f"does not match the expected schema.\nExpected:\n{schema}\n" + f"Got:\n{batch.schema}" + ) + yield batch + + +def data_to_reader( + data: DATA, schema: Optional[pa.Schema] = None +) -> pa.RecordBatchReader: + """Convert various types of input into a RecordBatchReader""" + if pd is not None and isinstance(data, pd.DataFrame): + return pa.Table.from_pandas(data, schema=schema).to_reader() + elif isinstance(data, pa.Table): + return data.to_reader() + elif isinstance(data, pa.RecordBatch): + return pa.Table.from_batches([data]).to_reader() + # elif isinstance(data, LanceDataset): + # return data_obj.scanner().to_reader() + elif isinstance(data, pa.dataset.Dataset): + return pa.dataset.Scanner.from_dataset(data).to_reader() + elif isinstance(data, pa.dataset.Scanner): + return data.to_reader() + elif isinstance(data, pa.RecordBatchReader): + return data + elif ( + type(data).__module__.startswith("polars") + and data.__class__.__name__ == "DataFrame" + ): + return data.to_arrow().to_reader() + # for other iterables, assume they are of type Iterable[RecordBatch] + elif isinstance(data, Iterable): + if schema is not None: + data = _casting_recordbatch_iter(data, schema) + return pa.RecordBatchReader.from_batches(schema, data) + else: + raise ValueError( + "Must provide schema to write dataset from RecordBatch iterable" + ) + else: + raise TypeError( + f"Unknown data type {type(data)}. " + "Please check " + "https://lancedb.github.io/lance/read_and_write.html " + "to see supported types." + ) + + +def validate_schema(schema: pa.Schema): + """ + Make sure the metadata is valid utf8 + """ + if schema.metadata is not None: + _validate_metadata(schema.metadata) + + +def _validate_metadata(metadata: dict): + """ + Make sure the metadata values are valid utf8 (can be nested) + + Raises ValueError if not valid utf8 + """ + for k, v in metadata.items(): + if isinstance(v, bytes): + try: + v.decode("utf8") + except UnicodeDecodeError: + raise ValueError( + f"Metadata key {k} is not valid utf8. " + "Consider base64 encode for generic binary metadata." 
+ ) + elif isinstance(v, dict): + _validate_metadata(v) diff --git a/python/lancedb/conftest.py b/python/python/lancedb/conftest.py similarity index 100% rename from python/lancedb/conftest.py rename to python/python/lancedb/conftest.py diff --git a/python/lancedb/context.py b/python/python/lancedb/context.py similarity index 100% rename from python/lancedb/context.py rename to python/python/lancedb/context.py diff --git a/python/lancedb/db.py b/python/python/lancedb/db.py similarity index 55% rename from python/lancedb/db.py rename to python/python/lancedb/db.py index 41b56494..d87983da 100644 --- a/python/lancedb/db.py +++ b/python/python/lancedb/db.py @@ -13,6 +13,7 @@ from __future__ import annotations +import inspect import os from abc import abstractmethod from pathlib import Path @@ -22,15 +23,20 @@ import pyarrow as pa from overrides import EnforceOverrides, override from pyarrow import fs -from .table import LanceTable, Table +from lancedb.common import data_to_reader, validate_schema +from lancedb.embeddings.registry import EmbeddingFunctionRegistry +from lancedb.utils.events import register_event + +from .pydantic import LanceModel +from .table import AsyncLanceTable, LanceTable, Table, _sanitize_data from .util import fs_from_uri, get_uri_location, get_uri_scheme, join_uri if TYPE_CHECKING: from datetime import timedelta + from ._lancedb import Connection as LanceDbConnection from .common import DATA, URI from .embeddings import EmbeddingFunctionConfig - from .pydantic import LanceModel class DBConnection(EnforceOverrides): @@ -40,14 +46,21 @@ class DBConnection(EnforceOverrides): def table_names( self, page_token: Optional[str] = None, limit: int = 10 ) -> Iterable[str]: - """List all table in this database + """List all tables in this database, in sorted order Parameters ---------- page_token: str, optional The token to use for pagination. If not present, start from the beginning. + Typically, this token is last table name from the previous page. + Only supported by LanceDb Cloud. limit: int, default 10 The size of the page to return. + Only supported by LanceDb Cloud. + + Returns + ------- + Iterable of str """ pass @@ -412,3 +425,313 @@ class LanceDBConnection(DBConnection): def drop_database(self): filesystem, path = fs_from_uri(self.uri) filesystem.delete_dir(path) + + +class AsyncConnection(EnforceOverrides): + """An active LanceDB connection interface.""" + + @abstractmethod + async def table_names( + self, *, page_token: Optional[str] = None, limit: int = 10 + ) -> Iterable[str]: + """List all tables in this database, in sorted order + + Parameters + ---------- + page_token: str, optional + The token to use for pagination. If not present, start from the beginning. + Typically, this token is last table name from the previous page. + Only supported by LanceDb Cloud. + limit: int, default 10 + The size of the page to return. + Only supported by LanceDb Cloud. + + Returns + ------- + Iterable of str + """ + pass + + @abstractmethod + async def create_table( + self, + name: str, + data: Optional[DATA] = None, + schema: Optional[Union[pa.Schema, LanceModel]] = None, + mode: str = "create", + exist_ok: bool = False, + on_bad_vectors: str = "error", + fill_value: float = 0.0, + embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None, + ) -> Table: + """Create a [Table][lancedb.table.Table] in the database. + + Parameters + ---------- + name: str + The name of the table. 
+ data: The data to initialize the table, *optional* + User must provide at least one of `data` or `schema`. + Acceptable types are: + + - dict or list-of-dict + + - pandas.DataFrame + + - pyarrow.Table or pyarrow.RecordBatch + schema: The schema of the table, *optional* + Acceptable types are: + + - pyarrow.Schema + + - [LanceModel][lancedb.pydantic.LanceModel] + mode: str; default "create" + The mode to use when creating the table. + Can be either "create" or "overwrite". + By default, if the table already exists, an exception is raised. + If you want to overwrite the table, use mode="overwrite". + exist_ok: bool, default False + If a table by the same name already exists, then raise an exception + if exist_ok=False. If exist_ok=True, then open the existing table; + it will not add the provided data but will validate against any + schema that's specified. + on_bad_vectors: str, default "error" + What to do if any of the vectors are not the same size or contains NaNs. + One of "error", "drop", "fill". + fill_value: float + The value to use when filling vectors. Only used if on_bad_vectors="fill". + + Returns + ------- + LanceTable + A reference to the newly created table. + + !!! note + + The vector index won't be created by default. + To create the index, call the `create_index` method on the table. + + Examples + -------- + + Can create with list of tuples or dictionaries: + + >>> import lancedb + >>> db = lancedb.connect("./.lancedb") + >>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7}, + ... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}] + >>> db.create_table("my_table", data) + LanceTable(connection=..., name="my_table") + >>> db["my_table"].head() + pyarrow.Table + vector: fixed_size_list[2] + child 0, item: float + lat: double + long: double + ---- + vector: [[[1.1,1.2],[0.2,1.8]]] + lat: [[45.5,40.1]] + long: [[-122.7,-74.1]] + + You can also pass a pandas DataFrame: + + >>> import pandas as pd + >>> data = pd.DataFrame({ + ... "vector": [[1.1, 1.2], [0.2, 1.8]], + ... "lat": [45.5, 40.1], + ... "long": [-122.7, -74.1] + ... }) + >>> db.create_table("table2", data) + LanceTable(connection=..., name="table2") + >>> db["table2"].head() + pyarrow.Table + vector: fixed_size_list[2] + child 0, item: float + lat: double + long: double + ---- + vector: [[[1.1,1.2],[0.2,1.8]]] + lat: [[45.5,40.1]] + long: [[-122.7,-74.1]] + + Data is converted to Arrow before being written to disk. For maximum + control over how data is saved, either provide the PyArrow schema to + convert to or else provide a [PyArrow Table](pyarrow.Table) directly. + + >>> custom_schema = pa.schema([ + ... pa.field("vector", pa.list_(pa.float32(), 2)), + ... pa.field("lat", pa.float32()), + ... pa.field("long", pa.float32()) + ... ]) + >>> db.create_table("table3", data, schema = custom_schema) + LanceTable(connection=..., name="table3") + >>> db["table3"].head() + pyarrow.Table + vector: fixed_size_list[2] + child 0, item: float + lat: float + long: float + ---- + vector: [[[1.1,1.2],[0.2,1.8]]] + lat: [[45.5,40.1]] + long: [[-122.7,-74.1]] + + + It is also possible to create an table from `[Iterable[pa.RecordBatch]]`: + + + >>> import pyarrow as pa + >>> def make_batches(): + ... for i in range(5): + ... yield pa.RecordBatch.from_arrays( + ... [ + ... pa.array([[3.1, 4.1], [5.9, 26.5]], + ... pa.list_(pa.float32(), 2)), + ... pa.array(["foo", "bar"]), + ... pa.array([10.0, 20.0]), + ... ], + ... ["vector", "item", "price"], + ... ) + >>> schema=pa.schema([ + ... 
pa.field("vector", pa.list_(pa.float32(), 2)), + ... pa.field("item", pa.utf8()), + ... pa.field("price", pa.float32()), + ... ]) + >>> db.create_table("table4", make_batches(), schema=schema) + LanceTable(connection=..., name="table4") + + """ + raise NotImplementedError + + async def open_table(self, name: str) -> Table: + """Open a Lance Table in the database. + + Parameters + ---------- + name: str + The name of the table. + + Returns + ------- + A LanceTable object representing the table. + """ + raise NotImplementedError + + async def drop_table(self, name: str): + """Drop a table from the database. + + Parameters + ---------- + name: str + The name of the table. + """ + raise NotImplementedError + + async def drop_database(self): + """ + Drop database + This is the same thing as dropping all the tables + """ + raise NotImplementedError + + +class AsyncLanceDBConnection(AsyncConnection): + def __init__(self, connection: LanceDbConnection): + self._inner = connection + + async def __repr__(self) -> str: + pass + + @override + async def table_names( + self, + *, + page_token=None, + limit=None, + ) -> Iterable[str]: + # TODO: hook in page_token and limit + return await self._inner.table_names() + + @override + async def create_table( + self, + name: str, + data: Optional[DATA] = None, + schema: Optional[Union[pa.Schema, LanceModel]] = None, + mode: str = "create", + exist_ok: bool = False, + on_bad_vectors: str = "error", + fill_value: float = 0.0, + embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None, + ) -> Table: + if mode.lower() not in ["create", "overwrite"]: + raise ValueError("mode must be either 'create' or 'overwrite'") + + if inspect.isclass(schema) and issubclass(schema, LanceModel): + # convert LanceModel to pyarrow schema + # note that it's possible this contains + # embedding function metadata already + schema = schema.to_arrow_schema() + + metadata = None + if embedding_functions is not None: + # If we passed in embedding functions explicitly + # then we'll override any schema metadata that + # may was implicitly specified by the LanceModel schema + registry = EmbeddingFunctionRegistry.get_instance() + metadata = registry.get_table_metadata(embedding_functions) + + if data is not None: + data = _sanitize_data( + data, + schema, + metadata=metadata, + on_bad_vectors=on_bad_vectors, + fill_value=fill_value, + ) + + if schema is None: + if data is None: + raise ValueError("Either data or schema must be provided") + elif hasattr(data, "schema"): + schema = data.schema + elif isinstance(data, Iterable): + if metadata: + raise TypeError( + ( + "Persistent embedding functions not yet " + "supported for generator data input" + ) + ) + + if metadata: + schema = schema.with_metadata(metadata) + validate_schema(schema) + + if mode == "create" and exist_ok: + mode = "exist_ok" + + if data is None: + new_table = await self._inner.create_empty_table(name, mode, schema) + else: + data = data_to_reader(data, schema) + new_table = await self._inner.create_table( + name, + mode, + data, + ) + + register_event("create_table") + return AsyncLanceTable(new_table) + + @override + async def open_table(self, name: str) -> LanceTable: + raise NotImplementedError + + @override + async def drop_table(self, name: str, ignore_missing: bool = False): + raise NotImplementedError + + @override + async def drop_database(self): + raise NotImplementedError diff --git a/python/lancedb/embeddings/__init__.py b/python/python/lancedb/embeddings/__init__.py similarity index 100% rename from 
python/lancedb/embeddings/__init__.py rename to python/python/lancedb/embeddings/__init__.py diff --git a/python/lancedb/embeddings/base.py b/python/python/lancedb/embeddings/base.py similarity index 100% rename from python/lancedb/embeddings/base.py rename to python/python/lancedb/embeddings/base.py diff --git a/python/lancedb/embeddings/bedrock.py b/python/python/lancedb/embeddings/bedrock.py similarity index 100% rename from python/lancedb/embeddings/bedrock.py rename to python/python/lancedb/embeddings/bedrock.py diff --git a/python/lancedb/embeddings/cohere.py b/python/python/lancedb/embeddings/cohere.py similarity index 100% rename from python/lancedb/embeddings/cohere.py rename to python/python/lancedb/embeddings/cohere.py diff --git a/python/lancedb/embeddings/gemini_text.py b/python/python/lancedb/embeddings/gemini_text.py similarity index 100% rename from python/lancedb/embeddings/gemini_text.py rename to python/python/lancedb/embeddings/gemini_text.py diff --git a/python/lancedb/embeddings/gte.py b/python/python/lancedb/embeddings/gte.py similarity index 100% rename from python/lancedb/embeddings/gte.py rename to python/python/lancedb/embeddings/gte.py diff --git a/python/lancedb/embeddings/gte_mlx_model.py b/python/python/lancedb/embeddings/gte_mlx_model.py similarity index 100% rename from python/lancedb/embeddings/gte_mlx_model.py rename to python/python/lancedb/embeddings/gte_mlx_model.py diff --git a/python/python/lancedb/embeddings/imagebind.py b/python/python/lancedb/embeddings/imagebind.py new file mode 100644 index 00000000..eb89d505 --- /dev/null +++ b/python/python/lancedb/embeddings/imagebind.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023. LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import cached_property +from typing import List, Union + +import numpy as np +import pyarrow as pa + +from ..util import attempt_import_or_raise +from .base import EmbeddingFunction +from .registry import register +from .utils import AUDIO, IMAGES, TEXT + + +@register("imagebind") +class ImageBindEmbeddings(EmbeddingFunction): + """ + An embedding function that uses the ImageBind API + For generating multi-modal embeddings across + six different modalities: images, text, audio, depth, thermal, and IMU data + + to download package, run : + `pip install imagebind@git+https://github.com/raghavdixit99/ImageBind` + """ + + name: str = "imagebind_huge" + device: str = "cpu" + normalize: bool = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._ndims = 1024 + self._audio_extensions = (".mp3", ".wav", ".flac", ".ogg", ".aac") + self._image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp") + + @cached_property + def embedding_model(self): + """ + Get the embedding model. This is cached so that the model is only loaded + once per process. 
+ """ + return self.get_embedding_model() + + @cached_property + def _data(self): + """ + Get the data module from imagebind + """ + data = attempt_import_or_raise("imagebind.data", "imagebind") + return data + + @cached_property + def _ModalityType(self): + """ + Get the ModalityType from imagebind + """ + imagebind = attempt_import_or_raise("imagebind", "imagebind") + return imagebind.imagebind_model.ModalityType + + def ndims(self): + return self._ndims + + def compute_query_embeddings( + self, query: Union[str], *args, **kwargs + ) -> List[np.ndarray]: + """ + Compute the embeddings for a given user query + + Parameters + ---------- + query : Union[str] + The query to embed. A query can be either text, image paths or audio paths. + """ + query = self.sanitize_input(query) + if query[0].endswith(self._audio_extensions): + return [self.generate_audio_embeddings(query)] + elif query[0].endswith(self._image_extensions): + return [self.generate_image_embeddings(query)] + else: + return [self.generate_text_embeddings(query)] + + def generate_image_embeddings(self, image: IMAGES) -> np.ndarray: + torch = attempt_import_or_raise("torch") + inputs = { + self._ModalityType.VISION: self._data.load_and_transform_vision_data( + image, self.device + ) + } + with torch.no_grad(): + image_features = self.embedding_model(inputs)[self._ModalityType.VISION] + if self.normalize: + image_features /= image_features.norm(dim=-1, keepdim=True) + return image_features.cpu().numpy().squeeze() + + def generate_audio_embeddings(self, audio: AUDIO) -> np.ndarray: + torch = attempt_import_or_raise("torch") + inputs = { + self._ModalityType.AUDIO: self._data.load_and_transform_audio_data( + audio, self.device + ) + } + with torch.no_grad(): + audio_features = self.embedding_model(inputs)[self._ModalityType.AUDIO] + if self.normalize: + audio_features /= audio_features.norm(dim=-1, keepdim=True) + return audio_features.cpu().numpy().squeeze() + + def generate_text_embeddings(self, text: TEXT) -> np.ndarray: + torch = attempt_import_or_raise("torch") + inputs = { + self._ModalityType.TEXT: self._data.load_and_transform_text( + text, self.device + ) + } + with torch.no_grad(): + text_features = self.embedding_model(inputs)[self._ModalityType.TEXT] + if self.normalize: + text_features /= text_features.norm(dim=-1, keepdim=True) + return text_features.cpu().numpy().squeeze() + + def compute_source_embeddings( + self, source: Union[IMAGES, AUDIO], *args, **kwargs + ) -> List[np.array]: + """ + Get the embeddings for the given sourcefield column in the pydantic model. + """ + source = self.sanitize_input(source) + embeddings = [] + if source[0].endswith(self._audio_extensions): + embeddings.extend(self.generate_audio_embeddings(source)) + return embeddings + elif source[0].endswith(self._image_extensions): + embeddings.extend(self.generate_image_embeddings(source)) + return embeddings + else: + embeddings.extend(self.generate_text_embeddings(source)) + return embeddings + + def sanitize_input( + self, input: Union[IMAGES, AUDIO] + ) -> Union[List[bytes], np.ndarray]: + """ + Sanitize the input to the embedding function. 
+ """ + if isinstance(input, (str, bytes)): + input = [input] + elif isinstance(input, pa.Array): + input = input.to_pylist() + elif isinstance(input, pa.ChunkedArray): + input = input.combine_chunks().to_pylist() + return input + + def get_embedding_model(self): + """ + fetches the imagebind embedding model + """ + imagebind = attempt_import_or_raise("imagebind", "imagebind") + model = imagebind.imagebind_model.imagebind_huge(pretrained=True) + model.eval() + model.to(self.device) + return model diff --git a/python/lancedb/embeddings/instructor.py b/python/python/lancedb/embeddings/instructor.py similarity index 98% rename from python/lancedb/embeddings/instructor.py rename to python/python/lancedb/embeddings/instructor.py index e6481e19..98206bc5 100644 --- a/python/lancedb/embeddings/instructor.py +++ b/python/python/lancedb/embeddings/instructor.py @@ -103,9 +103,9 @@ class InstructorEmbeddingFunction(TextEmbeddingFunction): # convert_to_numpy: bool = True # Hardcoding this as numpy can be ingested directly source_instruction: str = "represent the document for retrieval" - query_instruction: ( - str - ) = "represent the document for retrieving the most similar documents" + query_instruction: str = ( + "represent the document for retrieving the most similar documents" + ) @weak_lru(maxsize=1) def ndims(self): diff --git a/python/lancedb/embeddings/open_clip.py b/python/python/lancedb/embeddings/open_clip.py similarity index 100% rename from python/lancedb/embeddings/open_clip.py rename to python/python/lancedb/embeddings/open_clip.py diff --git a/python/lancedb/embeddings/openai.py b/python/python/lancedb/embeddings/openai.py similarity index 100% rename from python/lancedb/embeddings/openai.py rename to python/python/lancedb/embeddings/openai.py diff --git a/python/lancedb/embeddings/registry.py b/python/python/lancedb/embeddings/registry.py similarity index 100% rename from python/lancedb/embeddings/registry.py rename to python/python/lancedb/embeddings/registry.py diff --git a/python/lancedb/embeddings/sentence_transformers.py b/python/python/lancedb/embeddings/sentence_transformers.py similarity index 100% rename from python/lancedb/embeddings/sentence_transformers.py rename to python/python/lancedb/embeddings/sentence_transformers.py diff --git a/python/lancedb/embeddings/utils.py b/python/python/lancedb/embeddings/utils.py similarity index 98% rename from python/lancedb/embeddings/utils.py rename to python/python/lancedb/embeddings/utils.py index ed9162ba..fe997bbc 100644 --- a/python/lancedb/embeddings/utils.py +++ b/python/python/lancedb/embeddings/utils.py @@ -36,6 +36,7 @@ TEXT = Union[str, List[str], pa.Array, pa.ChunkedArray, np.ndarray] IMAGES = Union[ str, bytes, List[str], List[bytes], pa.Array, pa.ChunkedArray, np.ndarray ] +AUDIO = Union[str, bytes, List[str], List[bytes], pa.Array, pa.ChunkedArray, np.ndarray] @deprecated diff --git a/python/lancedb/exceptions.py b/python/python/lancedb/exceptions.py similarity index 100% rename from python/lancedb/exceptions.py rename to python/python/lancedb/exceptions.py diff --git a/python/lancedb/fts.py b/python/python/lancedb/fts.py similarity index 99% rename from python/lancedb/fts.py rename to python/python/lancedb/fts.py index 750e3076..cb36aa79 100644 --- a/python/lancedb/fts.py +++ b/python/python/lancedb/fts.py @@ -12,6 +12,7 @@ # limitations under the License. 
"""Full text search index using tantivy-py""" + import os from typing import List, Tuple diff --git a/python/lancedb/merge.py b/python/python/lancedb/merge.py similarity index 100% rename from python/lancedb/merge.py rename to python/python/lancedb/merge.py diff --git a/python/lancedb/pydantic.py b/python/python/lancedb/pydantic.py similarity index 100% rename from python/lancedb/pydantic.py rename to python/python/lancedb/pydantic.py diff --git a/python/lancedb/query.py b/python/python/lancedb/query.py similarity index 97% rename from python/lancedb/query.py rename to python/python/lancedb/query.py index 1338f0f2..c4efa505 100644 --- a/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -16,7 +16,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Type, Union import deprecation import numpy as np @@ -93,7 +93,7 @@ class Query(pydantic.BaseModel): metric: str = "L2" # which columns to return in the results - columns: Optional[List[str]] = None + columns: Optional[Union[List[str], Dict[str, str]]] = None # optional query parameters for tuning the results, # e.g. `{"nprobes": "10", "refine_factor": "10"}` @@ -332,20 +332,25 @@ class LanceQueryBuilder(ABC): self._limit = limit return self - def select(self, columns: list) -> LanceQueryBuilder: + def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder: """Set the columns to return. Parameters ---------- - columns: list - The columns to return. + columns: list of str, or dict of str to str default None + List of column names to be fetched. + Or a dictionary of column names to SQL expressions. + All columns are fetched if None or unspecified. Returns ------- LanceQueryBuilder The LanceQueryBuilder object. """ - self._columns = columns + if isinstance(columns, list) or isinstance(columns, dict): + self._columns = columns + else: + raise ValueError("columns must be a list or a dictionary") return self def where(self, where: str, prefilter: bool = False) -> LanceQueryBuilder: @@ -403,7 +408,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): >>> (table.search([0.4, 0.4]) ... .metric("cosine") ... .where("b < 10") - ... .select(["b"]) + ... .select(["b", "vector"]) ... .limit(2) ... 
.to_pandas()) b vector _distance diff --git a/python/lancedb/remote/__init__.py b/python/python/lancedb/remote/__init__.py similarity index 100% rename from python/lancedb/remote/__init__.py rename to python/python/lancedb/remote/__init__.py diff --git a/python/lancedb/remote/arrow.py b/python/python/lancedb/remote/arrow.py similarity index 100% rename from python/lancedb/remote/arrow.py rename to python/python/lancedb/remote/arrow.py diff --git a/python/lancedb/remote/client.py b/python/python/lancedb/remote/client.py similarity index 100% rename from python/lancedb/remote/client.py rename to python/python/lancedb/remote/client.py diff --git a/python/lancedb/remote/connection_timeout.py b/python/python/lancedb/remote/connection_timeout.py similarity index 100% rename from python/lancedb/remote/connection_timeout.py rename to python/python/lancedb/remote/connection_timeout.py diff --git a/python/lancedb/remote/db.py b/python/python/lancedb/remote/db.py similarity index 100% rename from python/lancedb/remote/db.py rename to python/python/lancedb/remote/db.py diff --git a/python/lancedb/remote/errors.py b/python/python/lancedb/remote/errors.py similarity index 100% rename from python/lancedb/remote/errors.py rename to python/python/lancedb/remote/errors.py diff --git a/python/lancedb/remote/table.py b/python/python/lancedb/remote/table.py similarity index 92% rename from python/lancedb/remote/table.py rename to python/python/lancedb/remote/table.py index a8766eef..bbae9921 100644 --- a/python/lancedb/remote/table.py +++ b/python/python/lancedb/remote/table.py @@ -15,7 +15,7 @@ import logging import uuid from concurrent.futures import Future from functools import cached_property -from typing import Dict, Optional, Union +from typing import Dict, Iterable, Optional, Union import pyarrow as pa from lance import json_to_schema @@ -66,12 +66,36 @@ class RemoteTable(Table): """to_pandas() is not yet supported on LanceDB cloud.""" return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.") - def create_scalar_index(self, *args, **kwargs): - """Creates a scalar index""" - return NotImplementedError( - "create_scalar_index() is not yet supported on LanceDB cloud." + def list_indices(self): + """List all the indices on the table""" + print(self._name) + resp = self._conn._client.post(f"/v1/table/{self._name}/index/list/") + return resp + + def create_scalar_index( + self, + column: str, + ): + """Creates a scalar index + Parameters + ---------- + column : str + The column to be indexed. Must be a boolean, integer, float, + or string column. 
+ """ + index_type = "scalar" + + data = { + "column": column, + "index_type": index_type, + "replace": True, + } + resp = self._conn._client.post( + f"/v1/table/{self._name}/create_scalar_index/", data=data ) + return resp + def create_index( self, metric="L2", @@ -277,6 +301,7 @@ class RemoteTable(Table): f = Future() f.set_result(self._conn._client.query(name, q)) return f + else: def submit(name, q): @@ -473,6 +498,21 @@ class RemoteTable(Table): "count_rows() is not yet supported on the LanceDB cloud" ) + def add_columns(self, transforms: Dict[str, str]): + raise NotImplementedError( + "add_columns() is not yet supported on the LanceDB cloud" + ) + + def alter_columns(self, alterations: Iterable[Dict[str, str]]): + raise NotImplementedError( + "alter_columns() is not yet supported on the LanceDB cloud" + ) + + def drop_columns(self, columns: Iterable[str]): + raise NotImplementedError( + "drop_columns() is not yet supported on the LanceDB cloud" + ) + def add_index(tbl: pa.Table, i: int) -> pa.Table: return tbl.add_column( diff --git a/python/lancedb/rerankers/__init__.py b/python/python/lancedb/rerankers/__init__.py similarity index 100% rename from python/lancedb/rerankers/__init__.py rename to python/python/lancedb/rerankers/__init__.py diff --git a/python/lancedb/rerankers/base.py b/python/python/lancedb/rerankers/base.py similarity index 100% rename from python/lancedb/rerankers/base.py rename to python/python/lancedb/rerankers/base.py diff --git a/python/lancedb/rerankers/cohere.py b/python/python/lancedb/rerankers/cohere.py similarity index 100% rename from python/lancedb/rerankers/cohere.py rename to python/python/lancedb/rerankers/cohere.py diff --git a/python/lancedb/rerankers/colbert.py b/python/python/lancedb/rerankers/colbert.py similarity index 100% rename from python/lancedb/rerankers/colbert.py rename to python/python/lancedb/rerankers/colbert.py diff --git a/python/lancedb/rerankers/cross_encoder.py b/python/python/lancedb/rerankers/cross_encoder.py similarity index 100% rename from python/lancedb/rerankers/cross_encoder.py rename to python/python/lancedb/rerankers/cross_encoder.py diff --git a/python/lancedb/rerankers/linear_combination.py b/python/python/lancedb/rerankers/linear_combination.py similarity index 100% rename from python/lancedb/rerankers/linear_combination.py rename to python/python/lancedb/rerankers/linear_combination.py diff --git a/python/lancedb/rerankers/openai.py b/python/python/lancedb/rerankers/openai.py similarity index 100% rename from python/lancedb/rerankers/openai.py rename to python/python/lancedb/rerankers/openai.py diff --git a/python/lancedb/schema.py b/python/python/lancedb/schema.py similarity index 99% rename from python/lancedb/schema.py rename to python/python/lancedb/schema.py index 9b5dd5e7..1a604dd5 100644 --- a/python/lancedb/schema.py +++ b/python/python/lancedb/schema.py @@ -12,6 +12,7 @@ # limitations under the License. 
"""Schema related utilities.""" + import pyarrow as pa diff --git a/python/lancedb/table.py b/python/python/lancedb/table.py similarity index 69% rename from python/lancedb/table.py rename to python/python/lancedb/table.py index c6bf3332..dba684cb 100644 --- a/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -28,6 +28,7 @@ import pyarrow.compute as pc import pyarrow.fs as pa_fs from lance import LanceDataset from lance.vector import vec_to_table +from overrides import override from .common import DATA, VEC, VECTOR_COLUMN_NAME from .embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry @@ -48,6 +49,7 @@ if TYPE_CHECKING: import PIL from lance.dataset import CleanupStats, ReaderLike + from ._lancedb import Table as LanceDBTable from .db import LanceDBConnection @@ -160,7 +162,7 @@ class Table(ABC): Can query the table with [Table.search][lancedb.table.Table.search]. - >>> table.search([0.4, 0.4]).select(["b"]).to_pandas() + >>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas() b vector _distance 0 4 [0.5, 1.3] 0.82 1 2 [1.1, 1.2] 1.13 @@ -438,7 +440,7 @@ class Table(ABC): >>> query = [0.4, 1.4, 2.4] >>> (table.search(query) ... .where("original_width > 1000", prefilter=True) - ... .select(["caption", "original_width"]) + ... .select(["caption", "original_width", "vector"]) ... .limit(2) ... .to_pandas()) caption original_width vector _distance @@ -662,6 +664,56 @@ class Table(ABC): For most cases, the default should be fine. """ + @abstractmethod + def add_columns(self, transforms: Dict[str, str]): + """ + Add new columns with defined values. + + This is not yet available in LanceDB Cloud. + + Parameters + ---------- + transforms: Dict[str, str] + A map of column name to a SQL expression to use to calculate the + value of the new column. These expressions will be evaluated for + each row in the table, and can reference existing columns. + """ + + @abstractmethod + def alter_columns(self, alterations: Iterable[Dict[str, str]]): + """ + Alter column names and nullability. + + This is not yet available in LanceDB Cloud. + + alterations : Iterable[Dict[str, Any]] + A sequence of dictionaries, each with the following keys: + - "path": str + The column path to alter. For a top-level column, this is the name. + For a nested column, this is the dot-separated path, e.g. "a.b.c". + - "name": str, optional + The new name of the column. If not specified, the column name is + not changed. + - "nullable": bool, optional + Whether the column should be nullable. If not specified, the column + nullability is not changed. Only non-nullable columns can be changed + to nullable. Currently, you cannot change a nullable column to + non-nullable. + """ + + @abstractmethod + def drop_columns(self, columns: Iterable[str]): + """ + Drop columns from the table. + + This is not yet available in LanceDB Cloud. + + Parameters + ---------- + columns : Iterable[str] + The names of the columns to drop. + """ + class _LanceDatasetRef(ABC): @property @@ -1223,7 +1275,7 @@ class LanceTable(Table): >>> query = [0.4, 1.4, 2.4] >>> (table.search(query) ... .where("original_width > 1000", prefilter=True) - ... .select(["caption", "original_width"]) + ... .select(["caption", "original_width", "vector"]) ... .limit(2) ... 
.to_pandas()) caption original_width vector _distance @@ -1550,6 +1602,22 @@ class LanceTable(Table): """ return self.to_lance().optimize.compact_files(*args, **kwargs) + def add_columns(self, transforms: Dict[str, str]): + self._dataset_mut.add_columns(transforms) + + def alter_columns(self, *alterations: Iterable[Dict[str, str]]): + modified = [] + # I called this name in pylance, but I think I regret that now. So we + # allow both name and rename. + for alter in alterations: + if "rename" in alter: + alter["name"] = alter.pop("rename") + modified.append(alter) + self._dataset_mut.alter_columns(*modified) + + def drop_columns(self, columns: Iterable[str]): + self._dataset_mut.drop_columns(columns) + def _sanitize_schema( data: pa.Table, @@ -1728,3 +1796,715 @@ def _sanitize_nans(data, fill_value, on_bad_vectors, vec_arr, vector_column_name is_full = np.any(~is_value_nan.reshape(-1, vec_arr.type.list_size), axis=1) data = data.filter(is_full) return data + + +class AsyncTable(ABC): + """ + A Table is a collection of Records in a LanceDB Database. + + Examples + -------- + + Create using [DBConnection.create_table][lancedb.DBConnection.create_table] + (more examples in that method's documentation). + + >>> import lancedb + >>> db = lancedb.connect("./.lancedb") + >>> table = db.create_table("my_table", data=[{"vector": [1.1, 1.2], "b": 2}]) + >>> table.head() + pyarrow.Table + vector: fixed_size_list[2] + child 0, item: float + b: int64 + ---- + vector: [[[1.1,1.2]]] + b: [[2]] + + Can append new data with [Table.add()][lancedb.table.Table.add]. + + >>> table.add([{"vector": [0.5, 1.3], "b": 4}]) + + Can query the table with [Table.search][lancedb.table.Table.search]. + + >>> table.search([0.4, 0.4]).select(["b", "vector"]).to_pandas() + b vector _distance + 0 4 [0.5, 1.3] 0.82 + 1 2 [1.1, 1.2] 1.13 + + Search queries are much faster when an index is created. See + [Table.create_index][lancedb.table.Table.create_index]. + """ + + @property + @abstractmethod + def name(self) -> str: + """The name of the table.""" + raise NotImplementedError + + @abstractmethod + async def schema(self) -> pa.Schema: + """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#) + of this Table + + """ + raise NotImplementedError + + @abstractmethod + async def count_rows(self, filter: Optional[str] = None) -> int: + """ + Count the number of rows in the table. + + Parameters + ---------- + filter: str, optional + A SQL where clause to filter the rows to count. + """ + raise NotImplementedError + + async def to_pandas(self) -> "pd.DataFrame": + """Return the table as a pandas DataFrame. + + Returns + ------- + pd.DataFrame + """ + return self.to_arrow().to_pandas() + + @abstractmethod + async def to_arrow(self) -> pa.Table: + """Return the table as a pyarrow Table. + + Returns + ------- + pa.Table + """ + raise NotImplementedError + + async def create_index( + self, + metric="L2", + num_partitions=256, + num_sub_vectors=96, + vector_column_name: str = VECTOR_COLUMN_NAME, + replace: bool = True, + accelerator: Optional[str] = None, + index_cache_size: Optional[int] = None, + ): + """Create an index on the table. + + Parameters + ---------- + metric: str, default "L2" + The distance metric to use when creating the index. + Valid values are "L2", "cosine", or "dot". + L2 is euclidean distance. + num_partitions: int, default 256 + The number of IVF partitions to use when creating the index. + Default is 256. 
+ num_sub_vectors: int, default 96 + The number of PQ sub-vectors to use when creating the index. + Default is 96. + vector_column_name: str, default "vector" + The name of the vector column to create the index on. + replace: bool, default True + - If True, replace the existing index if it exists. + + - If False, raise an error if a duplicate index exists. + accelerator: str, default None + If set, use the given accelerator to create the index. + Only "cuda" is supported for now. + index_cache_size : int, optional + The size of the index cache in number of entries. Default value is 256. + """ + raise NotImplementedError + + @abstractmethod + async def create_scalar_index( + self, + column: str, + *, + replace: bool = True, + ): + """Create a scalar index on a column. + + Scalar indices, like vector indices, can be used to speed up scans. A scalar + index can speed up scans that contain filter expressions on the indexed column. + For example, the following scan will be faster if the column ``my_col`` has + a scalar index: + + .. code-block:: python + + import lancedb + + db = lancedb.connect("/data/lance") + img_table = db.open_table("images") + my_df = img_table.search().where("my_col = 7", prefilter=True).to_pandas() + + Scalar indices can also speed up scans containing a vector search and a + prefilter: + + .. code-block:: python + + import lancedb + + db = lancedb.connect("/data/lance") + img_table = db.open_table("images") + img_table.search([1, 2, 3, 4], vector_column_name="vector") + .where("my_col != 7", prefilter=True) + .to_pandas() + + Scalar indices can only speed up scans for basic filters using + equality, comparison, range (e.g. ``my_col BETWEEN 0 AND 100``), and set + membership (e.g. ``my_col IN (0, 1, 2)``). + + Scalar indices can be used if the filter contains multiple indexed columns and + the filter criteria are AND'd or OR'd together + (e.g. ``my_col < 0 AND other_col > 100``). + + Scalar indices may be used if the filter contains non-indexed columns but, + depending on the structure of the filter, they may not be usable. For example, + if the column ``not_indexed`` does not have a scalar index then the filter + ``my_col = 0 OR not_indexed = 1`` will not be able to use any scalar index on + ``my_col``. + + **Experimental API** + + Parameters + ---------- + column : str + The column to be indexed. Must be a boolean, integer, float, + or string column. + replace : bool, default True + Replace the existing index if it exists. + + Examples + -------- + + .. code-block:: python + + import lance + + dataset = lance.dataset("./images.lance") + dataset.create_scalar_index("category") + """ + raise NotImplementedError + + @abstractmethod + async def add( + self, + data: DATA, + mode: str = "append", + on_bad_vectors: str = "error", + fill_value: float = 0.0, + ): + """Add more data to the [Table](Table). + + Parameters + ---------- + data: DATA + The data to insert into the table. Acceptable types are: + + - dict or list-of-dict + + - pandas.DataFrame + + - pyarrow.Table or pyarrow.RecordBatch + mode: str + The mode to use when writing the data. Valid values are + "append" and "overwrite". + on_bad_vectors: str, default "error" + What to do if any of the vectors are not the same size or contain NaNs. + One of "error", "drop", "fill". + fill_value: float, default 0. + The value to use when filling vectors. Only used if on_bad_vectors="fill".
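The async add above is still a stub, but the on_bad_vectors / fill_value contract it documents is the one the synchronous Table.add already implements. A minimal sketch of the "fill" behavior, assuming a local database and an illustrative table name:

```python
import lancedb

db = lancedb.connect("./.lancedb")
tbl = db.create_table("bad_vec_demo", data=[{"vector": [1.0, 2.0], "id": 1}])

# With the default on_bad_vectors="error" this add would raise, because the
# vector contains a NaN; "fill" patches the NaN entries with fill_value,
# while "drop" would silently skip the row instead.
tbl.add(
    [{"vector": [float("nan"), 2.0], "id": 2}],
    on_bad_vectors="fill",
    fill_value=0.0,
)
print(tbl.to_pandas())
```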
+ + """ + raise NotImplementedError + + def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder: + """ + Returns a [`LanceMergeInsertBuilder`][lancedb.merge.LanceMergeInsertBuilder] + that can be used to create a "merge insert" operation + + This operation can add rows, update rows, and remove rows all in a single + transaction. It is a very generic tool that can be used to create + behaviors like "insert if not exists", "update or insert (i.e. upsert)", + or even replace a portion of existing data with new data (e.g. replace + all data where month="january") + + The merge insert operation works by combining new data from a + **source table** with existing data in a **target table** by using a + join. There are three categories of records. + + "Matched" records are records that exist in both the source table and + the target table. "Not matched" records exist only in the source table + (e.g. these are new data) "Not matched by source" records exist only + in the target table (this is old data) + + The builder returned by this method can be used to customize what + should happen for each category of data. + + Please note that the data may appear to be reordered as part of this + operation. This is because updated rows will be deleted from the + dataset and then reinserted at the end with the new values. + + Parameters + ---------- + + on: Union[str, Iterable[str]] + A column (or columns) to join on. This is how records from the + source table and target table are matched. Typically this is some + kind of key or id column. + + Examples + -------- + >>> import lancedb + >>> data = pa.table({"a": [2, 1, 3], "b": ["a", "b", "c"]}) + >>> db = lancedb.connect("./.lancedb") + >>> table = db.create_table("my_table", data) + >>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]}) + >>> # Perform a "upsert" operation + >>> table.merge_insert("a") \\ + ... .when_matched_update_all() \\ + ... .when_not_matched_insert_all() \\ + ... .execute(new_data) + >>> # The order of new rows is non-deterministic since we use + >>> # a hash-join as part of this operation and so we sort here + >>> table.to_arrow().sort_by("a").to_pandas() + a b + 0 1 b + 1 2 x + 2 3 y + 3 4 z + """ + on = [on] if isinstance(on, str) else list(on.iter()) + + return LanceMergeInsertBuilder(self, on) + + @abstractmethod + async def search( + self, + query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None, + vector_column_name: Optional[str] = None, + query_type: str = "auto", + ) -> LanceQueryBuilder: + """Create a search query to find the nearest neighbors + of the given query vector. We currently support [vector search][search] + and [full-text search][experimental-full-text-search]. + + All query options are defined in [Query][lancedb.query.Query]. + + Examples + -------- + >>> import lancedb + >>> db = lancedb.connect("./.lancedb") + >>> data = [ + ... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]}, + ... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]}, + ... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]} + ... ] + >>> table = db.create_table("my_table", data) + >>> query = [0.4, 1.4, 2.4] + >>> (table.search(query) + ... .where("original_width > 1000", prefilter=True) + ... .select(["caption", "original_width", "vector"]) + ... .limit(2) + ... 
.to_pandas()) + caption original_width vector _distance + 0 foo 2000 [0.5, 3.4, 1.3] 5.220000 + 1 test 3000 [0.3, 6.2, 2.6] 23.089996 + + Parameters + ---------- + query: list/np.ndarray/str/PIL.Image.Image, default None + The targeted vector to search for. + + - *default None*. + Acceptable types are: list, np.ndarray, PIL.Image.Image + + - If None then the select/where/limit clauses are applied to filter + the table. + vector_column_name: str, optional + The name of the vector column to search. + + The vector column needs to be a pyarrow fixed size list type. + + - If not specified then the vector column is inferred from + the table schema + + - If the table has multiple vector columns then the *vector_column_name* + needs to be specified. Otherwise, an error is raised. + query_type: str + *default "auto"*. + Acceptable types are: "vector", "fts", "hybrid", or "auto" + + - If "auto" then the query type is inferred from the query; + + - If `query` is a list/np.ndarray then the query type is + "vector"; + + - If `query` is a PIL.Image.Image then either do vector search, + or raise an error if no corresponding embedding function is found. + + - If `query` is a string, then the query type is "vector" if the + table has embedding functions, else the query type is "fts" + + Returns + ------- + LanceQueryBuilder + A query builder object representing the query. + Once executed, the query returns + + - selected columns + + - the vector + + - and also the "_distance" column which is the distance between the query + vector and the returned vector. + """ + raise NotImplementedError + + @abstractmethod + async def _execute_query(self, query: Query) -> pa.Table: + pass + + @abstractmethod + async def _do_merge( + self, + merge: LanceMergeInsertBuilder, + new_data: DATA, + on_bad_vectors: str, + fill_value: float, + ): + pass + + @abstractmethod + async def delete(self, where: str): + """Delete rows from the table. + + This can be used to delete a single row, many rows, all rows, or + sometimes no rows (if your predicate matches nothing). + + Parameters + ---------- + where: str + The SQL where clause to use when deleting rows. + + - For example, 'x = 2' or 'x IN (1, 2, 3)'. + + The filter must not be empty, or it will error. + + Examples + -------- + >>> import lancedb + >>> data = [ + ... {"x": 1, "vector": [1, 2]}, + ... {"x": 2, "vector": [3, 4]}, + ... {"x": 3, "vector": [5, 6]} + ... ] + >>> db = lancedb.connect("./.lancedb") + >>> table = db.create_table("my_table", data) + >>> table.to_pandas() + x vector + 0 1 [1.0, 2.0] + 1 2 [3.0, 4.0] + 2 3 [5.0, 6.0] + >>> table.delete("x = 2") + >>> table.to_pandas() + x vector + 0 1 [1.0, 2.0] + 1 3 [5.0, 6.0] + + If you have a list of values to delete, you can combine them into a + stringified list and use the `IN` operator: + + >>> to_remove = [1, 5] + >>> to_remove = ", ".join([str(v) for v in to_remove]) + >>> to_remove + '1, 5' + >>> table.delete(f"x IN ({to_remove})") + >>> table.to_pandas() + x vector + 0 3 [5.0, 6.0] + """ + raise NotImplementedError + + @abstractmethod + async def update( + self, + where: Optional[str] = None, + values: Optional[dict] = None, + *, + values_sql: Optional[Dict[str, str]] = None, + ): + """ + This can be used to update zero to all rows depending on how many + rows match the where clause. If no where clause is provided, then + all rows will be updated. + + Either `values` or `values_sql` must be provided. You cannot provide + both.
+ + Parameters + ---------- + where: str, optional + The SQL where clause to use when updating rows. For example, 'x = 2' + or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error. + values: dict, optional + The values to update. The keys are the column names and the values + are the values to set. + values_sql: dict, optional + The values to update, expressed as SQL expression strings. These can + reference existing columns. For example, {"x": "x + 1"} will increment + the x column by 1. + + Examples + -------- + >>> import lancedb + >>> import pandas as pd + >>> data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]}) + >>> db = lancedb.connect("./.lancedb") + >>> table = db.create_table("my_table", data) + >>> table.to_pandas() + x vector + 0 1 [1.0, 2.0] + 1 2 [3.0, 4.0] + 2 3 [5.0, 6.0] + >>> table.update(where="x = 2", values={"vector": [10, 10]}) + >>> table.to_pandas() + x vector + 0 1 [1.0, 2.0] + 1 3 [5.0, 6.0] + 2 2 [10.0, 10.0] + >>> table.update(values_sql={"x": "x + 1"}) + >>> table.to_pandas() + x vector + 0 2 [1.0, 2.0] + 1 4 [5.0, 6.0] + 2 3 [10.0, 10.0] + """ + raise NotImplementedError + + @abstractmethod + async def cleanup_old_versions( + self, + older_than: Optional[timedelta] = None, + *, + delete_unverified: bool = False, + ) -> CleanupStats: + """ + Clean up old versions of the table, freeing disk space. + + Note: This function is not available in LanceDB Cloud (since LanceDB + Cloud manages cleanup for you automatically). + + Parameters + ---------- + older_than: timedelta, default None + The minimum age of the version to delete. If None, then this defaults + to two weeks. + delete_unverified: bool, default False + Because they may be part of an in-progress transaction, files newer + than 7 days old are not deleted by default. If you are sure that + there are no in-progress transactions, then you can set this to True + to delete all files older than `older_than`. + + Returns + ------- + CleanupStats + The stats of the cleanup operation, including how many bytes were + freed. + """ + raise NotImplementedError + + @abstractmethod + async def compact_files(self, *args, **kwargs): + """ + Run the compaction process on the table. + + Note: This function is not available in LanceDB Cloud (since LanceDB + Cloud manages compaction for you automatically). + + This can be run after making several small appends to optimize the table + for faster reads. + + Arguments are passed onto :meth:`lance.dataset.DatasetOptimizer.compact_files`. + For most cases, the default should be fine. + """ + raise NotImplementedError + + @abstractmethod + async def add_columns(self, transforms: Dict[str, str]): + """ + Add new columns with defined values. + + This is not yet available in LanceDB Cloud. + + Parameters + ---------- + transforms: Dict[str, str] + A map of column name to a SQL expression to use to calculate the + value of the new column. These expressions will be evaluated for + each row in the table, and can reference existing columns. + """ + raise NotImplementedError + + @abstractmethod + async def alter_columns(self, alterations: Iterable[Dict[str, str]]): + """ + Alter column names and nullability. + + This is not yet available in LanceDB Cloud. + + Parameters + ---------- + alterations : Iterable[Dict[str, Any]] + A sequence of dictionaries, each with the following keys: + - "path": str + The column path to alter. For a top-level column, this is the name. + For a nested column, this is the dot-separated path, e.g. "a.b.c".
+ - "name": str, optional + The new name of the column. If not specified, the column name is + not changed. + - "nullable": bool, optional + Whether the column should be nullable. If not specified, the column + nullability is not changed. Only non-nullable columns can be changed + to nullable. Currently, you cannot change a nullable column to + non-nullable. + """ + raise NotImplementedError + + @abstractmethod + async def drop_columns(self, columns: Iterable[str]): + """ + Drop columns from the table. + + This is not yet available in LanceDB Cloud. + + Parameters + ---------- + columns : Iterable[str] + The names of the columns to drop. + """ + raise NotImplementedError + + +class AsyncLanceTable(AsyncTable): + def __init__(self, table: LanceDBTable): + self._inner = table + + @property + @override + def name(self) -> str: + return self._inner.name() + + @override + async def schema(self) -> pa.Schema: + return await self._inner.schema() + + @override + async def count_rows(self, filter: Optional[str] = None) -> int: + raise NotImplementedError + + async def to_pandas(self) -> "pd.DataFrame": + return self.to_arrow().to_pandas() + + @override + async def to_arrow(self) -> pa.Table: + raise NotImplementedError + + async def create_index( + self, + metric="L2", + num_partitions=256, + num_sub_vectors=96, + vector_column_name: str = VECTOR_COLUMN_NAME, + replace: bool = True, + accelerator: Optional[str] = None, + index_cache_size: Optional[int] = None, + ): + raise NotImplementedError + + @override + async def create_scalar_index( + self, + column: str, + *, + replace: bool = True, + ): + raise NotImplementedError + + @override + async def add( + self, + data: DATA, + mode: str = "append", + on_bad_vectors: str = "error", + fill_value: float = 0.0, + ): + raise NotImplementedError + + def merge_insert(self, on: Union[str, Iterable[str]]) -> LanceMergeInsertBuilder: + on = [on] if isinstance(on, str) else list(on.iter()) + + return LanceMergeInsertBuilder(self, on) + + @override + async def search( + self, + query: Optional[Union[VEC, str, "PIL.Image.Image", Tuple]] = None, + vector_column_name: Optional[str] = None, + query_type: str = "auto", + ) -> LanceQueryBuilder: + raise NotImplementedError + + @override + async def _execute_query(self, query: Query) -> pa.Table: + pass + + @override + async def _do_merge( + self, + merge: LanceMergeInsertBuilder, + new_data: DATA, + on_bad_vectors: str, + fill_value: float, + ): + pass + + @override + async def delete(self, where: str): + raise NotImplementedError + + @override + async def update( + self, + where: Optional[str] = None, + values: Optional[dict] = None, + *, + values_sql: Optional[Dict[str, str]] = None, + ): + raise NotImplementedError + + @override + async def cleanup_old_versions( + self, + older_than: Optional[timedelta] = None, + *, + delete_unverified: bool = False, + ) -> CleanupStats: + raise NotImplementedError + + @override + async def compact_files(self, *args, **kwargs): + raise NotImplementedError + + @override + async def add_columns(self, transforms: Dict[str, str]): + raise NotImplementedError + + @override + async def alter_columns(self, alterations: Iterable[Dict[str, str]]): + raise NotImplementedError + + @override + async def drop_columns(self, columns: Iterable[str]): + raise NotImplementedError diff --git a/python/lancedb/util.py b/python/python/lancedb/util.py similarity index 100% rename from python/lancedb/util.py rename to python/python/lancedb/util.py diff --git a/python/lancedb/utils/__init__.py 
b/python/python/lancedb/utils/__init__.py similarity index 100% rename from python/lancedb/utils/__init__.py rename to python/python/lancedb/utils/__init__.py diff --git a/python/lancedb/utils/config.py b/python/python/lancedb/utils/config.py similarity index 100% rename from python/lancedb/utils/config.py rename to python/python/lancedb/utils/config.py diff --git a/python/lancedb/utils/events.py b/python/python/lancedb/utils/events.py similarity index 100% rename from python/lancedb/utils/events.py rename to python/python/lancedb/utils/events.py diff --git a/python/lancedb/utils/general.py b/python/python/lancedb/utils/general.py similarity index 100% rename from python/lancedb/utils/general.py rename to python/python/lancedb/utils/general.py diff --git a/python/lancedb/utils/sentry_log.py b/python/python/lancedb/utils/sentry_log.py similarity index 100% rename from python/lancedb/utils/sentry_log.py rename to python/python/lancedb/utils/sentry_log.py diff --git a/python/tests/test_cli.py b/python/python/tests/test_cli.py similarity index 99% rename from python/tests/test_cli.py rename to python/python/tests/test_cli.py index f43e0fb3..afc88888 100644 --- a/python/tests/test_cli.py +++ b/python/python/tests/test_cli.py @@ -1,5 +1,4 @@ from click.testing import CliRunner - from lancedb.cli.cli import cli from lancedb.utils import CONFIG diff --git a/python/tests/test_context.py b/python/python/tests/test_context.py similarity index 99% rename from python/tests/test_context.py rename to python/python/tests/test_context.py index 75adb348..bcd1d63c 100644 --- a/python/tests/test_context.py +++ b/python/python/tests/test_context.py @@ -13,7 +13,6 @@ import pandas as pd import pytest - from lancedb.context import contextualize diff --git a/python/tests/test_db.py b/python/python/tests/test_db.py similarity index 79% rename from python/tests/test_db.py rename to python/python/tests/test_db.py index 7b716f22..c66131cf 100644 --- a/python/tests/test_db.py +++ b/python/python/tests/test_db.py @@ -11,12 +11,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
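The schema-evolution trio documented earlier (add_columns, alter_columns, drop_columns) is implemented on the local LanceTable and still raises NotImplementedError on the remote table. A minimal local sketch, mirroring the new tests further below; the table name is illustrative:

```python
import lancedb
import pyarrow as pa

db = lancedb.connect("./.lancedb")
table = db.create_table("evolution_demo", data=pa.table({"id": [0, 1]}))

# Each new column is computed per row from a SQL expression that may
# reference existing columns.
table.add_columns({"id_plus_two": "id + 2"})

# LanceTable accepts "rename" as an alias for the pylance "name" key.
table.alter_columns({"path": "id", "rename": "new_id"})

table.drop_columns(["id_plus_two"])
print(table.to_arrow().column_names)  # ['new_id']
```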
+import lancedb import numpy as np import pandas as pd import pyarrow as pa import pytest - -import lancedb from lancedb.pydantic import LanceModel, Vector @@ -166,6 +165,24 @@ def test_table_names(tmp_path): assert db.table_names() == ["test1", "test2", "test3"] +@pytest.mark.asyncio +async def test_table_names_async(tmp_path): + db = lancedb.connect(tmp_path) + data = pd.DataFrame( + { + "vector": [[3.1, 4.1], [5.9, 26.5]], + "item": ["foo", "bar"], + "price": [10.0, 20.0], + } + ) + db.create_table("test2", data=data) + db.create_table("test1", data=data) + db.create_table("test3", data=data) + + db = await lancedb.connect_async(tmp_path) + assert await db.table_names() == ["test1", "test2", "test3"] + + def test_create_mode(tmp_path): db = lancedb.connect(tmp_path) data = pd.DataFrame( @@ -233,6 +250,78 @@ def test_create_exist_ok(tmp_path): db.create_table("test", schema=bad_schema, exist_ok=True) +@pytest.mark.asyncio +async def test_create_mode_async(tmp_path): + db = await lancedb.connect_async(tmp_path) + data = pd.DataFrame( + { + "vector": [[3.1, 4.1], [5.9, 26.5]], + "item": ["foo", "bar"], + "price": [10.0, 20.0], + } + ) + await db.create_table("test", data=data) + + with pytest.raises(RuntimeError): + await db.create_table("test", data=data) + + new_data = pd.DataFrame( + { + "vector": [[3.1, 4.1], [5.9, 26.5]], + "item": ["fizz", "buzz"], + "price": [10.0, 20.0], + } + ) + _tbl = await db.create_table("test", data=new_data, mode="overwrite") + + # MIGRATION: to_pandas() is not available in async + # assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"] + + +@pytest.mark.asyncio +async def test_create_exist_ok_async(tmp_path): + db = await lancedb.connect_async(tmp_path) + data = pd.DataFrame( + { + "vector": [[3.1, 4.1], [5.9, 26.5]], + "item": ["foo", "bar"], + "price": [10.0, 20.0], + } + ) + tbl = await db.create_table("test", data=data) + + with pytest.raises(RuntimeError): + await db.create_table("test", data=data) + + # open the table but don't add more rows + tbl2 = await db.create_table("test", data=data, exist_ok=True) + assert tbl.name == tbl2.name + assert await tbl.schema() == await tbl2.schema() + + schema = pa.schema( + [ + pa.field("vector", pa.list_(pa.float32(), list_size=2)), + pa.field("item", pa.utf8()), + pa.field("price", pa.float64()), + ] + ) + tbl3 = await db.create_table("test", schema=schema, exist_ok=True) + assert await tbl3.schema() == schema + + # Migration: When creating a table, but the table already exists, but + # the schema is different, it should raise an error. 
+ # bad_schema = pa.schema( + # [ + # pa.field("vector", pa.list_(pa.float32(), list_size=2)), + # pa.field("item", pa.utf8()), + # pa.field("price", pa.float64()), + # pa.field("extra", pa.float32()), + # ] + # ) + # with pytest.raises(ValueError): + # await db.create_table("test", schema=bad_schema, exist_ok=True) + + def test_delete_table(tmp_path): db = lancedb.connect(tmp_path) data = pd.DataFrame( diff --git a/python/tests/test_e2e_remote_db.py b/python/python/tests/test_e2e_remote_db.py similarity index 99% rename from python/tests/test_e2e_remote_db.py rename to python/python/tests/test_e2e_remote_db.py index e9e69c48..f058092e 100644 --- a/python/tests/test_e2e_remote_db.py +++ b/python/python/tests/test_e2e_remote_db.py @@ -13,7 +13,6 @@ import numpy as np import pytest - from lancedb import LanceDBConnection # TODO: setup integ test mark and script diff --git a/python/tests/test_embeddings.py b/python/python/tests/test_embeddings.py similarity index 99% rename from python/tests/test_embeddings.py rename to python/python/tests/test_embeddings.py index 32142a57..af442a16 100644 --- a/python/tests/test_embeddings.py +++ b/python/python/tests/test_embeddings.py @@ -13,11 +13,10 @@ import sys import lance +import lancedb import numpy as np import pyarrow as pa import pytest - -import lancedb from lancedb.conftest import MockTextEmbeddingFunction from lancedb.embeddings import ( EmbeddingFunctionConfig, diff --git a/python/tests/test_embeddings_slow.py b/python/python/tests/test_embeddings_slow.py similarity index 75% rename from python/tests/test_embeddings_slow.py rename to python/python/tests/test_embeddings_slow.py index dff931c1..9dc85bfc 100644 --- a/python/tests/test_embeddings_slow.py +++ b/python/python/tests/test_embeddings_slow.py @@ -14,12 +14,11 @@ import importlib import io import os +import lancedb import numpy as np import pandas as pd import pytest import requests - -import lancedb from lancedb.embeddings import get_registry from lancedb.pydantic import LanceModel, Vector @@ -28,6 +27,23 @@ from lancedb.pydantic import LanceModel, Vector # or connection to external api +try: + if importlib.util.find_spec("mlx.core") is not None: + _mlx = True + else: + _mlx = None +except Exception: + _mlx = None + +try: + if importlib.util.find_spec("imagebind") is not None: + _imagebind = True + else: + _imagebind = None +except Exception: + _imagebind = None + + @pytest.mark.slow @pytest.mark.parametrize("alias", ["sentence-transformers", "openai"]) def test_basic_text_embeddings(alias, tmp_path): @@ -158,6 +174,88 @@ def test_openclip(tmp_path): ) +@pytest.mark.skipif( + _imagebind is None, + reason="skip if imagebind not installed.", +) +@pytest.mark.slow +def test_imagebind(tmp_path): + import os + import shutil + import tempfile + + import lancedb.embeddings.imagebind + import pandas as pd + import requests + from lancedb.embeddings import get_registry + from lancedb.pydantic import LanceModel, Vector + + with tempfile.TemporaryDirectory() as temp_dir: + print(f"Created temporary directory {temp_dir}") + + def download_images(image_uris): + downloaded_image_paths = [] + for uri in image_uris: + try: + response = requests.get(uri, stream=True) + if response.status_code == 200: + # Extract image name from URI + image_name = os.path.basename(uri) + image_path = os.path.join(temp_dir, image_name) + with open(image_path, "wb") as out_file: + shutil.copyfileobj(response.raw, out_file) + downloaded_image_paths.append(image_path) + except Exception as e: # noqa: PERF203 + 
print(f"Failed to download {uri}. Error: {e}") + return temp_dir, downloaded_image_paths + + db = lancedb.connect(tmp_path) + registry = get_registry() + func = registry.get("imagebind").create(max_retries=0) + + class Images(LanceModel): + label: str + image_uri: str = func.SourceField() + vector: Vector(func.ndims()) = func.VectorField() + + table = db.create_table("images", schema=Images) + labels = ["cat", "cat", "dog", "dog", "horse", "horse"] + uris = [ + "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", + "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", + "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", + "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", + "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", + "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", + ] + temp_dir, downloaded_images = download_images(uris) + table.add(pd.DataFrame({"label": labels, "image_uri": downloaded_images})) + # text search + actual = ( + table.search("man's best friend", vector_column_name="vector") + .limit(1) + .to_pydantic(Images)[0] + ) + assert actual.label == "dog" + + # image search + query_image_uri = [ + "https://live.staticflickr.com/65535/33336453970_491665f66e_h.jpg" + ] + temp_dir, downloaded_images = download_images(query_image_uri) + query_image_uri = downloaded_images[0] + actual = ( + table.search(query_image_uri, vector_column_name="vector") + .limit(1) + .to_pydantic(Images)[0] + ) + assert actual.label == "dog" + + if os.path.isdir(temp_dir): + shutil.rmtree(temp_dir) + print(f"Deleted temporary directory {temp_dir}") + + @pytest.mark.slow @pytest.mark.skipif( os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set" @@ -217,13 +315,6 @@ def test_gemini_embedding(tmp_path): assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world" -try: - if importlib.util.find_spec("mlx.core") is not None: - _mlx = True -except ImportError: - _mlx = None - - @pytest.mark.skipif( _mlx is None, reason="mlx tests only required for apple users.", diff --git a/python/tests/test_fts.py b/python/python/tests/test_fts.py similarity index 98% rename from python/tests/test_fts.py rename to python/python/tests/test_fts.py index a62b1b2e..e884d605 100644 --- a/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -14,13 +14,13 @@ import os import random from unittest import mock +import lancedb as ldb import numpy as np import pandas as pd import pytest -import tantivy -import lancedb as ldb -import lancedb.fts +pytest.importorskip("lancedb.fts") +tantivy = pytest.importorskip("tantivy") @pytest.fixture diff --git a/python/tests/test_io.py b/python/python/tests/test_io.py similarity index 99% rename from python/tests/test_io.py rename to python/python/tests/test_io.py index 0629e809..10b749b2 100644 --- a/python/tests/test_io.py +++ b/python/python/tests/test_io.py @@ -13,9 +13,8 @@ import os -import pytest - import lancedb +import pytest # You need to setup AWS credentials an a base path to run this test. 
Example # AWS_PROFILE=default TEST_S3_BASE_URL=s3://my_bucket/dataset pytest tests/test_io.py diff --git a/python/tests/test_pydantic.py b/python/python/tests/test_pydantic.py similarity index 99% rename from python/tests/test_pydantic.py rename to python/python/tests/test_pydantic.py index b37373ee..8f9d335c 100644 --- a/python/tests/test_pydantic.py +++ b/python/python/tests/test_pydantic.py @@ -20,9 +20,8 @@ from typing import List, Optional, Tuple import pyarrow as pa import pydantic import pytest -from pydantic import Field - from lancedb.pydantic import PYDANTIC_VERSION, LanceModel, Vector, pydantic_to_schema +from pydantic import Field @pytest.mark.skipif( diff --git a/python/tests/test_query.py b/python/python/tests/test_query.py similarity index 94% rename from python/tests/test_query.py rename to python/python/tests/test_query.py index cefea0c2..d1a08666 100644 --- a/python/tests/test_query.py +++ b/python/python/tests/test_query.py @@ -18,7 +18,6 @@ import numpy as np import pandas.testing as tm import pyarrow as pa import pytest - from lancedb.db import LanceDBConnection from lancedb.pydantic import LanceModel, Vector from lancedb.query import LanceVectorQueryBuilder, Query @@ -88,13 +87,24 @@ def test_query_builder(table): rs = ( LanceVectorQueryBuilder(table, [0, 0], "vector") .limit(1) - .select(["id"]) + .select(["id", "vector"]) .to_list() ) assert rs[0]["id"] == 1 assert all(np.array(rs[0]["vector"]) == [1, 2]) +def test_dynamic_projection(table): + rs = ( + LanceVectorQueryBuilder(table, [0, 0], "vector") + .limit(1) + .select({"id": "id", "id2": "id * 2"}) + .to_list() + ) + assert rs[0]["id"] == 1 + assert rs[0]["id2"] == 2 + + def test_query_builder_with_filter(table): rs = LanceVectorQueryBuilder(table, [0, 0], "vector").where("id = 2").to_list() assert rs[0]["id"] == 2 diff --git a/python/tests/test_remote_client.py b/python/python/tests/test_remote_client.py similarity index 99% rename from python/tests/test_remote_client.py rename to python/python/tests/test_remote_client.py index 73ebf153..e9a2b19f 100644 --- a/python/tests/test_remote_client.py +++ b/python/python/tests/test_remote_client.py @@ -17,7 +17,6 @@ import pandas as pd import pyarrow as pa import pytest from aiohttp import web - from lancedb.remote.client import RestfulLanceDBClient, VectorQuery diff --git a/python/tests/test_remote_db.py b/python/python/tests/test_remote_db.py similarity index 99% rename from python/tests/test_remote_db.py rename to python/python/tests/test_remote_db.py index f4aff298..a775d5c7 100644 --- a/python/tests/test_remote_db.py +++ b/python/python/tests/test_remote_db.py @@ -11,9 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
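The new test_dynamic_projection above drives LanceVectorQueryBuilder directly; the same mapping form of select() is reachable through the public search API, as in this sketch (assuming table.search() returns the same builder; names are illustrative):

```python
import lancedb

db = lancedb.connect("./.lancedb")
table = db.create_table("projection_demo", data=[{"vector": [1.0, 2.0], "id": 1}])

# A dict maps each output column name to a SQL expression evaluated per row.
rows = (
    table.search([0.0, 0.0])
    .limit(1)
    .select({"id": "id", "id2": "id * 2"})
    .to_list()
)
print(rows[0]["id"], rows[0]["id2"])  # 1 2
```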
-import pyarrow as pa - import lancedb +import pyarrow as pa from lancedb.remote.client import VectorQuery, VectorQueryResult diff --git a/python/tests/test_rerankers.py b/python/python/tests/test_rerankers.py similarity index 99% rename from python/tests/test_rerankers.py rename to python/python/tests/test_rerankers.py index 2436adcd..752aaa49 100644 --- a/python/tests/test_rerankers.py +++ b/python/python/tests/test_rerankers.py @@ -1,9 +1,8 @@ import os +import lancedb import numpy as np import pytest - -import lancedb from lancedb.conftest import MockTextEmbeddingFunction # noqa from lancedb.embeddings import EmbeddingFunctionRegistry from lancedb.pydantic import LanceModel, Vector @@ -15,6 +14,9 @@ from lancedb.rerankers import ( ) from lancedb.table import LanceTable +# Tests rely on FTS index +pytest.importorskip("lancedb.fts") + def get_test_table(tmp_path): db = lancedb.connect(tmp_path) diff --git a/python/tests/test_table.py b/python/python/tests/test_table.py similarity index 96% rename from python/tests/test_table.py rename to python/python/tests/test_table.py index 9282a5c6..8b3029e0 100644 --- a/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -20,19 +20,18 @@ from typing import List from unittest.mock import PropertyMock, patch import lance +import lancedb import numpy as np import pandas as pd import polars as pl import pyarrow as pa import pytest -from pydantic import BaseModel - -import lancedb from lancedb.conftest import MockTextEmbeddingFunction from lancedb.db import LanceDBConnection from lancedb.embeddings import EmbeddingFunctionConfig, EmbeddingFunctionRegistry from lancedb.pydantic import LanceModel, Vector from lancedb.table import LanceTable +from pydantic import BaseModel class MockDB: @@ -804,6 +803,9 @@ def test_count_rows(db): def test_hybrid_search(db, tmp_path): + # This test uses an FTS index + pytest.importorskip("lancedb.fts") + db = MockDB(str(tmp_path)) # Create a LanceDB table schema with a vector and a text column emb = EmbeddingFunctionRegistry.get_instance().get("test")() @@ -898,3 +900,29 @@ def test_restore_consistency(tmp_path): table.add([{"id": 2}]) assert table_fixed.version == table.version - 1 assert table_ref_latest.version == table.version + + +# Schema evolution +def test_add_columns(tmp_path): + db = lancedb.connect(tmp_path) + data = pa.table({"id": [0, 1]}) + table = LanceTable.create(db, "my_table", data=data) + table.add_columns({"new_col": "id + 2"}) + assert table.to_arrow().column_names == ["id", "new_col"] + assert table.to_arrow()["new_col"].to_pylist() == [2, 3] + + +def test_alter_columns(tmp_path): + db = lancedb.connect(tmp_path) + data = pa.table({"id": [0, 1]}) + table = LanceTable.create(db, "my_table", data=data) + table.alter_columns({"path": "id", "rename": "new_id"}) + assert table.to_arrow().column_names == ["new_id"] + + +def test_drop_columns(tmp_path): + db = lancedb.connect(tmp_path) + data = pa.table({"id": [0, 1], "category": ["a", "b"]}) + table = LanceTable.create(db, "my_table", data=data) + table.drop_columns(["category"]) + assert table.to_arrow().column_names == ["id"] diff --git a/python/tests/test_telemetry.py b/python/python/tests/test_telemetry.py similarity index 99% rename from python/tests/test_telemetry.py rename to python/python/tests/test_telemetry.py index b8923408..d2865e7e 100644 --- a/python/tests/test_telemetry.py +++ b/python/python/tests/test_telemetry.py @@ -1,8 +1,7 @@ import json -import pytest - import lancedb +import pytest from lancedb.utils.events import 
_Events diff --git a/python/tests/test_util.py b/python/python/tests/test_util.py similarity index 99% rename from python/tests/test_util.py rename to python/python/tests/test_util.py index 1bf3e693..fa7e75f0 100644 --- a/python/tests/test_util.py +++ b/python/python/tests/test_util.py @@ -15,7 +15,6 @@ import os import pathlib import pytest - from lancedb.util import get_uri_scheme, join_uri diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index bcabfa2f..00000000 --- a/python/setup.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2023 LanceDB Developers -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import setuptools - -if __name__ == "__main__": - setuptools.setup() diff --git a/python/src/connection.rs b/python/src/connection.rs new file mode 100644 index 00000000..1f0fa759 --- /dev/null +++ b/python/src/connection.rs @@ -0,0 +1,125 @@ +// Copyright 2024 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
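The connection.rs that follows exposes a connect() pyfunction with optional cloud settings and a read-consistency interval taken as seconds (converted with Duration::from_secs_f64). Assuming the Python-side connect_async wrapper forwards these keyword arguments unchanged, usage would look like this sketch:

```python
import asyncio

import lancedb


async def main():
    # Re-check for updates from other writers at most every 5 seconds;
    # omit the argument to keep the default consistency behavior.
    db = await lancedb.connect_async(
        "./.lancedb",
        read_consistency_interval=5.0,
    )
    print(await db.table_names())


asyncio.run(main())
```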
+ +use std::{sync::Arc, time::Duration}; + +use arrow::{datatypes::Schema, ffi_stream::ArrowArrayStreamReader, pyarrow::FromPyArrow}; +use lancedb::connection::{Connection as LanceConnection, CreateTableMode}; +use pyo3::{ + exceptions::PyValueError, pyclass, pyfunction, pymethods, PyAny, PyRef, PyResult, Python, +}; +use pyo3_asyncio::tokio::future_into_py; + +use crate::{error::PythonErrorExt, table::Table}; + +#[pyclass] +pub struct Connection { + inner: LanceConnection, +} + +impl Connection { + fn parse_create_mode_str(mode: &str) -> PyResult<CreateTableMode> { + match mode { + "create" => Ok(CreateTableMode::Create), + "overwrite" => Ok(CreateTableMode::Overwrite), + "exist_ok" => Ok(CreateTableMode::exist_ok(|builder| builder)), + _ => Err(PyValueError::new_err(format!("Invalid mode {}", mode))), + } + } +} + +#[pymethods] +impl Connection { + pub fn table_names(self_: PyRef<'_, Self>) -> PyResult<&PyAny> { + let inner = self_.inner.clone(); + future_into_py(self_.py(), async move { + inner.table_names().await.infer_error() + }) + } + + pub fn create_table<'a>( + self_: PyRef<'a, Self>, + name: String, + mode: &str, + data: &PyAny, + ) -> PyResult<&'a PyAny> { + let inner = self_.inner.clone(); + + let mode = Self::parse_create_mode_str(mode)?; + + let batches = Box::new(ArrowArrayStreamReader::from_pyarrow(data)?); + future_into_py(self_.py(), async move { + let table = inner + .create_table(name, batches) + .mode(mode) + .execute() + .await + .infer_error()?; + Ok(Table::new(table)) + }) + } + + pub fn create_empty_table<'a>( + self_: PyRef<'a, Self>, + name: String, + mode: &str, + schema: &PyAny, + ) -> PyResult<&'a PyAny> { + let inner = self_.inner.clone(); + + let mode = Self::parse_create_mode_str(mode)?; + + let schema = Schema::from_pyarrow(schema)?; + + future_into_py(self_.py(), async move { + let table = inner + .create_empty_table(name, Arc::new(schema)) + .mode(mode) + .execute() + .await + .infer_error()?; + Ok(Table::new(table)) + }) + } +} + +#[pyfunction] +pub fn connect( + py: Python, + uri: String, + api_key: Option<String>, + region: Option<String>, + host_override: Option<String>, + read_consistency_interval: Option<f64>, +) -> PyResult<&PyAny> { + future_into_py(py, async move { + let mut builder = lancedb::connect(&uri); + if let Some(api_key) = api_key { + builder = builder.api_key(&api_key); + } + if let Some(region) = region { + builder = builder.region(&region); + } + if let Some(host_override) = host_override { + builder = builder.host_override(&host_override); + } + if let Some(read_consistency_interval) = read_consistency_interval { + let read_consistency_interval = Duration::from_secs_f64(read_consistency_interval); + builder = builder.read_consistency_interval(read_consistency_interval); + } + Ok(Connection { + inner: builder.execute().await.infer_error()?, + }) + }) +} diff --git a/python/src/error.rs b/python/src/error.rs new file mode 100644 index 00000000..67add21b --- /dev/null +++ b/python/src/error.rs @@ -0,0 +1,64 @@ +// Copyright 2024 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use pyo3::{ + exceptions::{PyOSError, PyRuntimeError, PyValueError}, + PyResult, +}; + +use lancedb::error::Error as LanceError; + +pub trait PythonErrorExt<T> { + /// Convert to a python error based on the Lance error type + fn infer_error(self) -> PyResult<T>; + /// Convert to OSError + fn os_error(self) -> PyResult<T>; + /// Convert to RuntimeError + fn runtime_error(self) -> PyResult<T>; + /// Convert to ValueError + fn value_error(self) -> PyResult<T>; +} + +impl<T> PythonErrorExt<T> for std::result::Result<T, LanceError> { + fn infer_error(self) -> PyResult<T> { + match &self { + Ok(_) => Ok(self.unwrap()), + Err(err) => match err { + LanceError::InvalidInput { .. } => self.value_error(), + LanceError::InvalidTableName { .. } => self.value_error(), + LanceError::TableNotFound { .. } => self.value_error(), + LanceError::Schema { .. } => self.value_error(), + LanceError::CreateDir { .. } => self.os_error(), + LanceError::TableAlreadyExists { .. } => self.runtime_error(), + LanceError::Store { .. } => self.runtime_error(), + LanceError::Lance { .. } => self.runtime_error(), + LanceError::Runtime { .. } => self.runtime_error(), + LanceError::Http { .. } => self.runtime_error(), + LanceError::Arrow { .. } => self.runtime_error(), + }, + } + } + + fn os_error(self) -> PyResult<T> { + self.map_err(|err| PyOSError::new_err(err.to_string())) + } + + fn runtime_error(self) -> PyResult<T> { + self.map_err(|err| PyRuntimeError::new_err(err.to_string())) + } + + fn value_error(self) -> PyResult<T> { + self.map_err(|err| PyValueError::new_err(err.to_string())) + } +} diff --git a/python/src/lib.rs b/python/src/lib.rs new file mode 100644 index 00000000..fa2f5fc4 --- /dev/null +++ b/python/src/lib.rs @@ -0,0 +1,33 @@ +// Copyright 2024 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
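PythonErrorExt above is what decides which Python exception each LanceError surfaces as, and the async tests in this diff lean on that mapping (a duplicate create_table raises RuntimeError via TableAlreadyExists). A sketch of what callers can expect to catch; the ValueError branch assumes the wrapper passes the mode string through to parse_create_mode_str unchanged:

```python
import asyncio

import lancedb


async def main():
    db = await lancedb.connect_async("./.lancedb")
    await db.create_table("t", data=[{"vector": [1.0, 2.0]}])
    try:
        # LanceError::TableAlreadyExists is mapped to PyRuntimeError.
        await db.create_table("t", data=[{"vector": [1.0, 2.0]}])
    except RuntimeError as err:
        print("already exists:", err)
    try:
        # parse_create_mode_str rejects unknown modes with PyValueError.
        await db.create_table("t2", data=[{"vector": [1.0, 2.0]}], mode="bogus")
    except ValueError as err:
        print("bad mode:", err)


asyncio.run(main())
```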
+ +use connection::{connect, Connection}; +use env_logger::Env; +use pyo3::{pymodule, types::PyModule, wrap_pyfunction, PyResult, Python}; + +pub mod connection; +pub mod error; +pub mod table; + +#[pymodule] +pub fn _lancedb(_py: Python, m: &PyModule) -> PyResult<()> { + let env = Env::new() + .filter_or("LANCEDB_LOG", "warn") + .write_style("LANCEDB_LOG_STYLE"); + env_logger::init_from_env(env); + m.add_class::<Connection>()?; + m.add_function(wrap_pyfunction!(connect, m)?)?; + m.add("__version__", env!("CARGO_PKG_VERSION"))?; + Ok(()) +} diff --git a/python/src/table.rs b/python/src/table.rs new file mode 100644 index 00000000..23bda9a3 --- /dev/null +++ b/python/src/table.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use arrow::pyarrow::ToPyArrow; +use lancedb::table::Table as LanceTable; +use pyo3::{pyclass, pymethods, PyAny, PyRef, PyResult, Python}; +use pyo3_asyncio::tokio::future_into_py; + +use crate::error::PythonErrorExt; + +#[pyclass] +pub struct Table { + inner: Arc<dyn LanceTable>, +} + +impl Table { + pub(crate) fn new(inner: Arc<dyn LanceTable>) -> Self { + Self { inner } + } +} + +#[pymethods] +impl Table { + pub fn name(&self) -> String { + self.inner.name().to_string() + } + + pub fn schema(self_: PyRef<'_, Self>) -> PyResult<&PyAny> { + let inner = self_.inner.clone(); + future_into_py(self_.py(), async move { + let schema = inner.schema().await.infer_error()?; + Python::with_gil(|py| schema.to_pyarrow(py)) + }) + } +} diff --git a/rust/ffi/node/Cargo.toml b/rust/ffi/node/Cargo.toml index 51bb639d..a129f6bc 100644 --- a/rust/ffi/node/Cargo.toml +++ b/rust/ffi/node/Cargo.toml @@ -1,6 +1,6 @@ [package] -name = "vectordb-node" -version = "0.4.10" +name = "lancedb-node" +version = "0.4.11" description = "Serverless, low-latency vector database for AI applications" license.workspace = true edition.workspace = true @@ -24,9 +24,14 @@ half = { workspace = true } lance = { workspace = true } lance-index = { workspace = true } lance-linalg = { workspace = true } -vectordb = { path = "../../vectordb" } +lancedb = { path = "../../lancedb" } tokio = { version = "1.23", features = ["rt-multi-thread"] } -neon = {version = "0.10.1", default-features = false, features = ["channel-api", "napi-6", "promise-api", "task-api"] } +neon = { version = "0.10.1", default-features = false, features = [ + "channel-api", + "napi-6", + "promise-api", + "task-api", +] } object_store = { workspace = true, features = ["aws"] } snafu = { workspace = true } async-trait = "0" diff --git a/rust/ffi/node/README.md b/rust/ffi/node/README.md index 6cccc952..0b9493d6 100644 --- a/rust/ffi/node/README.md +++ b/rust/ffi/node/README.md @@ -1,3 +1,3 @@ -The LanceDB node bridge (vectordb-node) allows javascript applications to access LanceDB datasets. +The LanceDB node bridge (lancedb-node) allows javascript applications to access LanceDB datasets. It is built using [Neon](https://neon-bindings.com).
See the node project for an example of how it is used / tests diff --git a/rust/ffi/node/src/error.rs b/rust/ffi/node/src/error.rs index ce447f02..ae165c12 100644 --- a/rust/ffi/node/src/error.rs +++ b/rust/ffi/node/src/error.rs @@ -34,8 +34,8 @@ pub enum Error { pub type Result<T> = std::result::Result<T, Error>; -impl From<vectordb::error::Error> for Error { - fn from(e: vectordb::error::Error) -> Self { +impl From<lancedb::error::Error> for Error { + fn from(e: lancedb::error::Error) -> Self { Self::LanceDB { message: e.to_string(), } diff --git a/rust/ffi/node/src/index/scalar.rs b/rust/ffi/node/src/index/scalar.rs index c9c743fe..6605364c 100644 --- a/rust/ffi/node/src/index/scalar.rs +++ b/rust/ffi/node/src/index/scalar.rs @@ -19,7 +19,7 @@ use neon::{ }; use crate::{error::ResultExt, runtime, table::JsTable}; -use vectordb::Table; +use lancedb::Table; pub fn table_create_scalar_index(mut cx: FunctionContext) -> JsResult<JsPromise> { let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?; diff --git a/rust/ffi/node/src/index/vector.rs b/rust/ffi/node/src/index/vector.rs index 7185c892..4fb559dd 100644 --- a/rust/ffi/node/src/index/vector.rs +++ b/rust/ffi/node/src/index/vector.rs @@ -13,10 +13,10 @@ // limitations under the License. use lance_linalg::distance::MetricType; +use lancedb::index::IndexBuilder; use neon::context::FunctionContext; use neon::prelude::*; use std::convert::TryFrom; -use vectordb::index::IndexBuilder; use crate::error::Error::InvalidIndexType; use crate::error::ResultExt; diff --git a/rust/ffi/node/src/lib.rs b/rust/ffi/node/src/lib.rs index 41212030..070e2afc 100644 --- a/rust/ffi/node/src/lib.rs +++ b/rust/ffi/node/src/lib.rs @@ -22,9 +22,9 @@ use object_store::CredentialProvider; use once_cell::sync::OnceCell; use tokio::runtime::Runtime; -use vectordb::connection::Database; -use vectordb::table::ReadParams; -use vectordb::{ConnectOptions, Connection}; +use lancedb::connect; +use lancedb::connection::Connection; +use lancedb::table::ReadParams; use crate::error::ResultExt; use crate::query::JsQuery; @@ -39,7 +39,7 @@ mod query; mod table; struct JsDatabase { - database: Arc<Database>, + database: Connection, } impl Finalize for JsDatabase {} @@ -84,28 +84,36 @@ fn database_new(mut cx: FunctionContext) -> JsResult<JsPromise> { let path = cx.argument::<JsString>(0)?.value(&mut cx); let aws_creds = get_aws_creds(&mut cx, 1)?; let region = get_aws_region(&mut cx, 4)?; + let read_consistency_interval = cx + .argument_opt(5) + .and_then(|arg| arg.downcast::<JsNumber, _>(&mut cx).ok()) + .map(|v| v.value(&mut cx)) + .map(std::time::Duration::from_secs_f64); let rt = runtime(&mut cx)?; let channel = cx.channel(); let (deferred, promise) = cx.promise(); - let mut conn_options = ConnectOptions::new(&path); + let mut conn_builder = connect(&path); if let Some(region) = region { - conn_options = conn_options.region(&region); + conn_builder = conn_builder.region(&region); } if let Some(aws_creds) = aws_creds { - conn_options = conn_options.aws_creds(AwsCredential { + conn_builder = conn_builder.aws_creds(AwsCredential { key_id: aws_creds.key_id, secret_key: aws_creds.secret_key, token: aws_creds.token, }); } + if let Some(interval) = read_consistency_interval { + conn_builder = conn_builder.read_consistency_interval(interval); + } rt.spawn(async move { - let database = Database::connect_with_options(&conn_options).await; + let database = conn_builder.execute().await; deferred.settle_with(&channel, move |mut cx| { let db = JsDatabase { - database: Arc::new(database.or_throw(&mut cx)?), + database: database.or_throw(&mut cx)?, }; Ok(cx.boxed(db)) }); @@ -217,7 +225,11 @@ fn
database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> { let (deferred, promise) = cx.promise(); rt.spawn(async move { - let table_rst = database.open_table_with_params(&table_name, params).await; + let table_rst = database + .open_table(&table_name) + .lance_read_params(params) + .execute() + .await; deferred.settle_with(&channel, move |mut cx| { let js_table = JsTable::from(table_rst.or_throw(&mut cx)?); @@ -274,5 +286,8 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> { index::vector::table_create_vector_index, )?; cx.export_function("tableSchema", JsTable::js_schema)?; + cx.export_function("tableAddColumns", JsTable::js_add_columns)?; + cx.export_function("tableAlterColumns", JsTable::js_alter_columns)?; + cx.export_function("tableDropColumns", JsTable::js_drop_columns)?; Ok(()) } diff --git a/rust/ffi/node/src/query.rs b/rust/ffi/node/src/query.rs index c9ba7b8b..6b63593f 100644 --- a/rust/ffi/node/src/query.rs +++ b/rust/ffi/node/src/query.rs @@ -93,7 +93,7 @@ impl JsQuery { .and_then(|stream| { stream .try_collect::<Vec<_>>() - .map_err(vectordb::error::Error::from) + .map_err(lancedb::error::Error::from) }) .await; diff --git a/rust/ffi/node/src/table.rs b/rust/ffi/node/src/table.rs index bb6bbfea..c687f849 100644 --- a/rust/ffi/node/src/table.rs +++ b/rust/ffi/node/src/table.rs @@ -16,14 +16,14 @@ use std::ops::Deref; use arrow_array::{RecordBatch, RecordBatchIterator}; use lance::dataset::optimize::CompactionOptions; -use lance::dataset::{WriteMode, WriteParams}; +use lance::dataset::{ColumnAlteration, NewColumnTransform, WriteMode, WriteParams}; use lance::io::ObjectStoreParams; -use vectordb::table::OptimizeAction; +use lancedb::table::{AddDataOptions, OptimizeAction, WriteOptions}; use crate::arrow::{arrow_buffer_to_record_batch, record_batch_to_buffer}; +use lancedb::TableRef; use neon::prelude::*; use neon::types::buffer::TypedArray; -use vectordb::TableRef; use crate::error::ResultExt; use crate::{convert, get_aws_credential_provider, get_aws_region, runtime, JsDatabase}; @@ -80,7 +84,11 @@ impl JsTable { rt.spawn(async move { let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); let table_rst = database - .create_table(&table_name, Box::new(batch_reader), Some(params)) + .create_table(&table_name, Box::new(batch_reader)) + .write_options(WriteOptions { + lance_write_params: Some(params), + }) + .execute() + .await; deferred.settle_with(&channel, move |mut cx| { @@ -121,7 +125,13 @@ impl JsTable { rt.spawn(async move { let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - let add_result = table.add(Box::new(batch_reader), Some(params)).await; + let opts = AddDataOptions { + write_options: WriteOptions { + lance_write_params: Some(params), + }, + ..Default::default() + }; + let add_result = table.add(Box::new(batch_reader), opts).await; deferred.settle_with(&channel, move |mut cx| { add_result.or_throw(&mut cx)?; @@ -524,8 +534,9 @@ impl JsTable { .value(&mut cx); rt.spawn(async move { + let schema = table.schema().await; deferred.settle_with(&channel, move |mut cx| { - let schema = table.schema(); + let schema = schema.or_throw(&mut cx)?; let batches = vec![RecordBatch::new_empty(schema)]; let buffer = record_batch_to_buffer(batches).or_throw(&mut cx)?; convert::new_js_buffer(buffer, &mut cx, is_electron) }); }); Ok(promise) } + + pub(crate) fn js_add_columns(mut cx: FunctionContext) -> JsResult<JsPromise> { + let expressions = cx + .argument::<JsArray>(0)? + .to_vec(&mut cx)?
+ .into_iter() + .map(|val| { + let obj = val.downcast_or_throw::(&mut cx)?; + let name = obj.get::(&mut cx, "name")?.value(&mut cx); + let sql = obj + .get::(&mut cx, "valueSql")? + .value(&mut cx); + Ok((name, sql)) + }) + .collect::>>()?; + + let transforms = NewColumnTransform::SqlExpressions(expressions); + + let js_table = cx.this().downcast_or_throw::, _>(&mut cx)?; + let rt = runtime(&mut cx)?; + + let (deferred, promise) = cx.promise(); + let channel = cx.channel(); + let table = js_table.table.clone(); + + rt.spawn(async move { + let result = table.add_columns(transforms, None).await; + deferred.settle_with(&channel, move |mut cx| { + result.or_throw(&mut cx)?; + Ok(cx.undefined()) + }) + }); + + Ok(promise) + } + + pub(crate) fn js_alter_columns(mut cx: FunctionContext) -> JsResult { + let alterations = cx + .argument::(0)? + .to_vec(&mut cx)? + .into_iter() + .map(|val| { + let obj = val.downcast_or_throw::(&mut cx)?; + let path = obj.get::(&mut cx, "path")?.value(&mut cx); + let rename = obj + .get_opt::(&mut cx, "rename")? + .map(|val| val.value(&mut cx)); + let nullable = obj + .get_opt::(&mut cx, "nullable")? + .map(|val| val.value(&mut cx)); + // TODO: support data type here. Will need to do some serialization/deserialization + + if rename.is_none() && nullable.is_none() { + return cx.throw_error("At least one of 'name' or 'nullable' must be provided"); + } + + Ok(ColumnAlteration { + path, + rename, + nullable, + // TODO: wire up this field + data_type: None, + }) + }) + .collect::>>()?; + + let js_table = cx.this().downcast_or_throw::, _>(&mut cx)?; + let rt = runtime(&mut cx)?; + + let (deferred, promise) = cx.promise(); + let channel = cx.channel(); + let table = js_table.table.clone(); + + rt.spawn(async move { + let result = table.alter_columns(&alterations).await; + deferred.settle_with(&channel, move |mut cx| { + result.or_throw(&mut cx)?; + Ok(cx.undefined()) + }) + }); + + Ok(promise) + } + + pub(crate) fn js_drop_columns(mut cx: FunctionContext) -> JsResult { + let columns = cx + .argument::(0)? + .to_vec(&mut cx)? + .into_iter() + .map(|val| { + Ok(val + .downcast_or_throw::(&mut cx)? 
+ .value(&mut cx)) + }) + .collect::>>()?; + + let js_table = cx.this().downcast_or_throw::, _>(&mut cx)?; + let rt = runtime(&mut cx)?; + + let (deferred, promise) = cx.promise(); + let channel = cx.channel(); + let table = js_table.table.clone(); + + rt.spawn(async move { + let col_refs = columns.iter().map(|s| s.as_str()).collect::>(); + let result = table.drop_columns(&col_refs).await; + deferred.settle_with(&channel, move |mut cx| { + result.or_throw(&mut cx)?; + Ok(cx.undefined()) + }) + }); + + Ok(promise) + } } diff --git a/rust/vectordb/Cargo.toml b/rust/lancedb/Cargo.toml similarity index 81% rename from rust/vectordb/Cargo.toml rename to rust/lancedb/Cargo.toml index f5c57f9e..3ca835ab 100644 --- a/rust/vectordb/Cargo.toml +++ b/rust/lancedb/Cargo.toml @@ -1,6 +1,6 @@ [package] -name = "vectordb" -version = "0.4.10" +name = "lancedb" +version = "0.4.11" edition.workspace = true description = "LanceDB: A serverless, low-latency vector database for AI applications" license.workspace = true @@ -31,11 +31,20 @@ async-trait = "0" bytes = "1" futures.workspace = true num-traits.workspace = true -url = { workspace = true } +url.workspace = true serde = { version = "^1" } serde_json = { version = "1" } +# For remote feature + +reqwest = { version = "0.11.24", features = ["gzip", "json"], optional = true } + [dev-dependencies] tempfile = "3.5.0" rand = { version = "0.8.3", features = ["small_rng"] } +uuid = { version = "1.7.0", features = ["v4"] } walkdir = "2" + +[features] +default = ["remote"] +remote = ["dep:reqwest"] diff --git a/rust/vectordb/README.md b/rust/lancedb/README.md similarity index 100% rename from rust/vectordb/README.md rename to rust/lancedb/README.md diff --git a/rust/vectordb/examples/simple.rs b/rust/lancedb/examples/simple.rs similarity index 85% rename from rust/vectordb/examples/simple.rs rename to rust/lancedb/examples/simple.rs index 947c6952..a09eca97 100644 --- a/rust/vectordb/examples/simple.rs +++ b/rust/lancedb/examples/simple.rs @@ -19,8 +19,9 @@ use arrow_array::{FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterat use arrow_schema::{DataType, Field, Schema}; use futures::TryStreamExt; -use vectordb::Connection; -use vectordb::{connect, Result, Table, TableRef}; +use lancedb::connection::Connection; +use lancedb::table::AddDataOptions; +use lancedb::{connect, Result, Table, TableRef}; #[tokio::main] async fn main() -> Result<()> { @@ -29,18 +30,18 @@ async fn main() -> Result<()> { } // --8<-- [start:connect] let uri = "data/sample-lancedb"; - let db = connect(uri).await?; + let db = connect(uri).execute().await?; // --8<-- [end:connect] // --8<-- [start:list_names] println!("{:?}", db.table_names().await?); // --8<-- [end:list_names] - let tbl = create_table(db.clone()).await?; + let tbl = create_table(&db).await?; create_index(tbl.as_ref()).await?; let batches = search(tbl.as_ref()).await?; println!("{:?}", batches); - create_empty_table(db.clone()).await.unwrap(); + create_empty_table(&db).await.unwrap(); // --8<-- [start:delete] tbl.delete("id > 24").await.unwrap(); @@ -55,17 +56,14 @@ async fn main() -> Result<()> { #[allow(dead_code)] async fn open_with_existing_tbl() -> Result<()> { let uri = "data/sample-lancedb"; - let db = connect(uri).await?; + let db = connect(uri).execute().await?; // --8<-- [start:open_with_existing_file] - let _ = db - .open_table_with_params("my_table", Default::default()) - .await - .unwrap(); + let _ = db.open_table("my_table").execute().await.unwrap(); // --8<-- [end:open_with_existing_file] Ok(()) } -async 
fn create_table(db: Arc) -> Result { +async fn create_table(db: &Connection) -> Result { // --8<-- [start:create_table] const TOTAL: usize = 1000; const DIM: usize = 128; @@ -102,7 +100,8 @@ async fn create_table(db: Arc) -> Result { schema.clone(), ); let tbl = db - .create_table("my_table", Box::new(batches), None) + .create_table("my_table", Box::new(batches)) + .execute() .await .unwrap(); // --8<-- [end:create_table] @@ -126,21 +125,21 @@ async fn create_table(db: Arc) -> Result { schema.clone(), ); // --8<-- [start:add] - tbl.add(Box::new(new_batches), None).await.unwrap(); + tbl.add(Box::new(new_batches), AddDataOptions::default()) + .await + .unwrap(); // --8<-- [end:add] Ok(tbl) } -async fn create_empty_table(db: Arc) -> Result { +async fn create_empty_table(db: &Connection) -> Result { // --8<-- [start:create_empty_table] let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new("item", DataType::Utf8, true), ])); - let batches = RecordBatchIterator::new(vec![], schema.clone()); - db.create_table("empty_table", Box::new(batches), None) - .await + db.create_empty_table("empty_table", schema).execute().await // --8<-- [end:create_empty_table] } diff --git a/rust/vectordb/src/arrow.rs b/rust/lancedb/src/arrow.rs similarity index 100% rename from rust/vectordb/src/arrow.rs rename to rust/lancedb/src/arrow.rs diff --git a/rust/lancedb/src/connection.rs b/rust/lancedb/src/connection.rs new file mode 100644 index 00000000..37b663f4 --- /dev/null +++ b/rust/lancedb/src/connection.rs @@ -0,0 +1,813 @@ +// Copyright 2023 LanceDB Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! LanceDB Database + +use std::fs::create_dir_all; +use std::path::Path; +use std::sync::Arc; + +use arrow_array::{RecordBatchIterator, RecordBatchReader}; +use arrow_schema::SchemaRef; +use lance::dataset::{ReadParams, WriteMode}; +use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore}; +use object_store::{ + aws::AwsCredential, local::LocalFileSystem, CredentialProvider, StaticCredentialProvider, +}; +use snafu::prelude::*; + +use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result}; +use crate::io::object_store::MirroringObjectStoreWrapper; +use crate::table::{NativeTable, TableRef, WriteOptions}; + +pub const LANCE_FILE_EXTENSION: &str = "lance"; + +pub type TableBuilderCallback = Box OpenTableBuilder + Send>; + +/// Describes what happens when creating a table and a table with +/// the same name already exists +pub enum CreateTableMode { + /// If the table already exists, an error is returned + Create, + /// If the table already exists, it is opened. Any provided data is + /// ignored. 
The function will be passed an OpenTableBuilder to customize
+    /// how the table is opened
+    ExistOk(TableBuilderCallback),
+    /// If the table already exists, it is overwritten
+    Overwrite,
+}
+
+impl CreateTableMode {
+    pub fn exist_ok(
+        callback: impl FnOnce(OpenTableBuilder) -> OpenTableBuilder + Send + 'static,
+    ) -> Self {
+        Self::ExistOk(Box::new(callback))
+    }
+}
+
+impl Default for CreateTableMode {
+    fn default() -> Self {
+        Self::Create
+    }
+}
+
+/// Describes what happens when a vector either contains NaN or
+/// does not have enough values
+#[derive(Clone, Debug, Default)]
+enum BadVectorHandling {
+    /// An error is returned
+    #[default]
+    Error,
+    #[allow(dead_code)] // https://github.com/lancedb/lancedb/issues/992
+    /// The offending row is dropped
+    Drop,
+    #[allow(dead_code)] // https://github.com/lancedb/lancedb/issues/992
+    /// The invalid/missing items are replaced by fill_value
+    Fill(f32),
+}
+
+/// A builder for configuring a [`Connection::create_table`] operation
+pub struct CreateTableBuilder<const HAS_DATA: bool> {
+    parent: Arc<dyn ConnectionInternal>,
+    pub(crate) name: String,
+    pub(crate) data: Option<Box<dyn RecordBatchReader + Send>>,
+    pub(crate) schema: Option<SchemaRef>,
+    pub(crate) mode: CreateTableMode,
+    pub(crate) write_options: WriteOptions,
+}
+
+// Builder methods that only apply when we have initial data
+impl CreateTableBuilder<true> {
+    fn new(
+        parent: Arc<dyn ConnectionInternal>,
+        name: String,
+        data: Box<dyn RecordBatchReader + Send>,
+    ) -> Self {
+        Self {
+            parent,
+            name,
+            data: Some(data),
+            schema: None,
+            mode: CreateTableMode::default(),
+            write_options: WriteOptions::default(),
+        }
+    }
+
+    /// Apply the given write options when writing the initial data
+    pub fn write_options(mut self, write_options: WriteOptions) -> Self {
+        self.write_options = write_options;
+        self
+    }
+
+    /// Execute the create table operation
+    pub async fn execute(self) -> Result<TableRef> {
+        self.parent.clone().do_create_table(self).await
+    }
+}
+
+// Builder methods that only apply when we do not have initial data
+impl CreateTableBuilder<false> {
+    fn new(parent: Arc<dyn ConnectionInternal>, name: String, schema: SchemaRef) -> Self {
+        Self {
+            parent,
+            name,
+            data: None,
+            schema: Some(schema),
+            mode: CreateTableMode::default(),
+            write_options: WriteOptions::default(),
+        }
+    }
+
+    /// Execute the create table operation
+    pub async fn execute(self) -> Result<TableRef> {
+        self.parent.clone().do_create_empty_table(self).await
+    }
+}
+
+impl<const HAS_DATA: bool> CreateTableBuilder<HAS_DATA> {
+    /// Set the mode for creating the table
+    ///
+    /// This controls what happens if a table with the given name already exists
+    pub fn mode(mut self, mode: CreateTableMode) -> Self {
+        self.mode = mode;
+        self
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct OpenTableBuilder {
+    parent: Arc<dyn ConnectionInternal>,
+    name: String,
+    index_cache_size: u32,
+    lance_read_params: Option<ReadParams>,
+}
+
+impl OpenTableBuilder {
+    fn new(parent: Arc<dyn ConnectionInternal>, name: String) -> Self {
+        Self {
+            parent,
+            name,
+            index_cache_size: 256,
+            lance_read_params: None,
+        }
+    }
+
+    /// Set the size of the index cache, specified as a number of entries
+    ///
+    /// The default value is 256
+    ///
+    /// The exact meaning of an "entry" will depend on the type of index:
+    /// * IVF - there is one entry for each IVF partition
+    /// * BTREE - there is one entry for the entire index
+    ///
+    /// This cache applies to the entire opened table, across all indices.
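+    /// (For example, a table whose only index is an IVF index with 256
+    /// partitions can occupy up to 256 entries in this cache on its own.)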
+    /// Setting this value higher will increase performance on larger datasets
+    /// at the expense of more RAM
+    pub fn index_cache_size(mut self, index_cache_size: u32) -> Self {
+        self.index_cache_size = index_cache_size;
+        self
+    }
+
+    /// Advanced parameters that can be used to customize table reads
+    ///
+    /// If set, these will take precedence over any overlapping options set on this builder
+    pub fn lance_read_params(mut self, params: ReadParams) -> Self {
+        self.lance_read_params = Some(params);
+        self
+    }
+
+    /// Open the table
+    pub async fn execute(self) -> Result<TableRef> {
+        self.parent.clone().do_open_table(self).await
+    }
+}
+
+#[async_trait::async_trait]
+pub(crate) trait ConnectionInternal: Send + Sync + std::fmt::Debug + 'static {
+    async fn table_names(&self) -> Result<Vec<String>>;
+    async fn do_create_table(&self, options: CreateTableBuilder<true>) -> Result<TableRef>;
+    async fn do_open_table(&self, options: OpenTableBuilder) -> Result<TableRef>;
+    async fn drop_table(&self, name: &str) -> Result<()>;
+    async fn drop_db(&self) -> Result<()>;
+
+    async fn do_create_empty_table(&self, options: CreateTableBuilder<false>) -> Result<TableRef> {
+        let batches = RecordBatchIterator::new(vec![], options.schema.unwrap());
+        let opts = CreateTableBuilder::<true>::new(options.parent, options.name, Box::new(batches))
+            .mode(options.mode)
+            .write_options(options.write_options);
+        self.do_create_table(opts).await
+    }
+}
+
+/// A connection to LanceDB
+#[derive(Clone)]
+pub struct Connection {
+    uri: String,
+    internal: Arc<dyn ConnectionInternal>,
+}
+
+impl Connection {
+    /// Get the URI of the connection
+    pub fn uri(&self) -> &str {
+        self.uri.as_str()
+    }
+
+    /// Get the names of all tables in the database.
+    pub async fn table_names(&self) -> Result<Vec<String>> {
+        self.internal.table_names().await
+    }
+
+    /// Create a new table from data
+    ///
+    /// # Parameters
+    ///
+    /// * `name` - The name of the table
+    /// * `initial_data` - The initial data to write to the table
+    pub fn create_table(
+        &self,
+        name: impl Into<String>,
+        initial_data: Box<dyn RecordBatchReader + Send>,
+    ) -> CreateTableBuilder<true> {
+        CreateTableBuilder::<true>::new(self.internal.clone(), name.into(), initial_data)
+    }
+
+    /// Create an empty table with a given schema
+    ///
+    /// # Parameters
+    ///
+    /// * `name` - The name of the table
+    /// * `schema` - The schema of the table
+    pub fn create_empty_table(
+        &self,
+        name: impl Into<String>,
+        schema: SchemaRef,
+    ) -> CreateTableBuilder<false> {
+        CreateTableBuilder::<false>::new(self.internal.clone(), name.into(), schema)
+    }
+
+    /// Open an existing table in the database
+    ///
+    /// # Arguments
+    /// * `name` - The name of the table
+    ///
+    /// # Returns
+    /// A builder which, once executed, yields the opened [`TableRef`], or
+    /// [`Error::TableNotFound`] if the table does not exist.
+    pub fn open_table(&self, name: impl Into<String>) -> OpenTableBuilder {
+        OpenTableBuilder::new(self.internal.clone(), name.into())
+    }
+
+    /// Drop a table in the database.
+    ///
+    /// # Arguments
+    /// * `name` - The name of the table to drop
+    pub async fn drop_table(&self, name: impl AsRef<str>) -> Result<()> {
+        self.internal.drop_table(name.as_ref()).await
+    }
+
+    /// Drop the database
+    ///
+    /// This is the same as dropping all of the tables
+    pub async fn drop_db(&self) -> Result<()> {
+        self.internal.drop_db().await
+    }
+}
+
+#[derive(Debug)]
+pub struct ConnectBuilder {
+    /// Database URI
+    ///
+    /// ### Accepted URI formats
+    ///
+    /// - `/path/to/database` - local database on file system.
+    /// - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud object store
+    /// - `db://dbname` - LanceDB Cloud
+    uri: String,
+
+    /// LanceDB Cloud API key, required if using LanceDB Cloud
+    api_key: Option<String>,
+    /// LanceDB Cloud region, required if using LanceDB Cloud
+    region: Option<String>,
+    /// LanceDB Cloud host override, only required if using an on-premises LanceDB Cloud instance
+    host_override: Option<String>,
+
+    /// User provided AWS credentials
+    aws_creds: Option<AwsCredential>,
+
+    /// The interval at which to check for updates from other processes.
+    read_consistency_interval: Option<std::time::Duration>,
+}
+
+impl ConnectBuilder {
+    /// Create a new [`ConnectBuilder`] with the given database URI.
+    pub fn new(uri: &str) -> Self {
+        Self {
+            uri: uri.to_string(),
+            api_key: None,
+            region: None,
+            host_override: None,
+            aws_creds: None,
+            read_consistency_interval: None,
+        }
+    }
+
+    pub fn api_key(mut self, api_key: &str) -> Self {
+        self.api_key = Some(api_key.to_string());
+        self
+    }
+
+    pub fn region(mut self, region: &str) -> Self {
+        self.region = Some(region.to_string());
+        self
+    }
+
+    pub fn host_override(mut self, host_override: &str) -> Self {
+        self.host_override = Some(host_override.to_string());
+        self
+    }
+
+    /// [`AwsCredential`] to use when connecting to S3.
+    pub fn aws_creds(mut self, aws_creds: AwsCredential) -> Self {
+        self.aws_creds = Some(aws_creds);
+        self
+    }
+
+    /// The interval at which to check for updates from other processes. This
+    /// only affects LanceDB OSS.
+    ///
+    /// If left unset, consistency is not checked. For maximum read
+    /// performance, this is the default. For strong consistency, set this to
+    /// zero seconds. Then every read will check for updates from other processes.
+    /// As a compromise, set this to a non-zero duration for eventual consistency.
+    /// If more than that duration has passed since the last read, the read will
+    /// check for updates from other processes.
+    ///
+    /// This only affects read operations. Write operations are always
+    /// consistent.
+    ///
+    /// LanceDB Cloud uses eventual consistency under the hood, and is not
+    /// currently configurable.
+    pub fn read_consistency_interval(
+        mut self,
+        read_consistency_interval: std::time::Duration,
+    ) -> Self {
+        self.read_consistency_interval = Some(read_consistency_interval);
+        self
+    }
+
+    #[cfg(feature = "remote")]
+    fn execute_remote(self) -> Result<Connection> {
+        let region = self.region.ok_or_else(|| Error::InvalidInput {
+            message: "A region is required when connecting to LanceDB Cloud".to_string(),
+        })?;
+        let api_key = self.api_key.ok_or_else(|| Error::InvalidInput {
+            message: "An api_key is required when connecting to LanceDB Cloud".to_string(),
+        })?;
+        let internal = Arc::new(crate::remote::db::RemoteDatabase::try_new(
+            &self.uri,
+            &api_key,
+            &region,
+            self.host_override,
+        )?);
+        Ok(Connection {
+            internal,
+            uri: self.uri,
+        })
+    }
+
+    #[cfg(not(feature = "remote"))]
+    fn execute_remote(self) -> Result<Connection> {
+        Err(Error::Runtime {
+            message: "cannot connect to LanceDB Cloud unless the 'remote' feature is enabled"
+                .to_string(),
+        })
+    }
+
+    /// Establishes a connection to the database
+    pub async fn execute(self) -> Result<Connection> {
+        if self.uri.starts_with("db") {
+            self.execute_remote()
+        } else {
+            let internal = Arc::new(Database::connect_with_options(&self).await?);
+            Ok(Connection {
+                internal,
+                uri: self.uri,
+            })
+        }
+    }
+}
+
+/// Connect to a LanceDB database.
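+///
+/// A minimal usage sketch (the path and interval here are illustrative):
+///
+/// ```ignore
+/// let db = lancedb::connect("data/sample-lancedb")
+///     .read_consistency_interval(std::time::Duration::from_secs(5))
+///     .execute()
+///     .await?;
+/// ```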
+///
+/// # Arguments
+///
+/// * `uri` - URI where the database is located, can be a local directory, supported remote cloud storage,
+/// or a LanceDB Cloud database. See [`ConnectBuilder`] for a list of accepted formats
+pub fn connect(uri: &str) -> ConnectBuilder {
+    ConnectBuilder::new(uri)
+}
+
+#[derive(Debug)]
+struct Database {
+    object_store: ObjectStore,
+    query_string: Option<String>,
+
+    pub(crate) uri: String,
+    pub(crate) base_path: object_store::path::Path,
+
+    // the object store wrapper to use on write path
+    pub(crate) store_wrapper: Option<Arc<dyn WrappingObjectStore>>,
+
+    read_consistency_interval: Option<std::time::Duration>,
+}
+
+const LANCE_EXTENSION: &str = "lance";
+const ENGINE: &str = "engine";
+const MIRRORED_STORE: &str = "mirroredStore";
+
+/// A connection to LanceDB
+impl Database {
+    async fn connect_with_options(options: &ConnectBuilder) -> Result<Self> {
+        let uri = &options.uri;
+        let parse_res = url::Url::parse(uri);
+
+        // TODO: pass params regardless of OS
+        match parse_res {
+            Ok(url) if url.scheme().len() == 1 && cfg!(windows) => {
+                Self::open_path(uri, options.read_consistency_interval).await
+            }
+            Ok(mut url) => {
+                // iterate through the query params and extract the commit store param
+                let mut engine = None;
+                let mut mirrored_store = None;
+                let mut filtered_queries = vec![];
+
+                // WARNING: specifying engine is NOT a publicly supported feature in lancedb yet
+                // THE API WILL CHANGE
+                for (key, value) in url.query_pairs() {
+                    if key == ENGINE {
+                        engine = Some(value.to_string());
+                    } else if key == MIRRORED_STORE {
+                        if cfg!(windows) {
+                            return Err(Error::Lance {
+                                message: "mirrored store is not supported on windows".into(),
+                            });
+                        }
+                        mirrored_store = Some(value.to_string());
+                    } else {
+                        // to owned so we can modify the url
+                        filtered_queries.push((key.to_string(), value.to_string()));
+                    }
+                }
+
+                // Filter out the commit store query param -- it's a lancedb param
+                url.query_pairs_mut().clear();
+                url.query_pairs_mut().extend_pairs(filtered_queries);
+                // Take a copy of the query string so we can propagate it to lance
+                let query_string = url.query().map(|s| s.to_string());
+                // clear the query string so we can use the url as the base uri
+                // use .set_query(None) instead of .set_query("") because the latter
+                // will add a trailing '?' to the url
+                url.set_query(None);
+
+                let table_base_uri = if let Some(store) = engine {
+                    static WARN_ONCE: std::sync::Once = std::sync::Once::new();
+                    WARN_ONCE.call_once(|| {
+                        log::warn!("Specifying engine is not a publicly supported feature in lancedb yet. 
THE API WILL CHANGE"); + }); + let old_scheme = url.scheme().to_string(); + let new_scheme = format!("{}+{}", old_scheme, store); + url.to_string().replacen(&old_scheme, &new_scheme, 1) + } else { + url.to_string() + }; + + let plain_uri = url.to_string(); + let os_params: ObjectStoreParams = if let Some(aws_creds) = &options.aws_creds { + let credential_provider: Arc< + dyn CredentialProvider, + > = Arc::new(StaticCredentialProvider::new(AwsCredential { + key_id: aws_creds.key_id.clone(), + secret_key: aws_creds.secret_key.clone(), + token: aws_creds.token.clone(), + })); + ObjectStoreParams::with_aws_credentials( + Some(credential_provider), + options.region.clone(), + ) + } else { + ObjectStoreParams::default() + }; + let (object_store, base_path) = + ObjectStore::from_uri_and_params(&plain_uri, &os_params).await?; + if object_store.is_local() { + Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?; + } + + let write_store_wrapper = match mirrored_store { + Some(path) => { + let mirrored_store = Arc::new(LocalFileSystem::new_with_prefix(path)?); + let wrapper = MirroringObjectStoreWrapper::new(mirrored_store); + Some(Arc::new(wrapper) as Arc) + } + None => None, + }; + + Ok(Self { + uri: table_base_uri, + query_string, + base_path, + object_store, + store_wrapper: write_store_wrapper, + read_consistency_interval: options.read_consistency_interval, + }) + } + Err(_) => Self::open_path(uri, options.read_consistency_interval).await, + } + } + + async fn open_path( + path: &str, + read_consistency_interval: Option, + ) -> Result { + let (object_store, base_path) = ObjectStore::from_uri(path).await?; + if object_store.is_local() { + Self::try_create_dir(path).context(CreateDirSnafu { path })?; + } + Ok(Self { + uri: path.to_string(), + query_string: None, + base_path, + object_store, + store_wrapper: None, + read_consistency_interval, + }) + } + + /// Try to create a local directory to store the lancedb dataset + fn try_create_dir(path: &str) -> core::result::Result<(), std::io::Error> { + let path = Path::new(path); + if !path.try_exists()? { + create_dir_all(path)?; + } + Ok(()) + } + + /// Get the URI of a table in the database. + fn table_uri(&self, name: &str) -> Result { + let path = Path::new(&self.uri); + let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION)); + + let mut uri = table_uri + .as_path() + .to_str() + .context(InvalidTableNameSnafu { name })? + .to_string(); + + // If there are query string set on the connection, propagate to lance + if let Some(query) = self.query_string.as_ref() { + uri.push('?'); + uri.push_str(query.as_str()); + } + + Ok(uri) + } +} + +#[async_trait::async_trait] +impl ConnectionInternal for Database { + async fn table_names(&self) -> Result> { + let mut f = self + .object_store + .read_dir(self.base_path.clone()) + .await? 
+            .iter()
+            .map(Path::new)
+            .filter(|path| {
+                let is_lance = path
+                    .extension()
+                    .and_then(|e| e.to_str())
+                    .map(|e| e == LANCE_EXTENSION);
+                is_lance.unwrap_or(false)
+            })
+            .filter_map(|p| p.file_stem().and_then(|s| s.to_str().map(String::from)))
+            .collect::<Vec<String>>();
+        f.sort();
+        Ok(f)
+    }
+
+    async fn do_create_table(&self, options: CreateTableBuilder<true>) -> Result<TableRef> {
+        let table_uri = self.table_uri(&options.name)?;
+
+        let mut write_params = options.write_options.lance_write_params.unwrap_or_default();
+        if matches!(&options.mode, CreateTableMode::Overwrite) {
+            write_params.mode = WriteMode::Overwrite;
+        }
+
+        match NativeTable::create(
+            &table_uri,
+            &options.name,
+            options.data.unwrap(),
+            self.store_wrapper.clone(),
+            Some(write_params),
+            self.read_consistency_interval,
+        )
+        .await
+        {
+            Ok(table) => Ok(Arc::new(table)),
+            Err(Error::TableAlreadyExists { name }) => match options.mode {
+                CreateTableMode::Create => Err(Error::TableAlreadyExists { name }),
+                CreateTableMode::ExistOk(callback) => {
+                    let builder = OpenTableBuilder::new(options.parent, options.name);
+                    let builder = (callback)(builder);
+                    builder.execute().await
+                }
+                CreateTableMode::Overwrite => unreachable!(),
+            },
+            Err(err) => Err(err),
+        }
+    }
+
+    async fn do_open_table(&self, options: OpenTableBuilder) -> Result<TableRef> {
+        let table_uri = self.table_uri(&options.name)?;
+        Ok(Arc::new(
+            NativeTable::open_with_params(
+                &table_uri,
+                &options.name,
+                self.store_wrapper.clone(),
+                options.lance_read_params,
+                self.read_consistency_interval,
+            )
+            .await?,
+        ))
+    }
+
+    async fn drop_table(&self, name: &str) -> Result<()> {
+        let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
+        let full_path = self.base_path.child(dir_name.clone());
+        self.object_store
+            .remove_dir_all(full_path)
+            .await
+            .map_err(|err| match err {
+                // this error is not lance::Error::DatasetNotFound,
+                // as the method `remove_dir_all` may be used to remove something that is not a dataset
+                lance::Error::NotFound { ..
} => Error::TableNotFound { + name: name.to_owned(), + }, + _ => Error::from(err), + })?; + Ok(()) + } + + async fn drop_db(&self) -> Result<()> { + todo!() + } +} + +#[cfg(test)] +mod tests { + use arrow_schema::{DataType, Field, Schema}; + use tempfile::tempdir; + + use super::*; + + #[tokio::test] + async fn test_connect() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + let db = connect(uri).execute().await.unwrap(); + + assert_eq!(db.uri, uri); + } + + #[cfg(not(windows))] + #[tokio::test] + async fn test_connect_relative() { + let tmp_dir = tempdir().unwrap(); + let uri = std::fs::canonicalize(tmp_dir.path().to_str().unwrap()).unwrap(); + + let current_dir = std::env::current_dir().unwrap(); + let ancestors = current_dir.ancestors(); + let relative_ancestors = vec![".."; ancestors.count()]; + + let relative_root = std::path::PathBuf::from(relative_ancestors.join("/")); + let relative_uri = relative_root.join(&uri); + + let db = connect(relative_uri.to_str().unwrap()) + .execute() + .await + .unwrap(); + + assert_eq!(db.uri, relative_uri.to_str().unwrap().to_string()); + } + + #[tokio::test] + async fn test_table_names() { + let tmp_dir = tempdir().unwrap(); + create_dir_all(tmp_dir.path().join("table1.lance")).unwrap(); + create_dir_all(tmp_dir.path().join("table2.lance")).unwrap(); + create_dir_all(tmp_dir.path().join("invalidlance")).unwrap(); + + let uri = tmp_dir.path().to_str().unwrap(); + let db = connect(uri).execute().await.unwrap(); + let tables = db.table_names().await.unwrap(); + assert_eq!(tables.len(), 2); + assert!(tables[0].eq(&String::from("table1"))); + assert!(tables[1].eq(&String::from("table2"))); + } + + #[tokio::test] + async fn test_connect_s3() { + // let db = Database::connect("s3://bucket/path/to/database").await.unwrap(); + } + + #[tokio::test] + #[ignore = "this can't pass due to https://github.com/lancedb/lancedb/issues/1019, enable it after the bug fixed"] + async fn test_open_table() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + let db = connect(uri).execute().await.unwrap(); + + assert_eq!(db.table_names().await.unwrap().len(), 0); + // open non-exist table + assert!(matches!( + db.open_table("invalid_table").execute().await, + Err(crate::Error::TableNotFound { .. }) + )); + + assert_eq!(db.table_names().await.unwrap().len(), 0); + + let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + db.create_empty_table("table1", schema) + .execute() + .await + .unwrap(); + db.open_table("table1").execute().await.unwrap(); + let tables = db.table_names().await.unwrap(); + assert_eq!(tables, vec!["table1".to_owned()]); + } + + #[tokio::test] + async fn drop_table() { + let tmp_dir = tempdir().unwrap(); + + let uri = tmp_dir.path().to_str().unwrap(); + let db = connect(uri).execute().await.unwrap(); + + // drop non-exist table + assert!(matches!( + db.drop_table("invalid_table").await, + Err(crate::Error::TableNotFound { .. 
}), + )); + + create_dir_all(tmp_dir.path().join("table1.lance")).unwrap(); + db.drop_table("table1").await.unwrap(); + + let tables = db.table_names().await.unwrap(); + assert_eq!(tables.len(), 0); + } + + #[tokio::test] + async fn test_create_table_already_exists() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + let db = connect(uri).execute().await.unwrap(); + let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)])); + db.create_empty_table("test", schema.clone()) + .execute() + .await + .unwrap(); + // TODO: None of the open table options are "inspectable" right now but once one is we + // should assert we are passing these options in correctly + db.create_empty_table("test", schema) + .mode(CreateTableMode::exist_ok(|builder| { + builder.index_cache_size(16) + })) + .execute() + .await + .unwrap(); + let other_schema = Arc::new(Schema::new(vec![Field::new("y", DataType::Int32, false)])); + assert!(db + .create_empty_table("test", other_schema.clone()) + .execute() + .await + .is_err()); + let overwritten = db + .create_empty_table("test", other_schema.clone()) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + assert_eq!(other_schema, overwritten.schema().await.unwrap()); + } +} diff --git a/rust/vectordb/src/data.rs b/rust/lancedb/src/data.rs similarity index 100% rename from rust/vectordb/src/data.rs rename to rust/lancedb/src/data.rs diff --git a/rust/vectordb/src/data/inspect.rs b/rust/lancedb/src/data/inspect.rs similarity index 100% rename from rust/vectordb/src/data/inspect.rs rename to rust/lancedb/src/data/inspect.rs diff --git a/rust/vectordb/src/data/sanitize.rs b/rust/lancedb/src/data/sanitize.rs similarity index 99% rename from rust/vectordb/src/data/sanitize.rs rename to rust/lancedb/src/data/sanitize.rs index c5efd2bc..fe139b99 100644 --- a/rust/vectordb/src/data/sanitize.rs +++ b/rust/lancedb/src/data/sanitize.rs @@ -174,7 +174,6 @@ fn coerce_schema_batch( } /// Coerce the reader (input data) to match the given [Schema]. 
-///
 pub fn coerce_schema(
     reader: impl RecordBatchReader + Send + 'static,
     schema: Arc<Schema>,
diff --git a/rust/vectordb/src/error.rs b/rust/lancedb/src/error.rs
similarity index 64%
rename from rust/vectordb/src/error.rs
rename to rust/lancedb/src/error.rs
index 2bdf97d6..611f171b 100644
--- a/rust/vectordb/src/error.rs
+++ b/rust/lancedb/src/error.rs
@@ -20,32 +20,40 @@ use snafu::Snafu;
 #[derive(Debug, Snafu)]
 #[snafu(visibility(pub(crate)))]
 pub enum Error {
-    #[snafu(display("LanceDBError: Invalid table name: {name}"))]
+    #[snafu(display("Invalid table name: {name}"))]
     InvalidTableName { name: String },
-    #[snafu(display("LanceDBError: Table '{name}' was not found"))]
+    #[snafu(display("Invalid input, {message}"))]
+    InvalidInput { message: String },
+    #[snafu(display("Table '{name}' was not found"))]
     TableNotFound { name: String },
-    #[snafu(display("LanceDBError: Table '{name}' already exists"))]
+    #[snafu(display("Table '{name}' already exists"))]
     TableAlreadyExists { name: String },
-    #[snafu(display("LanceDBError: Unable to created lance dataset at {path}: {source}"))]
+    #[snafu(display("Unable to create lance dataset at {path}: {source}"))]
     CreateDir {
         path: String,
         source: std::io::Error,
     },
-    #[snafu(display("LanceDBError: {message}"))]
-    Store { message: String },
-    #[snafu(display("LanceDBError: {message}"))]
-    Lance { message: String },
-    #[snafu(display("LanceDB Schema Error: {message}"))]
+    #[snafu(display("Schema Error: {message}"))]
     Schema { message: String },
     #[snafu(display("Runtime error: {message}"))]
     Runtime { message: String },
+
+    // 3rd party / external errors
+    #[snafu(display("object_store error: {message}"))]
+    Store { message: String },
+    #[snafu(display("lance error: {message}"))]
+    Lance { message: String },
+    #[snafu(display("Http error: {message}"))]
+    Http { message: String },
+    #[snafu(display("Arrow error: {message}"))]
+    Arrow { message: String },
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
 
 impl From<ArrowError> for Error {
     fn from(e: ArrowError) -> Self {
-        Self::Lance {
+        Self::Arrow {
             message: e.to_string(),
         }
     }
@@ -82,3 +90,21 @@ impl<T> From<std::sync::PoisonError<T>> for Error {
         }
     }
 }
+
+#[cfg(feature = "remote")]
+impl From<reqwest::Error> for Error {
+    fn from(e: reqwest::Error) -> Self {
+        Self::Http {
+            message: e.to_string(),
+        }
+    }
+}
+
+#[cfg(feature = "remote")]
+impl From<url::ParseError> for Error {
+    fn from(e: url::ParseError) -> Self {
+        Self::Http {
+            message: e.to_string(),
+        }
+    }
+}
diff --git a/rust/vectordb/src/index.rs b/rust/lancedb/src/index.rs
similarity index 98%
rename from rust/vectordb/src/index.rs
rename to rust/lancedb/src/index.rs
index 9131f2da..6d2cbbb2 100644
--- a/rust/vectordb/src/index.rs
+++ b/rust/lancedb/src/index.rs
@@ -168,7 +168,7 @@ impl IndexBuilder {
     /// Build the parameters.
     pub async fn build(&self) -> Result<()> {
-        let schema = self.table.schema();
+        let schema = self.table.schema().await?;
         // TODO: simplify this after GH lance#1864.
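+        // NOTE: schema() is now async and fallible: the table may first
+        // refresh its view of the dataset (e.g. when a read consistency
+        // interval is configured on the connection) before returning it.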
         let mut index_type = &self.index_type;
@@ -230,7 +230,7 @@
             .table
             .as_native()
             .expect("Only native table is supported here");
-        let mut dataset = tbl.clone_inner_dataset();
+        let mut dataset = tbl.dataset.get_mut().await?;
         match params {
             IndexParams::Scalar { replace } => {
                 dataset
@@ -271,7 +271,6 @@
                 .await?;
             }
         }
-        tbl.reset_dataset(dataset);
         Ok(())
     }
 }
diff --git a/rust/vectordb/src/index/vector.rs b/rust/lancedb/src/index/vector.rs
similarity index 100%
rename from rust/vectordb/src/index/vector.rs
rename to rust/lancedb/src/index/vector.rs
diff --git a/rust/vectordb/src/io.rs b/rust/lancedb/src/io.rs
similarity index 100%
rename from rust/vectordb/src/io.rs
rename to rust/lancedb/src/io.rs
diff --git a/rust/vectordb/src/io/object_store.rs b/rust/lancedb/src/io/object_store.rs
similarity index 97%
rename from rust/vectordb/src/io/object_store.rs
rename to rust/lancedb/src/io/object_store.rs
index 22b7d518..e7dc3d78 100644
--- a/rust/vectordb/src/io/object_store.rs
+++ b/rust/lancedb/src/io/object_store.rs
@@ -342,7 +342,7 @@ mod test {
     use object_store::local::LocalFileSystem;
     use tempfile;
 
-    use crate::connection::{Connection, Database};
+    use crate::{connect, table::WriteOptions};
 
     #[tokio::test]
     async fn test_e2e() {
@@ -354,7 +354,7 @@ mod test {
             secondary: Arc::new(secondary_store),
         });
 
-        let db = Database::connect(dir1.to_str().unwrap()).await.unwrap();
+        let db = connect(dir1.to_str().unwrap()).execute().await.unwrap();
 
         let mut param = WriteParams::default();
         let store_params = ObjectStoreParams {
@@ -368,7 +368,11 @@
         datagen = datagen.col(Box::new(RandomVector::default().named("vector".into())));
         let res = db
-            .create_table("test", Box::new(datagen.batch(100)), Some(param.clone()))
+            .create_table("test", Box::new(datagen.batch(100)))
+            .write_options(WriteOptions {
+                lance_write_params: Some(param),
+            })
+            .execute()
             .await;
 
         // leave this here for easy debugging
diff --git a/rust/vectordb/src/ipc.rs b/rust/lancedb/src/ipc.rs
similarity index 100%
rename from rust/vectordb/src/ipc.rs
rename to rust/lancedb/src/ipc.rs
diff --git a/rust/vectordb/src/lib.rs b/rust/lancedb/src/lib.rs
similarity index 73%
rename from rust/vectordb/src/lib.rs
rename to rust/lancedb/src/lib.rs
index fc4ac149..a04826aa 100644
--- a/rust/vectordb/src/lib.rs
+++ b/rust/lancedb/src/lib.rs
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! # VectorDB ([LanceDB](https://github.com/lancedb/lancedb)) -- Developer-friendly, serverless vector database for AI applications
-//!
 //! [LanceDB](https://github.com/lancedb/lancedb) is an open-source database for vector-search built with persistent storage,
 //! which greatly simplifies retrieval, filtering and management of embeddings.
 //!
@@ -33,7 +31,7 @@
 //! LanceDB runs in process, to use it in your Rust project, put the following in your `Cargo.toml`:
 //!
 //! ```ignore
-//! cargo install vectordb
+//! lancedb = "0.4.11"
 //! ```
 //!
 //! ### Quick Start
 //!
@@ -43,10 +41,9 @@
 //! #### Connect to a database.
 //!
 //! ```rust
-//! use vectordb::connect;
 //! # use arrow_schema::{Field, Schema};
 //! # tokio::runtime::Runtime::new().unwrap().block_on(async {
-//! let db = connect("data/sample-lancedb").await.unwrap();
+//! let db = lancedb::connect("data/sample-lancedb").execute().await.unwrap();
 //! # });
 //! ```
 //!
@@ -56,14 +53,20 @@
 //! - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud object store
 //!
- `db://dbname` - Lance Cloud //! -//! You can also use [`ConnectOptions`] to configure the connectoin to the database. +//! You can also use [`ConnectOptions`] to configure the connection to the database. //! //! ```rust -//! use vectordb::{connect_with_options, ConnectOptions}; +//! use object_store::aws::AwsCredential; //! # tokio::runtime::Runtime::new().unwrap().block_on(async { -//! let options = ConnectOptions::new("data/sample-lancedb") -//! .index_cache_size(1024); -//! let db = connect_with_options(&options).await.unwrap(); +//! let db = lancedb::connect("data/sample-lancedb") +//! .aws_creds(AwsCredential { +//! key_id: "some_key".to_string(), +//! secret_key: "some_secret".to_string(), +//! token: None, +//! }) +//! .execute() +//! .await +//! .unwrap(); //! # }); //! ``` //! @@ -79,31 +82,44 @@ //! //! ```rust //! # use std::sync::Arc; -//! use arrow_schema::{DataType, Schema, Field}; //! use arrow_array::{RecordBatch, RecordBatchIterator}; +//! use arrow_schema::{DataType, Field, Schema}; //! # use arrow_array::{FixedSizeListArray, Float32Array, Int32Array, types::Float32Type}; -//! # use vectordb::connection::{Database, Connection}; -//! # use vectordb::connect; //! //! # tokio::runtime::Runtime::new().unwrap().block_on(async { //! # let tmpdir = tempfile::tempdir().unwrap(); -//! # let db = connect(tmpdir.path().to_str().unwrap()).await.unwrap(); +//! # let db = lancedb::connect(tmpdir.path().to_str().unwrap()).execute().await.unwrap(); //! let schema = Arc::new(Schema::new(vec![ -//! Field::new("id", DataType::Int32, false), -//! Field::new("vector", DataType::FixedSizeList( -//! Arc::new(Field::new("item", DataType::Float32, true)), 128), true), +//! Field::new("id", DataType::Int32, false), +//! Field::new( +//! "vector", +//! DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128), +//! true, +//! ), //! ])); //! // Create a RecordBatch stream. -//! let batches = RecordBatchIterator::new(vec![ -//! RecordBatch::try_new(schema.clone(), +//! let batches = RecordBatchIterator::new( +//! vec![RecordBatch::try_new( +//! schema.clone(), //! vec![ -//! Arc::new(Int32Array::from_iter_values(0..1000)), -//! Arc::new(FixedSizeListArray::from_iter_primitive::( -//! (0..1000).map(|_| Some(vec![Some(1.0); 128])), 128)), -//! ]).unwrap() -//! ].into_iter().map(Ok), -//! schema.clone()); -//! db.create_table("my_table", Box::new(batches), None).await.unwrap(); +//! Arc::new(Int32Array::from_iter_values(0..256)), +//! Arc::new( +//! FixedSizeListArray::from_iter_primitive::( +//! (0..256).map(|_| Some(vec![Some(1.0); 128])), +//! 128, +//! ), +//! ), +//! ], +//! ) +//! .unwrap()] +//! .into_iter() +//! .map(Ok), +//! schema.clone(), +//! ); +//! db.create_table("my_table", Box::new(batches)) +//! .execute() +//! .await +//! .unwrap(); //! # }); //! ``` //! @@ -111,14 +127,13 @@ //! //! ```no_run //! # use std::sync::Arc; -//! # use vectordb::connect; //! # use arrow_array::{FixedSizeListArray, types::Float32Type, RecordBatch, //! # RecordBatchIterator, Int32Array}; //! # use arrow_schema::{Schema, Field, DataType}; //! # tokio::runtime::Runtime::new().unwrap().block_on(async { //! # let tmpdir = tempfile::tempdir().unwrap(); -//! # let db = connect(tmpdir.path().to_str().unwrap()).await.unwrap(); -//! # let tbl = db.open_table("idx_test").await.unwrap(); +//! # let db = lancedb::connect(tmpdir.path().to_str().unwrap()).execute().await.unwrap(); +//! # let tbl = db.open_table("idx_test").execute().await.unwrap(); //! tbl.create_index(&["vector"]) //! 
.ivf_pq() //! .num_partitions(256) @@ -136,10 +151,9 @@ //! # use arrow_schema::{DataType, Schema, Field}; //! # use arrow_array::{RecordBatch, RecordBatchIterator}; //! # use arrow_array::{FixedSizeListArray, Float32Array, Int32Array, types::Float32Type}; -//! # use vectordb::connection::{Database, Connection}; //! # tokio::runtime::Runtime::new().unwrap().block_on(async { //! # let tmpdir = tempfile::tempdir().unwrap(); -//! # let db = Database::connect(tmpdir.path().to_str().unwrap()).await.unwrap(); +//! # let db = lancedb::connect(tmpdir.path().to_str().unwrap()).execute().await.unwrap(); //! # let schema = Arc::new(Schema::new(vec![ //! # Field::new("id", DataType::Int32, false), //! # Field::new("vector", DataType::FixedSizeList( @@ -154,8 +168,8 @@ //! # ]).unwrap() //! # ].into_iter().map(Ok), //! # schema.clone()); -//! # db.create_table("my_table", Box::new(batches), None).await.unwrap(); -//! # let table = db.open_table("my_table").await.unwrap(); +//! # db.create_table("my_table", Box::new(batches)).execute().await.unwrap(); +//! # let table = db.open_table("my_table").execute().await.unwrap(); //! let results = table //! .search(&[1.0; 128]) //! .execute_stream() @@ -165,8 +179,6 @@ //! .await //! .unwrap(); //! # }); -//! -//! //! ``` pub mod connection; @@ -176,13 +188,13 @@ pub mod index; pub mod io; pub mod ipc; pub mod query; +#[cfg(feature = "remote")] +pub(crate) mod remote; pub mod table; pub mod utils; -pub use connection::{Connection, Database}; pub use error::{Error, Result}; pub use table::{Table, TableRef}; /// Connect to a database -pub use connection::{connect, connect_with_options, ConnectOptions}; -pub use lance::dataset::WriteMode; +pub use connection::connect; diff --git a/rust/vectordb/src/query.rs b/rust/lancedb/src/query.rs similarity index 78% rename from rust/vectordb/src/query.rs rename to rust/lancedb/src/query.rs index 53765ab5..75e5499b 100644 --- a/rust/vectordb/src/query.rs +++ b/rust/lancedb/src/query.rs @@ -12,24 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use arrow_array::Float32Array; use arrow_schema::Schema; use lance::dataset::scanner::{DatasetRecordBatchStream, Scanner}; -use lance::dataset::Dataset; use lance_linalg::distance::MetricType; use crate::error::Result; +use crate::table::dataset::DatasetConsistencyWrapper; use crate::utils::default_vector_column; use crate::Error; const DEFAULT_TOP_K: usize = 10; +#[derive(Debug, Clone)] +pub enum Select { + All, + Simple(Vec), + Projection(Vec<(String, String)>), +} + /// A builder for nearest neighbor queries for LanceDB. #[derive(Clone)] pub struct Query { - dataset: Arc, + dataset: DatasetConsistencyWrapper, // The column to run the query on. If not specified, we will attempt to guess // the column based on the dataset's schema. @@ -46,7 +51,7 @@ pub struct Query { /// Apply filter to the returned rows. filter: Option, /// Select column projection. - select: Option>, + select: Select, /// Default is true. Set to false to enforce a brute force search. use_index: bool, @@ -61,7 +66,7 @@ impl Query { /// /// * `dataset` - Lance dataset. 
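+    /// The dataset handle is a [`DatasetConsistencyWrapper`], so a query can
+    /// observe updates from other writers, subject to the connection's read
+    /// consistency interval.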
/// - pub(crate) fn new(dataset: Arc) -> Self { + pub(crate) fn new(dataset: DatasetConsistencyWrapper) -> Self { Self { dataset, query_vector: None, @@ -72,7 +77,7 @@ impl Query { metric_type: None, use_index: true, filter: None, - select: None, + select: Select::All, prefilter: false, } } @@ -83,7 +88,8 @@ impl Query { /// /// * A [DatasetRecordBatchStream] with the query's results. pub async fn execute_stream(&self) -> Result { - let mut scanner: Scanner = self.dataset.scan(); + let ds_ref = self.dataset.get().await?; + let mut scanner: Scanner = ds_ref.scan(); if let Some(query) = self.query_vector.as_ref() { // If there is a vector query, default to limit=10 if unspecified @@ -91,10 +97,10 @@ impl Query { col.clone() } else { // Infer a vector column with the same dimension of the query vector. - let arrow_schema = Schema::from(self.dataset.schema()); + let arrow_schema = Schema::from(ds_ref.schema()); default_vector_column(&arrow_schema, Some(query.len() as i32))? }; - let field = self.dataset.schema().field(&column).ok_or(Error::Store { + let field = ds_ref.schema().field(&column).ok_or(Error::Store { message: format!("Column {} not found in dataset schema", column), })?; if !matches!(field.data_type(), arrow_schema::DataType::FixedSizeList(f, dim) if f.data_type().is_floating() && dim == query.len() as i32) @@ -116,7 +122,16 @@ impl Query { scanner.use_index(self.use_index); scanner.prefilter(self.prefilter); - self.select.as_ref().map(|p| scanner.project(p.as_slice())); + match &self.select { + Select::Simple(select) => { + scanner.project(select.as_slice())?; + } + Select::Projection(select_with_transform) => { + scanner.project_with_transform(select_with_transform.as_slice())?; + } + Select::All => { /* Do nothing */ } + } + self.filter.as_ref().map(|f| scanner.filter(f)); self.refine_factor.map(|rf| scanner.refine(rf)); self.metric_type.map(|mt| scanner.distance_metric(mt)); @@ -207,7 +222,23 @@ impl Query { /// /// Only select the specified columns. If not specified, all columns will be returned. pub fn select(mut self, columns: &[impl AsRef]) -> Self { - self.select = Some(columns.iter().map(|c| c.as_ref().to_string()).collect()); + self.select = Select::Simple(columns.iter().map(|c| c.as_ref().to_string()).collect()); + self + } + + /// Return only the specified columns. + /// + /// Only select the specified columns. If not specified, all columns will be returned. 
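+    ///
+    /// Each entry is a `(name, SQL expression)` pair; the expression is
+    /// evaluated against the table to produce the output column. A sketch
+    /// (the names and expressions here are illustrative):
+    ///
+    /// ```ignore
+    /// let query = table.query().select_with_projection(&[("id2", "id * 2"), ("id", "id")]);
+    /// ```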
+ pub fn select_with_projection( + mut self, + columns: &[(impl AsRef, impl AsRef)], + ) -> Self { + self.select = Select::Projection( + columns + .iter() + .map(|(c, t)| (c.as_ref().to_string(), t.as_ref().to_string())) + .collect(), + ); self } @@ -227,7 +258,7 @@ mod tests { RecordBatchReader, }; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; - use futures::StreamExt; + use futures::{StreamExt, TryStreamExt}; use lance::dataset::Dataset; use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector}; use tempfile::tempdir; @@ -240,8 +271,10 @@ mod tests { let batches = make_test_batches(); let ds = Dataset::write(batches, "memory://foo", None).await.unwrap(); + let ds = DatasetConsistencyWrapper::new_latest(ds, None); + let vector = Some(Float32Array::from_iter_values([0.1, 0.2])); - let query = Query::new(Arc::new(ds)).nearest_to(&[0.1, 0.2]); + let query = Query::new(ds).nearest_to(&[0.1, 0.2]); assert_eq!(query.query_vector, vector); let new_vector = Float32Array::from_iter_values([9.8, 8.7]); @@ -265,7 +298,9 @@ mod tests { #[tokio::test] async fn test_execute() { let batches = make_non_empty_batches(); - let ds = Arc::new(Dataset::write(batches, "memory://foo", None).await.unwrap()); + let ds = Dataset::write(batches, "memory://foo", None).await.unwrap(); + + let ds = DatasetConsistencyWrapper::new_latest(ds, None); let query = Query::new(ds.clone()).nearest_to(&[0.1; 4]); let result = query.limit(10).filter("id % 2 == 0").execute_stream().await; @@ -291,13 +326,47 @@ mod tests { } } + #[tokio::test] + async fn test_select_with_transform() { + let batches = make_non_empty_batches(); + let ds = Dataset::write(batches, "memory://foo", None).await.unwrap(); + + let ds = DatasetConsistencyWrapper::new_latest(ds, None); + + let query = Query::new(ds) + .limit(10) + .select_with_projection(&[("id2", "id * 2"), ("id", "id")]); + let result = query.execute_stream().await; + let mut batches = result + .expect("should have result") + .try_collect::>() + .await + .unwrap(); + assert_eq!(batches.len(), 1); + let batch = batches.pop().unwrap(); + + // id, and id2 + assert_eq!(batch.num_columns(), 2); + + let id: &Int32Array = batch.column_by_name("id").unwrap().as_primitive(); + let id2: &Int32Array = batch.column_by_name("id2").unwrap().as_primitive(); + + id.iter().zip(id2.iter()).for_each(|(id, id2)| { + let id = id.unwrap(); + let id2 = id2.unwrap(); + assert_eq!(id * 2, id2); + }); + } + #[tokio::test] async fn test_execute_no_vector() { // test that it's ok to not specify a query vector (just filter / limit) let batches = make_non_empty_batches(); - let ds = Arc::new(Dataset::write(batches, "memory://foo", None).await.unwrap()); + let ds = Dataset::write(batches, "memory://foo", None).await.unwrap(); - let query = Query::new(ds.clone()); + let ds = DatasetConsistencyWrapper::new_latest(ds, None); + + let query = Query::new(ds); let result = query.filter("id % 2 == 0").execute_stream().await; let mut stream = result.expect("should have result"); // should only have one batch diff --git a/rust/lancedb/src/remote.rs b/rust/lancedb/src/remote.rs new file mode 100644 index 00000000..dfdf6224 --- /dev/null +++ b/rust/lancedb/src/remote.rs @@ -0,0 +1,23 @@ +// Copyright 2024 Lance Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! This module contains a remote client for a LanceDB server. This is used +//! to communicate with LanceDB cloud. It can also serve as an example for +//! building client/server applications with LanceDB or as a client for some +//! other custom LanceDB service. + +pub mod client; +pub mod db; +pub mod table; +pub mod util; diff --git a/rust/lancedb/src/remote/client.rs b/rust/lancedb/src/remote/client.rs new file mode 100644 index 00000000..6ff9811b --- /dev/null +++ b/rust/lancedb/src/remote/client.rs @@ -0,0 +1,124 @@ +// Copyright 2024 LanceDB Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use reqwest::{ + header::{HeaderMap, HeaderValue}, + RequestBuilder, Response, +}; + +use crate::error::{Error, Result}; + +#[derive(Clone, Debug)] +pub struct RestfulLanceDbClient { + client: reqwest::Client, + host: String, +} + +impl RestfulLanceDbClient { + fn default_headers( + api_key: &str, + region: &str, + db_name: &str, + has_host_override: bool, + ) -> Result { + let mut headers = HeaderMap::new(); + headers.insert( + "x-api-key", + HeaderValue::from_str(api_key).map_err(|_| Error::Http { + message: "non-ascii api key provided".to_string(), + })?, + ); + if region == "local" { + let host = format!("{}.local.api.lancedb.com", db_name); + headers.insert( + "Host", + HeaderValue::from_str(&host).map_err(|_| Error::Http { + message: format!("non-ascii database name '{}' provided", db_name), + })?, + ); + } + if has_host_override { + headers.insert( + "x-lancedb-database", + HeaderValue::from_str(db_name).map_err(|_| Error::Http { + message: format!("non-ascii database name '{}' provided", db_name), + })?, + ); + } + + Ok(headers) + } + + pub fn try_new( + db_url: &str, + api_key: &str, + region: &str, + host_override: Option, + ) -> Result { + let parsed_url = url::Url::parse(db_url)?; + debug_assert_eq!(parsed_url.scheme(), "db"); + if !parsed_url.has_host() { + return Err(Error::Http { + message: format!("Invalid database URL (missing host) '{}'", db_url), + }); + } + let db_name = parsed_url.host_str().unwrap(); + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .default_headers(Self::default_headers( + api_key, + region, + db_name, + host_override.is_some(), + )?) 
+ .build()?; + let host = match host_override { + Some(host_override) => host_override, + None => format!("https://{}.{}.api.lancedb.com", db_name, region), + }; + Ok(Self { client, host }) + } + + pub fn get(&self, uri: &str) -> RequestBuilder { + let full_uri = format!("{}{}", self.host, uri); + self.client.get(full_uri) + } + + pub fn post(&self, uri: &str) -> RequestBuilder { + let full_uri = format!("{}{}", self.host, uri); + self.client.post(full_uri) + } + + async fn rsp_to_str(response: Response) -> String { + let status = response.status(); + response.text().await.unwrap_or_else(|_| status.to_string()) + } + + pub async fn check_response(&self, response: Response) -> Result { + let status_int: u16 = u16::from(response.status()); + if (400..500).contains(&status_int) { + Err(Error::InvalidInput { + message: Self::rsp_to_str(response).await, + }) + } else if status_int != 200 { + Err(Error::Runtime { + message: Self::rsp_to_str(response).await, + }) + } else { + Ok(response) + } + } +} diff --git a/rust/lancedb/src/remote/db.rs b/rust/lancedb/src/remote/db.rs new file mode 100644 index 00000000..948db4fd --- /dev/null +++ b/rust/lancedb/src/remote/db.rs @@ -0,0 +1,102 @@ +// Copyright 2024 LanceDB Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use async_trait::async_trait; +use reqwest::header::CONTENT_TYPE; +use serde::Deserialize; +use tokio::task::spawn_blocking; + +use crate::connection::{ConnectionInternal, CreateTableBuilder, OpenTableBuilder}; +use crate::error::Result; +use crate::TableRef; + +use super::client::RestfulLanceDbClient; +use super::table::RemoteTable; +use super::util::batches_to_ipc_bytes; + +const ARROW_STREAM_CONTENT_TYPE: &str = "application/vnd.apache.arrow.stream"; + +#[derive(Deserialize)] +struct ListTablesResponse { + tables: Vec, +} + +#[derive(Debug)] +pub struct RemoteDatabase { + client: RestfulLanceDbClient, +} + +impl RemoteDatabase { + pub fn try_new( + uri: &str, + api_key: &str, + region: &str, + host_override: Option, + ) -> Result { + let client = RestfulLanceDbClient::try_new(uri, api_key, region, host_override)?; + Ok(Self { client }) + } +} + +#[async_trait] +impl ConnectionInternal for RemoteDatabase { + async fn table_names(&self) -> Result> { + let rsp = self + .client + .get("/v1/table/") + .query(&[("limit", 10)]) + .query(&[("page_token", "")]) + .send() + .await?; + let rsp = self.client.check_response(rsp).await?; + Ok(rsp.json::().await?.tables) + } + + async fn do_create_table(&self, options: CreateTableBuilder) -> Result { + let data = options.data.unwrap(); + // TODO: https://github.com/lancedb/lancedb/issues/1026 + // We should accept data from an async source. In the meantime, spawn this as blocking + // to make sure we don't block the tokio runtime if the source is slow. 
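+        // batches_to_ipc_bytes (see remote/util.rs below) drains the reader
+        // into a single Arrow IPC buffer, which becomes the request body.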
+        let data_buffer = spawn_blocking(move || batches_to_ipc_bytes(data))
+            .await
+            .unwrap()?;
+
+        self.client
+            .post(&format!("/v1/table/{}/create", options.name))
+            .body(data_buffer)
+            .header(CONTENT_TYPE, ARROW_STREAM_CONTENT_TYPE)
+            .header("x-request-id", "na")
+            .send()
+            .await?;
+
+        Ok(Arc::new(RemoteTable::new(
+            self.client.clone(),
+            options.name,
+        )))
+    }
+
+    async fn do_open_table(&self, _options: OpenTableBuilder) -> Result<TableRef> {
+        todo!()
+    }
+
+    async fn drop_table(&self, _name: &str) -> Result<()> {
+        todo!()
+    }
+
+    async fn drop_db(&self) -> Result<()> {
+        todo!()
+    }
+}
diff --git a/rust/lancedb/src/remote/table.rs b/rust/lancedb/src/remote/table.rs
new file mode 100644
index 00000000..dfbf337f
--- /dev/null
+++ b/rust/lancedb/src/remote/table.rs
@@ -0,0 +1,89 @@
+use arrow_array::RecordBatchReader;
+use arrow_schema::SchemaRef;
+use async_trait::async_trait;
+use lance::dataset::{ColumnAlteration, NewColumnTransform};
+
+use crate::{
+    error::Result,
+    index::IndexBuilder,
+    query::Query,
+    table::{
+        merge::MergeInsertBuilder, AddDataOptions, NativeTable, OptimizeAction, OptimizeStats,
+    },
+    Table,
+};
+
+use super::client::RestfulLanceDbClient;
+
+#[derive(Debug)]
+pub struct RemoteTable {
+    #[allow(dead_code)]
+    client: RestfulLanceDbClient,
+    name: String,
+}
+
+impl RemoteTable {
+    pub fn new(client: RestfulLanceDbClient, name: String) -> Self {
+        Self { client, name }
+    }
+}
+
+impl std::fmt::Display for RemoteTable {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "RemoteTable({})", self.name)
+    }
+}
+
+#[async_trait]
+impl Table for RemoteTable {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+    fn as_native(&self) -> Option<&NativeTable> {
+        None
+    }
+    fn name(&self) -> &str {
+        &self.name
+    }
+    async fn schema(&self) -> Result<SchemaRef> {
+        todo!()
+    }
+    async fn count_rows(&self, _filter: Option<String>) -> Result<usize> {
+        todo!()
+    }
+    async fn add(
+        &self,
+        _batches: Box<dyn RecordBatchReader + Send>,
+        _options: AddDataOptions,
+    ) -> Result<()> {
+        todo!()
+    }
+    async fn delete(&self, _predicate: &str) -> Result<()> {
+        todo!()
+    }
+    fn create_index(&self, _column: &[&str]) -> IndexBuilder {
+        todo!()
+    }
+    fn merge_insert(&self, _on: &[&str]) -> MergeInsertBuilder {
+        todo!()
+    }
+    fn query(&self) -> Query {
+        todo!()
+    }
+    async fn optimize(&self, _action: OptimizeAction) -> Result<OptimizeStats> {
+        todo!()
+    }
+    async fn add_columns(
+        &self,
+        _transforms: NewColumnTransform,
+        _read_columns: Option<Vec<String>>,
+    ) -> Result<()> {
+        todo!()
+    }
+    async fn alter_columns(&self, _alterations: &[ColumnAlteration]) -> Result<()> {
+        todo!()
+    }
+    async fn drop_columns(&self, _columns: &[&str]) -> Result<()> {
+        todo!()
+    }
+}
diff --git a/rust/lancedb/src/remote/util.rs b/rust/lancedb/src/remote/util.rs
new file mode 100644
index 00000000..b594ed6e
--- /dev/null
+++ b/rust/lancedb/src/remote/util.rs
@@ -0,0 +1,21 @@
+use std::io::Cursor;
+
+use arrow_array::RecordBatchReader;
+
+use crate::Result;
+
+pub fn batches_to_ipc_bytes(batches: impl RecordBatchReader) -> Result<Vec<u8>> {
+    const WRITE_BUF_SIZE: usize = 4096;
+    let buf = Vec::with_capacity(WRITE_BUF_SIZE);
+    let mut buf = Cursor::new(buf);
+    {
+        let mut writer = arrow_ipc::writer::FileWriter::try_new(&mut buf, &batches.schema())?;
+
+        for batch in batches {
+            let batch = batch?;
+            writer.write(&batch)?;
+        }
+        writer.finish()?;
+    }
+    Ok(buf.into_inner())
+}
diff --git a/rust/vectordb/src/table.rs b/rust/lancedb/src/table.rs
similarity index 76%
rename from rust/vectordb/src/table.rs
rename to rust/lancedb/src/table.rs
index 
7c5ac300..c9638f08 100644
--- a/rust/vectordb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -15,9 +15,9 @@
 //! LanceDB Table APIs
 
 use std::path::Path;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
 
-use arrow_array::RecordBatchReader;
+use arrow_array::{RecordBatchIterator, RecordBatchReader};
 use arrow_schema::{Schema, SchemaRef};
 use async_trait::async_trait;
 use chrono::Duration;
@@ -27,7 +27,10 @@ use lance::dataset::optimize::{
     compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
 };
 pub use lance::dataset::ReadParams;
-use lance::dataset::{Dataset, UpdateBuilder, WhenMatched, WriteParams};
+use lance::dataset::{
+    ColumnAlteration, Dataset, NewColumnTransform, UpdateBuilder, WhenMatched, WriteMode,
+    WriteParams,
+};
 use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
 use lance::io::WrappingObjectStore;
 use lance_index::{optimize::OptimizeOptions, DatasetIndexExt};
@@ -38,10 +41,11 @@ use crate::index::vector::{VectorIndex, VectorIndexStatistics};
 use crate::index::IndexBuilder;
 use crate::query::Query;
 use crate::utils::{PatchReadParam, PatchWriteParam};
-use crate::WriteMode;
 
+use self::dataset::DatasetConsistencyWrapper;
 use self::merge::{MergeInsert, MergeInsertBuilder};
 
+pub(crate) mod dataset;
 pub mod merge;
 
 /// Optimize the dataset.
@@ -85,6 +89,35 @@ pub struct OptimizeStats {
     pub prune: Option<RemovalStats>,
 }
 
+/// Options to use when writing data
+#[derive(Clone, Debug, Default)]
+pub struct WriteOptions {
+    // Coming soon: https://github.com/lancedb/lancedb/issues/992
+    // /// What behavior to take if the data contains invalid vectors
+    // pub on_bad_vectors: BadVectorHandling,
+    /// Advanced parameters that can be used to customize table creation
+    ///
+    /// If set, these will take precedence over any overlapping `OpenTableOptions` options
+    pub lance_write_params: Option<WriteParams>,
+}
+
+#[derive(Debug, Clone, Default)]
+pub enum AddDataMode {
+    /// Rows will be appended to the table (the default)
+    #[default]
+    Append,
+    /// The existing table will be overwritten with the new data
+    Overwrite,
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct AddDataOptions {
+    /// Whether to add new rows (the default) or replace the existing data
+    pub mode: AddDataMode,
+    /// Options to use when writing the data
+    pub write_options: WriteOptions,
+}
+
 /// A Table is a collection of strongly typed Rows.
 ///
 /// The type of each row is defined in Apache Arrow [Schema].
@@ -99,7 +132,7 @@ pub trait Table: std::fmt::Display + Send + Sync {
     fn name(&self) -> &str;
 
     /// Get the arrow [Schema] of the table.
-    fn schema(&self) -> SchemaRef;
+    async fn schema(&self) -> Result<SchemaRef>;
 
     /// Count the number of rows in this dataset.
     ///
@@ -112,12 +145,12 @@ pub trait Table: std::fmt::Display + Send + Sync {
     ///
     /// # Arguments
     ///
-    /// * `batches` RecordBatch to be saved in the Table
-    /// * `params` Append / Overwrite existing records. Default: Append
+    /// * `batches` data to be added to the Table
+    /// * `options` options to control how data is added
     async fn add(
        &self,
        batches: Box<dyn RecordBatchReader + Send>,
-        params: Option<WriteParams>,
+        options: AddDataOptions,
    ) -> Result<()>;
 
     /// Delete the rows from the table that match the predicate.
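A quick sketch of the new `add` contract for reviewers: `AddDataOptions` replaces the old `Option<WriteParams>` argument, with `AddDataMode` choosing between append and overwrite. The `tbl` handle and the `batches`/`more_batches` readers here are assumed for illustration; explicit `WriteParams` can still be threaded through `WriteOptions::lance_write_params` (and, as the tests later in this diff show, they take precedence over `mode`):

```rust
// Append (the default):
tbl.add(Box::new(batches), AddDataOptions::default()).await?;

// Overwrite the table's contents instead:
tbl.add(
    Box::new(more_batches),
    AddDataOptions {
        mode: AddDataMode::Overwrite,
        ..Default::default()
    },
)
.await?;
```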
@@ -129,28 +162,43 @@ pub trait Table: std::fmt::Display + Send + Sync { /// /// ```no_run /// # use std::sync::Arc; - /// # use vectordb::connection::{Database, Connection}; /// # use arrow_array::{FixedSizeListArray, types::Float32Type, RecordBatch, /// # RecordBatchIterator, Int32Array}; /// # use arrow_schema::{Schema, Field, DataType}; /// # tokio::runtime::Runtime::new().unwrap().block_on(async { /// let tmpdir = tempfile::tempdir().unwrap(); - /// let db = Database::connect(tmpdir.path().to_str().unwrap()).await.unwrap(); + /// let db = lancedb::connect(tmpdir.path().to_str().unwrap()) + /// .execute() + /// .await + /// .unwrap(); /// # let schema = Arc::new(Schema::new(vec![ /// # Field::new("id", DataType::Int32, false), /// # Field::new("vector", DataType::FixedSizeList( /// # Arc::new(Field::new("item", DataType::Float32, true)), 128), true), /// # ])); - /// let batches = RecordBatchIterator::new(vec![ - /// RecordBatch::try_new(schema.clone(), - /// vec![ - /// Arc::new(Int32Array::from_iter_values(0..10)), - /// Arc::new(FixedSizeListArray::from_iter_primitive::( - /// (0..10).map(|_| Some(vec![Some(1.0); 128])), 128)), - /// ]).unwrap() - /// ].into_iter().map(Ok), - /// schema.clone()); - /// let tbl = db.create_table("delete_test", Box::new(batches), None).await.unwrap(); + /// let batches = RecordBatchIterator::new( + /// vec![RecordBatch::try_new( + /// schema.clone(), + /// vec![ + /// Arc::new(Int32Array::from_iter_values(0..10)), + /// Arc::new( + /// FixedSizeListArray::from_iter_primitive::( + /// (0..10).map(|_| Some(vec![Some(1.0); 128])), + /// 128, + /// ), + /// ), + /// ], + /// ) + /// .unwrap()] + /// .into_iter() + /// .map(Ok), + /// schema.clone(), + /// ); + /// let tbl = db + /// .create_table("delete_test", Box::new(batches)) + /// .execute() + /// .await + /// .unwrap(); /// tbl.delete("id > 5").await.unwrap(); /// # }); /// ``` @@ -162,14 +210,16 @@ pub trait Table: std::fmt::Display + Send + Sync { /// /// ```no_run /// # use std::sync::Arc; - /// # use vectordb::connection::{Database, Connection}; /// # use arrow_array::{FixedSizeListArray, types::Float32Type, RecordBatch, /// # RecordBatchIterator, Int32Array}; /// # use arrow_schema::{Schema, Field, DataType}; /// # tokio::runtime::Runtime::new().unwrap().block_on(async { /// let tmpdir = tempfile::tempdir().unwrap(); - /// let db = Database::connect(tmpdir.path().to_str().unwrap()).await.unwrap(); - /// # let tbl = db.open_table("idx_test").await.unwrap(); + /// let db = lancedb::connect(tmpdir.path().to_str().unwrap()) + /// .execute() + /// .await + /// .unwrap(); + /// # let tbl = db.open_table("idx_test").execute().await.unwrap(); /// tbl.create_index(&["vector"]) /// .ivf_pq() /// .num_partitions(256) @@ -214,32 +264,44 @@ pub trait Table: std::fmt::Display + Send + Sync { /// /// ```no_run /// # use std::sync::Arc; - /// # use vectordb::connection::{Database, Connection}; /// # use arrow_array::{FixedSizeListArray, types::Float32Type, RecordBatch, /// # RecordBatchIterator, Int32Array}; /// # use arrow_schema::{Schema, Field, DataType}; /// # tokio::runtime::Runtime::new().unwrap().block_on(async { /// let tmpdir = tempfile::tempdir().unwrap(); - /// let db = Database::connect(tmpdir.path().to_str().unwrap()).await.unwrap(); - /// # let tbl = db.open_table("idx_test").await.unwrap(); + /// let db = lancedb::connect(tmpdir.path().to_str().unwrap()) + /// .execute() + /// .await + /// .unwrap(); + /// # let tbl = db.open_table("idx_test").execute().await.unwrap(); /// # let schema = 
Arc::new(Schema::new(vec![ /// # Field::new("id", DataType::Int32, false), /// # Field::new("vector", DataType::FixedSizeList( /// # Arc::new(Field::new("item", DataType::Float32, true)), 128), true), /// # ])); - /// let new_data = RecordBatchIterator::new(vec![ - /// RecordBatch::try_new(schema.clone(), - /// vec![ - /// Arc::new(Int32Array::from_iter_values(0..10)), - /// Arc::new(FixedSizeListArray::from_iter_primitive::( - /// (0..10).map(|_| Some(vec![Some(1.0); 128])), 128)), - /// ]).unwrap() - /// ].into_iter().map(Ok), - /// schema.clone()); + /// let new_data = RecordBatchIterator::new( + /// vec![RecordBatch::try_new( + /// schema.clone(), + /// vec![ + /// Arc::new(Int32Array::from_iter_values(0..10)), + /// Arc::new( + /// FixedSizeListArray::from_iter_primitive::( + /// (0..10).map(|_| Some(vec![Some(1.0); 128])), + /// 128, + /// ), + /// ), + /// ], + /// ) + /// .unwrap()] + /// .into_iter() + /// .map(Ok), + /// schema.clone(), + /// ); /// // Perform an upsert operation /// let mut merge_insert = tbl.merge_insert(&["id"]); - /// merge_insert.when_matched_update_all(None) - /// .when_not_matched_insert_all(); + /// merge_insert + /// .when_matched_update_all(None) + /// .when_not_matched_insert_all(); /// merge_insert.execute(Box::new(new_data)).await.unwrap(); /// # }); /// ``` @@ -265,8 +327,11 @@ pub trait Table: std::fmt::Display + Send + Sync { /// # use arrow_array::RecordBatch; /// # use futures::TryStreamExt; /// # tokio::runtime::Runtime::new().unwrap().block_on(async { - /// # let tbl = vectordb::table::NativeTable::open("/tmp/tbl").await.unwrap(); - /// let stream = tbl.query().nearest_to(&[1.0, 2.0, 3.0]) + /// # let tbl = lancedb::table::NativeTable::open("/tmp/tbl").await.unwrap(); + /// use crate::lancedb::Table; + /// let stream = tbl + /// .query() + /// .nearest_to(&[1.0, 2.0, 3.0]) /// .refine_factor(5) /// .nprobes(10) /// .execute_stream() @@ -281,7 +346,8 @@ pub trait Table: std::fmt::Display + Send + Sync { /// # use arrow_array::RecordBatch; /// # use futures::TryStreamExt; /// # tokio::runtime::Runtime::new().unwrap().block_on(async { - /// # let tbl = vectordb::table::NativeTable::open("/tmp/tbl").await.unwrap(); + /// # let tbl = lancedb::table::NativeTable::open("/tmp/tbl").await.unwrap(); + /// use crate::lancedb::Table; /// let stream = tbl /// .query() /// .filter("id > 5") @@ -298,12 +364,9 @@ pub trait Table: std::fmt::Display + Send + Sync { /// # use arrow_array::RecordBatch; /// # use futures::TryStreamExt; /// # tokio::runtime::Runtime::new().unwrap().block_on(async { - /// # let tbl = vectordb::table::NativeTable::open("/tmp/tbl").await.unwrap(); - /// let stream = tbl - /// .query() - /// .execute_stream() - /// .await - /// .unwrap(); + /// # let tbl = lancedb::table::NativeTable::open("/tmp/tbl").await.unwrap(); + /// use crate::lancedb::Table; + /// let stream = tbl.query().execute_stream().await.unwrap(); /// let batches: Vec = stream.try_collect().await.unwrap(); /// # }); /// ``` @@ -313,9 +376,22 @@ pub trait Table: std::fmt::Display + Send + Sync { /// ///
Experimental API
/// - /// Modeled after ``VACCUM`` in PostgreSQL. + /// Modeled after ``VACUUM`` in PostgreSQL. /// Not all implementations support explicit optimization. async fn optimize(&self, action: OptimizeAction) -> Result; + + /// Add new columns to the table, providing values to fill in. + async fn add_columns( + &self, + transforms: NewColumnTransform, + read_columns: Option>, + ) -> Result<()>; + + /// Change a column's name or nullability. + async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()>; + + /// Remove columns from the table. + async fn drop_columns(&self, columns: &[&str]) -> Result<()>; } /// Reference to a Table pointer. @@ -326,10 +402,14 @@ pub type TableRef = Arc; pub struct NativeTable { name: String, uri: String, - dataset: Arc>, + pub(crate) dataset: dataset::DatasetConsistencyWrapper, // the object store wrapper to use on write path store_wrapper: Option>, + + // This comes from the connection options. We store here so we can pass down + // to the dataset when we recreate it (for example, in checkout_latest). + read_consistency_interval: Option, } impl std::fmt::Display for NativeTable { @@ -351,7 +431,7 @@ impl NativeTable { /// * A [NativeTable] object. pub async fn open(uri: &str) -> Result { let name = Self::get_table_name(uri)?; - Self::open_with_params(uri, &name, None, ReadParams::default()).await + Self::open_with_params(uri, &name, None, None, None).await } /// Opens an existing Table @@ -369,8 +449,10 @@ impl NativeTable { uri: &str, name: &str, write_store_wrapper: Option>, - params: ReadParams, + params: Option, + read_consistency_interval: Option, ) -> Result { + let params = params.unwrap_or_default(); // patch the params if we have a write store wrapper let params = match write_store_wrapper.clone() { Some(wrapper) => params.patch_with_store_wrapper(wrapper)?, @@ -389,24 +471,22 @@ impl NativeTable { message: e.to_string(), }, })?; + + let dataset = DatasetConsistencyWrapper::new_latest(dataset, read_consistency_interval); + Ok(Self { name: name.to_string(), uri: uri.to_string(), - dataset: Arc::new(Mutex::new(dataset)), + dataset, store_wrapper: write_store_wrapper, + read_consistency_interval, }) } - /// Make a new clone of the internal lance dataset. - pub(crate) fn clone_inner_dataset(&self) -> Dataset { - self.dataset.lock().expect("Lock poison").clone() - } - /// Checkout a specific version of this [NativeTable] - /// pub async fn checkout(uri: &str, version: u64) -> Result { let name = Self::get_table_name(uri)?; - Self::checkout_with_params(uri, &name, version, None, ReadParams::default()).await + Self::checkout_with_params(uri, &name, version, None, ReadParams::default(), None).await } pub async fn checkout_with_params( @@ -415,44 +495,35 @@ impl NativeTable { version: u64, write_store_wrapper: Option>, params: ReadParams, + read_consistency_interval: Option, ) -> Result { // patch the params if we have a write store wrapper let params = match write_store_wrapper.clone() { Some(wrapper) => params.patch_with_store_wrapper(wrapper)?, None => params, }; - let dataset = Dataset::checkout_with_params(uri, version, ¶ms) - .await - .map_err(|e| match e { - lance::Error::DatasetNotFound { .. 
} => Error::TableNotFound { - name: name.to_string(), - }, - e => Error::Lance { - message: e.to_string(), - }, - })?; + let dataset = DatasetBuilder::from_uri(uri) + .with_version(version) + .with_read_params(params) + .load() + .await?; + let dataset = DatasetConsistencyWrapper::new_time_travel(dataset, version); + Ok(Self { name: name.to_string(), uri: uri.to_string(), - dataset: Arc::new(Mutex::new(dataset)), + dataset, store_wrapper: write_store_wrapper, + read_consistency_interval, }) } pub async fn checkout_latest(&self) -> Result { - let dataset = self.clone_inner_dataset(); - let latest_version_id = dataset.latest_version_id().await?; - let dataset = if latest_version_id == dataset.version().version { - dataset - } else { - dataset.checkout_version(latest_version_id).await? - }; - + let mut dataset = self.dataset.duplicate().await; + dataset.as_latest(self.read_consistency_interval).await?; Ok(Self { - name: self.name.clone(), - uri: self.uri.clone(), - dataset: Arc::new(Mutex::new(dataset)), - store_wrapper: self.store_wrapper.clone(), + dataset, + ..self.clone() }) } @@ -488,14 +559,16 @@ impl NativeTable { batches: impl RecordBatchReader + Send + 'static, write_store_wrapper: Option>, params: Option, + read_consistency_interval: Option, ) -> Result { + let params = params.unwrap_or_default(); // patch the params if we have a write store wrapper let params = match write_store_wrapper.clone() { Some(wrapper) => params.patch_with_store_wrapper(wrapper)?, None => params, }; - let dataset = Dataset::write(batches, uri, params) + let dataset = Dataset::write(batches, uri, Some(params)) .await .map_err(|e| match e { lance::Error::DatasetAlreadyExists { .. } => Error::TableAlreadyExists { @@ -508,34 +581,47 @@ impl NativeTable { Ok(Self { name: name.to_string(), uri: uri.to_string(), - dataset: Arc::new(Mutex::new(dataset)), + dataset: DatasetConsistencyWrapper::new_latest(dataset, read_consistency_interval), store_wrapper: write_store_wrapper, + read_consistency_interval, }) } + pub async fn create_empty( + uri: &str, + name: &str, + schema: SchemaRef, + write_store_wrapper: Option>, + params: Option, + read_consistency_interval: Option, + ) -> Result { + let batches = RecordBatchIterator::new(vec![], schema); + Self::create( + uri, + name, + batches, + write_store_wrapper, + params, + read_consistency_interval, + ) + .await + } + /// Version of this Table - pub fn version(&self) -> u64 { - self.dataset.lock().expect("lock poison").version().version + pub async fn version(&self) -> Result { + Ok(self.dataset.get().await?.version().version) } async fn optimize_indices(&self, options: &OptimizeOptions) -> Result<()> { info!("LanceDB: optimizing indices: {:?}", options); - let mut dataset = self.clone_inner_dataset(); - dataset.optimize_indices(options).await?; - + self.dataset + .get_mut() + .await? + .optimize_indices(options) + .await?; Ok(()) } - pub fn query(&self) -> Query { - Query::new(self.clone_inner_dataset().into()) - } - - pub fn filter(&self, expr: String) -> Query { - Query::new(self.clone_inner_dataset().into()).filter(expr) - } - - /// Returns the number of rows in this Table - /// Merge new data into this table. pub async fn merge( &mut self, @@ -543,14 +629,17 @@ impl NativeTable { left_on: &str, right_on: &str, ) -> Result<()> { - let mut dataset = self.clone_inner_dataset(); - dataset.merge(batches, left_on, right_on).await?; - self.dataset = Arc::new(Mutex::new(dataset)); + self.dataset + .get_mut() + .await? 
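+            // get_mut() refreshes a stale dataset first and holds the write lock while merging.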
+ .merge(batches, left_on, right_on) + .await?; Ok(()) } pub async fn update(&self, predicate: Option<&str>, updates: Vec<(&str, &str)>) -> Result<()> { - let mut builder = UpdateBuilder::new(self.clone_inner_dataset().into()); + let dataset = self.dataset.get().await?.clone(); + let mut builder = UpdateBuilder::new(Arc::new(dataset)); if let Some(predicate) = predicate { builder = builder.update_where(predicate)?; } @@ -561,7 +650,7 @@ impl NativeTable { let operation = builder.build()?; let ds = operation.execute().await?; - self.reset_dataset(ds.as_ref().clone()); + self.dataset.set_latest(ds.as_ref().clone()).await; Ok(()) } @@ -581,8 +670,10 @@ impl NativeTable { older_than: Duration, delete_unverified: Option, ) -> Result { - let dataset = self.clone_inner_dataset(); - Ok(dataset + Ok(self + .dataset + .get_mut() + .await? .cleanup_old_versions(older_than, delete_unverified) .await?) } @@ -598,24 +689,27 @@ impl NativeTable { options: CompactionOptions, remap_options: Option>, ) -> Result { - let mut dataset = self.clone_inner_dataset(); - let metrics = compact_files(&mut dataset, options, remap_options).await?; - self.reset_dataset(dataset); + let mut dataset_mut = self.dataset.get_mut().await?; + let metrics = compact_files(&mut dataset_mut, options, remap_options).await?; Ok(metrics) } - pub fn count_fragments(&self) -> usize { - self.dataset.lock().expect("lock poison").count_fragments() + // TODO: why are these individual methods and not some single "get_stats" method? + pub async fn count_fragments(&self) -> Result { + Ok(self.dataset.get().await?.count_fragments()) } pub async fn count_deleted_rows(&self) -> Result { - let dataset = self.clone_inner_dataset(); - Ok(dataset.count_deleted_rows().await?) + Ok(self.dataset.get().await?.count_deleted_rows().await?) } - pub async fn num_small_files(&self, max_rows_per_group: usize) -> usize { - let dataset = self.clone_inner_dataset(); - dataset.num_small_files(max_rows_per_group).await + pub async fn num_small_files(&self, max_rows_per_group: usize) -> Result { + Ok(self + .dataset + .get() + .await? 
+ .num_small_files(max_rows_per_group) + .await) } pub async fn count_indexed_rows(&self, index_uuid: &str) -> Result> { @@ -633,7 +727,7 @@ impl NativeTable { } pub async fn load_indices(&self) -> Result> { - let dataset = self.clone_inner_dataset(); + let dataset = self.dataset.get().await?; let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?; Ok(indices .iter() @@ -650,7 +744,7 @@ impl NativeTable { if index.is_none() { return Ok(None); } - let dataset = self.clone_inner_dataset(); + let dataset = self.dataset.get().await?; let index_stats = dataset.index_statistics(&index.unwrap().index_name).await?; let index_stats: VectorIndexStatistics = serde_json::from_str(&index_stats).map_err(|e| Error::Lance { @@ -662,10 +756,6 @@ impl NativeTable { Ok(Some(index_stats)) } - - pub(crate) fn reset_dataset(&self, dataset: Dataset) { - *self.dataset.lock().expect("lock poison") = dataset; - } } #[async_trait] @@ -675,7 +765,7 @@ impl MergeInsert for NativeTable { params: MergeInsertBuilder, new_data: Box, ) -> Result<()> { - let dataset = Arc::new(self.clone_inner_dataset()); + let dataset = Arc::new(self.dataset.get().await?.clone()); let mut builder = LanceMergeInsertBuilder::try_new(dataset.clone(), params.on)?; match ( params.when_matched_update_all, @@ -702,7 +792,7 @@ impl MergeInsert for NativeTable { } let job = builder.try_build()?; let new_dataset = job.execute_reader(new_data).await?; - self.reset_dataset((*new_dataset).clone()); + self.dataset.set_latest(new_dataset.as_ref().clone()).await; Ok(()) } } @@ -721,13 +811,13 @@ impl Table for NativeTable { self.name.as_str() } - fn schema(&self) -> SchemaRef { - let lance_schema = { self.dataset.lock().expect("lock poison").schema().clone() }; - Arc::new(Schema::from(&lance_schema)) + async fn schema(&self) -> Result { + let lance_schema = self.dataset.get().await?.schema().clone(); + Ok(Arc::new(Schema::from(&lance_schema))) } async fn count_rows(&self, filter: Option) -> Result { - let dataset = { self.dataset.lock().expect("lock poison").clone() }; + let dataset = self.dataset.get().await?; if let Some(filter) = filter { let mut scanner = dataset.scan(); scanner.filter(&filter)?; @@ -740,20 +830,27 @@ impl Table for NativeTable { async fn add( &self, batches: Box, - params: Option, + params: AddDataOptions, ) -> Result<()> { - let params = Some(params.unwrap_or(WriteParams { - mode: WriteMode::Append, - ..WriteParams::default() - })); + let lance_params = params + .write_options + .lance_write_params + .unwrap_or(WriteParams { + mode: match params.mode { + AddDataMode::Append => WriteMode::Append, + AddDataMode::Overwrite => WriteMode::Overwrite, + }, + ..Default::default() + }); // patch the params if we have a write store wrapper - let params = match self.store_wrapper.clone() { - Some(wrapper) => params.patch_with_store_wrapper(wrapper)?, - None => params, + let lance_params = match self.store_wrapper.clone() { + Some(wrapper) => lance_params.patch_with_store_wrapper(wrapper)?, + None => lance_params, }; - self.reset_dataset(Dataset::write(batches, &self.uri, params).await?); + let dataset = Dataset::write(batches, &self.uri, Some(lance_params)).await?; + self.dataset.set_latest(dataset).await; Ok(()) } @@ -767,14 +864,12 @@ impl Table for NativeTable { } fn query(&self) -> Query { - Query::new(Arc::new(self.dataset.lock().expect("lock poison").clone())) + Query::new(self.dataset.clone()) } /// Delete rows from the table async fn delete(&self, predicate: &str) -> Result<()> { - let mut dataset = 
self.clone_inner_dataset(); - dataset.delete(predicate).await?; - self.reset_dataset(dataset); + self.dataset.get_mut().await?.delete(predicate).await?; Ok(()) } @@ -823,6 +918,33 @@ impl Table for NativeTable { } Ok(stats) } + + async fn add_columns( + &self, + transforms: NewColumnTransform, + read_columns: Option>, + ) -> Result<()> { + self.dataset + .get_mut() + .await? + .add_columns(transforms, read_columns) + .await?; + Ok(()) + } + + async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<()> { + self.dataset + .get_mut() + .await? + .alter_columns(alterations) + .await?; + Ok(()) + } + + async fn drop_columns(&self, columns: &[&str]) -> Result<()> { + self.dataset.get_mut().await?.drop_columns(columns).await?; + Ok(()) + } } #[cfg(test)] @@ -830,6 +952,7 @@ mod tests { use std::iter; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; + use std::time::Duration; use arrow_array::{ Array, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array, @@ -845,6 +968,8 @@ mod tests { use rand::Rng; use tempfile::tempdir; + use crate::connection::ConnectBuilder; + use super::*; #[tokio::test] @@ -881,32 +1006,13 @@ mod tests { assert_eq!(c.to_str().unwrap(), "s3://bucket/path/to/file/subfile"); } - #[tokio::test] - async fn test_create_already_exists() { - let tmp_dir = tempdir().unwrap(); - let uri = tmp_dir.path().to_str().unwrap(); - - let batches = make_test_batches(); - let _ = batches.schema().clone(); - NativeTable::create(uri, "test", batches, None, None) - .await - .unwrap(); - - let batches = make_test_batches(); - let result = NativeTable::create(uri, "test", batches, None, None).await; - assert!(matches!( - result.unwrap_err(), - Error::TableAlreadyExists { .. } - )); - } - #[tokio::test] async fn test_count_rows() { let tmp_dir = tempdir().unwrap(); let uri = tmp_dir.path().to_str().unwrap(); let batches = make_test_batches(); - let table = NativeTable::create(uri, "test", batches, None, None) + let table = NativeTable::create(uri, "test", batches, None, None, None) .await .unwrap(); @@ -924,7 +1030,7 @@ mod tests { let batches = make_test_batches(); let schema = batches.schema().clone(); - let table = NativeTable::create(uri, "test", batches, None, None) + let table = NativeTable::create(uri, "test", batches, None, None, None) .await .unwrap(); assert_eq!(table.count_rows(None).await.unwrap(), 10); @@ -940,7 +1046,10 @@ mod tests { schema.clone(), ); - table.add(Box::new(new_batches), None).await.unwrap(); + table + .add(Box::new(new_batches), AddDataOptions::default()) + .await + .unwrap(); assert_eq!(table.count_rows(None).await.unwrap(), 20); assert_eq!(table.name, "test"); } @@ -952,7 +1061,7 @@ mod tests { // Create a dataset with i=0..10 let batches = merge_insert_test_batches(0, 0); - let table = NativeTable::create(uri, "test", batches, None, None) + let table = NativeTable::create(uri, "test", batches, None, None, None) .await .unwrap(); assert_eq!(table.count_rows(None).await.unwrap(), 10); @@ -998,28 +1107,52 @@ mod tests { let batches = make_test_batches(); let schema = batches.schema().clone(); - let table = NativeTable::create(uri, "test", batches, None, None) + let table = NativeTable::create(uri, "test", batches, None, None, None) .await .unwrap(); assert_eq!(table.count_rows(None).await.unwrap(), 10); - let new_batches = RecordBatchIterator::new( - vec![RecordBatch::try_new( - schema.clone(), - vec![Arc::new(Int32Array::from_iter_values(100..110))], - ) - .unwrap()] - .into_iter() - .map(Ok), + let batches = 
vec![RecordBatch::try_new( schema.clone(), - ); + vec![Arc::new(Int32Array::from_iter_values(100..110))], + ) + .unwrap()] + .into_iter() + .map(Ok); + + let new_batches = RecordBatchIterator::new(batches.clone(), schema.clone()); + + // Can overwrite using AddDataOptions::mode + table + .add( + Box::new(new_batches), + AddDataOptions { + mode: AddDataMode::Overwrite, + ..Default::default() + }, + ) + .await + .unwrap(); + assert_eq!(table.count_rows(None).await.unwrap(), 10); + assert_eq!(table.name, "test"); + + // Can overwrite using underlying WriteParams (which + // take precedence over AddDataOptions::mode) let param: WriteParams = WriteParams { mode: WriteMode::Overwrite, ..Default::default() }; - table.add(Box::new(new_batches), Some(param)).await.unwrap(); + let opts = AddDataOptions { + write_options: WriteOptions { + lance_write_params: Some(param), + }, + mode: AddDataMode::Append, + }; + + let new_batches = RecordBatchIterator::new(batches.clone(), schema.clone()); + table.add(Box::new(new_batches), opts).await.unwrap(); assert_eq!(table.count_rows(None).await.unwrap(), 10); assert_eq!(table.name, "test"); } @@ -1329,7 +1462,7 @@ mod tests { ..Default::default() }; assert!(!wrapper.called()); - let _ = NativeTable::open_with_params(uri, "test", None, param) + let _ = NativeTable::open_with_params(uri, "test", None, Some(param), None) .await .unwrap(); assert!(wrapper.called()); @@ -1403,7 +1536,7 @@ mod tests { schema, ); - let table = NativeTable::create(uri, "test", batches, None, None) + let table = NativeTable::create(uri, "test", batches, None, None, None) .await .unwrap(); @@ -1448,4 +1581,68 @@ mod tests { Ok(FixedSizeListArray::from(data)) } + + #[tokio::test] + async fn test_read_consistency_interval() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1]))], + ) + .unwrap(); + + let intervals = vec![ + None, + Some(0), + Some(100), // 100 ms + ]; + + for interval in intervals { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + + let conn1 = ConnectBuilder::new(uri).execute().await.unwrap(); + let table1 = conn1 + .create_empty_table("my_table", batch.schema()) + .execute() + .await + .unwrap(); + + let mut conn2 = ConnectBuilder::new(uri); + if let Some(interval) = interval { + conn2 = conn2.read_consistency_interval(std::time::Duration::from_millis(interval)); + } + let conn2 = conn2.execute().await.unwrap(); + let table2 = conn2.open_table("my_table").execute().await.unwrap(); + + assert_eq!(table1.count_rows(None).await.unwrap(), 0); + assert_eq!(table2.count_rows(None).await.unwrap(), 0); + + table1 + .add( + Box::new(RecordBatchIterator::new( + vec![Ok(batch.clone())], + batch.schema(), + )), + AddDataOptions::default(), + ) + .await + .unwrap(); + assert_eq!(table1.count_rows(None).await.unwrap(), 1); + + match interval { + None => { + assert_eq!(table2.count_rows(None).await.unwrap(), 0); + } + Some(0) => { + assert_eq!(table2.count_rows(None).await.unwrap(), 1); + } + Some(100) => { + assert_eq!(table2.count_rows(None).await.unwrap(), 0); + tokio::time::sleep(Duration::from_millis(100)).await; + assert_eq!(table2.count_rows(None).await.unwrap(), 1); + } + _ => unreachable!(), + } + } + } } diff --git a/rust/lancedb/src/table/dataset.rs b/rust/lancedb/src/table/dataset.rs new file mode 100644 index 00000000..1772d5c1 --- /dev/null +++ b/rust/lancedb/src/table/dataset.rs @@ -0,0 +1,234 @@ +// Copyright 2024 LanceDB Developers. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::{
+    ops::{Deref, DerefMut},
+    sync::Arc,
+    time::{self, Duration, Instant},
+};
+
+use lance::Dataset;
+use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+
+use crate::error::Result;
+
+/// A wrapper around a [Dataset] that provides lazy-loading and consistency checks.
+///
+/// This can be cloned cheaply. It supports concurrent reads or exclusive writes.
+#[derive(Debug, Clone)]
+pub struct DatasetConsistencyWrapper(Arc<RwLock<DatasetRef>>);
+
+/// A wrapper around a [Dataset] that provides consistency checks.
+///
+/// The dataset is lazily loaded, and starts off as None. On the first access,
+/// the dataset is loaded.
+#[derive(Debug, Clone)]
+enum DatasetRef {
+    /// In this mode, the dataset is always the latest version.
+    Latest {
+        dataset: Dataset,
+        read_consistency_interval: Option<Duration>,
+        last_consistency_check: Option<Instant>,
+    },
+    /// In this mode, the dataset is a specific version. It cannot be mutated.
+    TimeTravel { dataset: Dataset, version: u64 },
+}
+
+impl DatasetRef {
+    /// Reload the dataset to the appropriate version.
+    async fn reload(&mut self) -> Result<()> {
+        match self {
+            Self::Latest {
+                dataset,
+                last_consistency_check,
+                ..
+            } => {
+                *dataset = dataset
+                    .checkout_version(dataset.latest_version_id().await?)
+                    .await?;
+                last_consistency_check.replace(Instant::now());
+            }
+            Self::TimeTravel { dataset, version } => {
+                dataset.checkout_version(*version).await?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn as_latest(&mut self, read_consistency_interval: Option<Duration>) -> Result<()> {
+        match self {
+            Self::Latest { .. } => Ok(()),
+            Self::TimeTravel { dataset, .. } => {
+                dataset
+                    .checkout_version(dataset.latest_version_id().await?)
+                    .await?;
+                *self = Self::Latest {
+                    dataset: dataset.clone(),
+                    read_consistency_interval,
+                    last_consistency_check: Some(Instant::now()),
+                };
+                Ok(())
+            }
+        }
+    }
+
+    fn set_latest(&mut self, dataset: Dataset) {
+        match self {
+            Self::Latest {
+                dataset: ref mut ds,
+                ..
+            } => {
+                *ds = dataset;
+            }
+            _ => unreachable!("Dataset should be in latest mode at this point"),
+        }
+    }
+}
+
+impl DatasetConsistencyWrapper {
+    /// Create a new wrapper in the latest version mode.
+    pub fn new_latest(dataset: Dataset, read_consistency_interval: Option<Duration>) -> Self {
+        Self(Arc::new(RwLock::new(DatasetRef::Latest {
+            dataset,
+            read_consistency_interval,
+            last_consistency_check: None,
+        })))
+    }
+
+    /// Create a new wrapper in the time travel mode.
+    pub fn new_time_travel(dataset: Dataset, version: u64) -> Self {
+        Self(Arc::new(RwLock::new(DatasetRef::TimeTravel {
+            dataset,
+            version,
+        })))
+    }
+
+    /// Create an independent copy of self.
+    ///
+    /// Unlike Clone, this will track versions independently of the original wrapper and
+    /// will be tied to a different RwLock.
+    pub async fn duplicate(&self) -> Self {
+        let ds_ref = self.0.read().await;
+        Self(Arc::new(RwLock::new((*ds_ref).clone())))
+    }
+
+    /// Get an immutable reference to the dataset.
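+    ///
+    /// If a read consistency interval is configured and has elapsed, the dataset
+    /// is reloaded to the latest version before the guard is returned (see
+    /// `ensure_up_to_date` below).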
+    pub async fn get(&self) -> Result<DatasetReadGuard<'_>> {
+        self.ensure_up_to_date().await?;
+        Ok(DatasetReadGuard {
+            guard: self.0.read().await,
+        })
+    }
+
+    /// Get a mutable reference to the dataset.
+    pub async fn get_mut(&self) -> Result<DatasetWriteGuard<'_>> {
+        self.ensure_up_to_date().await?;
+        Ok(DatasetWriteGuard {
+            guard: self.0.write().await,
+        })
+    }
+
+    /// Convert into a wrapper in latest version mode
+    pub async fn as_latest(&mut self, read_consistency_interval: Option<Duration>) -> Result<()> {
+        self.0
+            .write()
+            .await
+            .as_latest(read_consistency_interval)
+            .await
+    }
+
+    /// Provide a known latest version of the dataset.
+    ///
+    /// This is usually done after some write operation, which inherently will
+    /// have the latest version.
+    pub async fn set_latest(&self, dataset: Dataset) {
+        self.0.write().await.set_latest(dataset);
+    }
+
+    async fn reload(&self) -> Result<()> {
+        self.0.write().await.reload().await
+    }
+
+    async fn is_up_to_date(&self) -> Result<bool> {
+        let dataset_ref = self.0.read().await;
+        match &*dataset_ref {
+            DatasetRef::Latest {
+                read_consistency_interval,
+                last_consistency_check,
+                ..
+            } => match (read_consistency_interval, last_consistency_check) {
+                (None, _) => Ok(true),
+                (Some(_), None) => Ok(false),
+                (Some(read_consistency_interval), Some(last_consistency_check)) => {
+                    if &last_consistency_check.elapsed() < read_consistency_interval {
+                        Ok(true)
+                    } else {
+                        Ok(false)
+                    }
+                }
+            },
+            DatasetRef::TimeTravel { dataset, version } => {
+                Ok(dataset.version().version == *version)
+            }
+        }
+    }
+
+    /// Ensures that the dataset is loaded and up-to-date with consistency and
+    /// version parameters.
+    async fn ensure_up_to_date(&self) -> Result<()> {
+        if !self.is_up_to_date().await? {
+            self.reload().await?;
+        }
+        Ok(())
+    }
+}
+
+pub struct DatasetReadGuard<'a> {
+    guard: RwLockReadGuard<'a, DatasetRef>,
+}
+
+impl Deref for DatasetReadGuard<'_> {
+    type Target = Dataset;
+
+    fn deref(&self) -> &Self::Target {
+        match &*self.guard {
+            DatasetRef::Latest { dataset, .. } => dataset,
+            DatasetRef::TimeTravel { dataset, .. } => dataset,
+        }
+    }
+}
+
+pub struct DatasetWriteGuard<'a> {
+    guard: RwLockWriteGuard<'a, DatasetRef>,
+}
+
+impl Deref for DatasetWriteGuard<'_> {
+    type Target = Dataset;
+
+    fn deref(&self) -> &Self::Target {
+        match &*self.guard {
+            DatasetRef::Latest { dataset, .. } => dataset,
+            DatasetRef::TimeTravel { dataset, .. } => dataset,
+        }
+    }
+}
+
+impl DerefMut for DatasetWriteGuard<'_> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        match &mut *self.guard {
+            DatasetRef::Latest { dataset, .. } => dataset,
+            DatasetRef::TimeTravel { dataset, ..
} => dataset, + } + } +} diff --git a/rust/vectordb/src/table/merge.rs b/rust/lancedb/src/table/merge.rs similarity index 100% rename from rust/vectordb/src/table/merge.rs rename to rust/lancedb/src/table/merge.rs diff --git a/rust/vectordb/src/utils.rs b/rust/lancedb/src/utils.rs similarity index 91% rename from rust/vectordb/src/utils.rs rename to rust/lancedb/src/utils.rs index a3a03382..ec508de3 100644 --- a/rust/vectordb/src/utils.rs +++ b/rust/lancedb/src/utils.rs @@ -32,20 +32,17 @@ impl PatchStoreParam for Option { } pub trait PatchWriteParam { - fn patch_with_store_wrapper( - self, - wrapper: Arc, - ) -> Result>; + fn patch_with_store_wrapper(self, wrapper: Arc) + -> Result; } -impl PatchWriteParam for Option { +impl PatchWriteParam for WriteParams { fn patch_with_store_wrapper( - self, + mut self, wrapper: Arc, - ) -> Result> { - let mut params = self.unwrap_or_default(); - params.store_params = params.store_params.patch_with_store_wrapper(wrapper)?; - Ok(Some(params)) + ) -> Result { + self.store_params = self.store_params.patch_with_store_wrapper(wrapper)?; + Ok(self) } } diff --git a/rust/lancedb/tests/lancedb_cloud.rs b/rust/lancedb/tests/lancedb_cloud.rs new file mode 100644 index 00000000..9bf75e91 --- /dev/null +++ b/rust/lancedb/tests/lancedb_cloud.rs @@ -0,0 +1,67 @@ +// Copyright 2024 LanceDB Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::RecordBatchIterator; + +#[tokio::test] +#[ignore] +async fn cloud_integration_test() { + let project = std::env::var("LANCEDB_PROJECT") + .expect("the LANCEDB_PROJECT env must be set to run the cloud integration test"); + let api_key = std::env::var("LANCEDB_API_KEY") + .expect("the LANCEDB_API_KEY env must be set to run the cloud integration test"); + let region = std::env::var("LANCEDB_REGION") + .expect("the LANCEDB_REGION env must be set to run the cloud integration test"); + let host_override = std::env::var("LANCEDB_HOST_OVERRIDE") + .map(Some) + .unwrap_or(None); + if host_override.is_none() { + println!("No LANCEDB_HOST_OVERRIDE has been set. 
Running integration test against LanceDb Cloud production instance"); + } + + let mut builder = lancedb::connect(&format!("db://{}", project)) + .api_key(&api_key) + .region(®ion); + if let Some(host_override) = &host_override { + builder = builder.host_override(host_override); + } + let db = builder.execute().await.unwrap(); + + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new("id", arrow_schema::DataType::Int64, false), + arrow_schema::Field::new("name", arrow_schema::DataType::Utf8, false), + ])); + let initial_data = arrow::record_batch::RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow_array::Int64Array::from(vec![1, 2, 3])), + Arc::new(arrow_array::StringArray::from(vec!["a", "b", "c"])), + ], + ); + let rbr = RecordBatchIterator::new(vec![initial_data], schema); + + let name = uuid::Uuid::new_v4().to_string(); + let tbl = db + .create_table(name.clone(), Box::new(rbr)) + .execute() + .await + .unwrap(); + + assert_eq!(tbl.name(), name); + + let table_names = db.table_names().await.unwrap(); + assert!(table_names.contains(&name)); +} diff --git a/rust/vectordb/src/connection.rs b/rust/vectordb/src/connection.rs deleted file mode 100644 index cb2800c5..00000000 --- a/rust/vectordb/src/connection.rs +++ /dev/null @@ -1,471 +0,0 @@ -// Copyright 2023 LanceDB Developers. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! LanceDB Database -//! - -use std::fs::create_dir_all; -use std::path::Path; -use std::sync::Arc; - -use arrow_array::RecordBatchReader; -use lance::dataset::WriteParams; -use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore}; -use object_store::{ - aws::AwsCredential, local::LocalFileSystem, CredentialProvider, StaticCredentialProvider, -}; -use snafu::prelude::*; - -use crate::error::{CreateDirSnafu, Error, InvalidTableNameSnafu, Result}; -use crate::io::object_store::MirroringObjectStoreWrapper; -use crate::table::{NativeTable, ReadParams, TableRef}; - -pub const LANCE_FILE_EXTENSION: &str = "lance"; - -/// A connection to LanceDB -#[async_trait::async_trait] -pub trait Connection: Send + Sync { - /// Get the names of all tables in the database. - async fn table_names(&self) -> Result>; - - /// Create a new table in the database. - /// - /// # Parameters - /// - /// * `name` - The name of the table. - /// * `batches` - The initial data to write to the table. - /// * `params` - Optional [`WriteParams`] to create the table. - /// - /// # Returns - /// Created [`TableRef`], or [`Err(Error::TableAlreadyExists)`] if the table already exists. - async fn create_table( - &self, - name: &str, - batches: Box, - params: Option, - ) -> Result; - - async fn open_table(&self, name: &str) -> Result { - self.open_table_with_params(name, ReadParams::default()) - .await - } - - async fn open_table_with_params(&self, name: &str, params: ReadParams) -> Result; - - /// Drop a table in the database. - /// - /// # Arguments - /// * `name` - The name of the table. 
- async fn drop_table(&self, name: &str) -> Result<()>; -} - -#[derive(Debug)] -pub struct ConnectOptions { - /// Database URI - /// - /// # Accpeted URI formats - /// - /// - `/path/to/database` - local database on file system. - /// - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud object store - /// - `db://dbname` - Lance Cloud - pub uri: String, - - /// Lance Cloud API key - pub api_key: Option, - /// Lance Cloud region - pub region: Option, - /// Lance Cloud host override - pub host_override: Option, - - /// User provided AWS credentials - pub aws_creds: Option, - - /// The maximum number of indices to cache in memory. Defaults to 256. - pub index_cache_size: u32, -} - -impl ConnectOptions { - /// Create a new [`ConnectOptions`] with the given database URI. - pub fn new(uri: &str) -> Self { - Self { - uri: uri.to_string(), - api_key: None, - region: None, - host_override: None, - aws_creds: None, - index_cache_size: 256, - } - } - - pub fn api_key(mut self, api_key: &str) -> Self { - self.api_key = Some(api_key.to_string()); - self - } - - pub fn region(mut self, region: &str) -> Self { - self.region = Some(region.to_string()); - self - } - - pub fn host_override(mut self, host_override: &str) -> Self { - self.host_override = Some(host_override.to_string()); - self - } - - /// [`AwsCredential`] to use when connecting to S3. - /// - pub fn aws_creds(mut self, aws_creds: AwsCredential) -> Self { - self.aws_creds = Some(aws_creds); - self - } - - pub fn index_cache_size(mut self, index_cache_size: u32) -> Self { - self.index_cache_size = index_cache_size; - self - } -} - -/// Connect to a LanceDB database. -/// -/// # Arguments -/// -/// - `uri` - URI where the database is located, can be a local file or a supported remote cloud storage -/// -/// ## Accepted URI formats -/// -/// - `/path/to/database` - local database on file system. -/// - `s3://bucket/path/to/database` or `gs://bucket/path/to/database` - database on cloud object store -/// - `db://dbname` - Lance Cloud -/// -pub async fn connect(uri: &str) -> Result> { - let options = ConnectOptions::new(uri); - connect_with_options(&options).await -} - -/// Connect with [`ConnectOptions`]. -/// -/// # Arguments -/// - `options` - [`ConnectOptions`] to connect to the database. -pub async fn connect_with_options(options: &ConnectOptions) -> Result> { - let db = Database::connect(&options.uri).await?; - Ok(Arc::new(db)) -} - -pub struct Database { - object_store: ObjectStore, - query_string: Option, - - pub(crate) uri: String, - pub(crate) base_path: object_store::path::Path, - - // the object store wrapper to use on write path - pub(crate) store_wrapper: Option>, -} - -const LANCE_EXTENSION: &str = "lance"; -const ENGINE: &str = "engine"; -const MIRRORED_STORE: &str = "mirroredStore"; - -/// A connection to LanceDB -impl Database { - /// Connects to LanceDB - /// - /// # Arguments - /// - /// * `uri` - URI where the database is located, can be a local file or a supported remote cloud storage - /// - /// # Returns - /// - /// * A [Database] object. 
- pub async fn connect(uri: &str) -> Result { - let options = ConnectOptions::new(uri); - Self::connect_with_options(&options).await - } - - pub async fn connect_with_options(options: &ConnectOptions) -> Result { - let uri = &options.uri; - let parse_res = url::Url::parse(uri); - - match parse_res { - Ok(url) if url.scheme().len() == 1 && cfg!(windows) => Self::open_path(uri).await, - Ok(mut url) => { - // iter thru the query params and extract the commit store param - let mut engine = None; - let mut mirrored_store = None; - let mut filtered_querys = vec![]; - - // WARNING: specifying engine is NOT a publicly supported feature in lancedb yet - // THE API WILL CHANGE - for (key, value) in url.query_pairs() { - if key == ENGINE { - engine = Some(value.to_string()); - } else if key == MIRRORED_STORE { - if cfg!(windows) { - return Err(Error::Lance { - message: "mirrored store is not supported on windows".into(), - }); - } - mirrored_store = Some(value.to_string()); - } else { - // to owned so we can modify the url - filtered_querys.push((key.to_string(), value.to_string())); - } - } - - // Filter out the commit store query param -- it's a lancedb param - url.query_pairs_mut().clear(); - url.query_pairs_mut().extend_pairs(filtered_querys); - // Take a copy of the query string so we can propagate it to lance - let query_string = url.query().map(|s| s.to_string()); - // clear the query string so we can use the url as the base uri - // use .set_query(None) instead of .set_query("") because the latter - // will add a trailing '?' to the url - url.set_query(None); - - let table_base_uri = if let Some(store) = engine { - static WARN_ONCE: std::sync::Once = std::sync::Once::new(); - WARN_ONCE.call_once(|| { - log::warn!("Specifing engine is not a publicly supported feature in lancedb yet. 
THE API WILL CHANGE"); - }); - let old_scheme = url.scheme().to_string(); - let new_scheme = format!("{}+{}", old_scheme, store); - url.to_string().replacen(&old_scheme, &new_scheme, 1) - } else { - url.to_string() - }; - - let plain_uri = url.to_string(); - let os_params: ObjectStoreParams = if let Some(aws_creds) = &options.aws_creds { - let credential_provider: Arc< - dyn CredentialProvider, - > = Arc::new(StaticCredentialProvider::new(AwsCredential { - key_id: aws_creds.key_id.clone(), - secret_key: aws_creds.secret_key.clone(), - token: aws_creds.token.clone(), - })); - ObjectStoreParams::with_aws_credentials( - Some(credential_provider), - options.region.clone(), - ) - } else { - ObjectStoreParams::default() - }; - let (object_store, base_path) = - ObjectStore::from_uri_and_params(&plain_uri, &os_params).await?; - if object_store.is_local() { - Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?; - } - - let write_store_wrapper = match mirrored_store { - Some(path) => { - let mirrored_store = Arc::new(LocalFileSystem::new_with_prefix(path)?); - let wrapper = MirroringObjectStoreWrapper::new(mirrored_store); - Some(Arc::new(wrapper) as Arc) - } - None => None, - }; - - Ok(Self { - uri: table_base_uri, - query_string, - base_path, - object_store, - store_wrapper: write_store_wrapper, - }) - } - Err(_) => Self::open_path(uri).await, - } - } - - async fn open_path(path: &str) -> Result { - let (object_store, base_path) = ObjectStore::from_uri(path).await?; - if object_store.is_local() { - Self::try_create_dir(path).context(CreateDirSnafu { path })?; - } - Ok(Self { - uri: path.to_string(), - query_string: None, - base_path, - object_store, - store_wrapper: None, - }) - } - - /// Try to create a local directory to store the lancedb dataset - fn try_create_dir(path: &str) -> core::result::Result<(), std::io::Error> { - let path = Path::new(path); - if !path.try_exists()? { - create_dir_all(path)?; - } - Ok(()) - } - - /// Get the URI of a table in the database. - fn table_uri(&self, name: &str) -> Result { - let path = Path::new(&self.uri); - let table_uri = path.join(format!("{}.{}", name, LANCE_FILE_EXTENSION)); - - let mut uri = table_uri - .as_path() - .to_str() - .context(InvalidTableNameSnafu { name })? - .to_string(); - - // If there are query string set on the connection, propagate to lance - if let Some(query) = self.query_string.as_ref() { - uri.push('?'); - uri.push_str(query.as_str()); - } - - Ok(uri) - } -} - -#[async_trait::async_trait] -impl Connection for Database { - async fn table_names(&self) -> Result> { - let mut f = self - .object_store - .read_dir(self.base_path.clone()) - .await? - .iter() - .map(Path::new) - .filter(|path| { - let is_lance = path - .extension() - .and_then(|e| e.to_str()) - .map(|e| e == LANCE_EXTENSION); - is_lance.unwrap_or(false) - }) - .filter_map(|p| p.file_stem().and_then(|s| s.to_str().map(String::from))) - .collect::>(); - f.sort(); - Ok(f) - } - - async fn create_table( - &self, - name: &str, - batches: Box, - params: Option, - ) -> Result { - let table_uri = self.table_uri(name)?; - - Ok(Arc::new( - NativeTable::create( - &table_uri, - name, - batches, - self.store_wrapper.clone(), - params, - ) - .await?, - )) - } - - /// Open a table in the database. - /// - /// # Arguments - /// * `name` - The name of the table. - /// * `params` - The parameters to open the table. - /// - /// # Returns - /// - /// * A [TableRef] object. 
- async fn open_table_with_params(&self, name: &str, params: ReadParams) -> Result { - let table_uri = self.table_uri(name)?; - Ok(Arc::new( - NativeTable::open_with_params(&table_uri, name, self.store_wrapper.clone(), params) - .await?, - )) - } - - async fn drop_table(&self, name: &str) -> Result<()> { - let dir_name = format!("{}.{}", name, LANCE_EXTENSION); - let full_path = self.base_path.child(dir_name.clone()); - self.object_store.remove_dir_all(full_path).await?; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::fs::create_dir_all; - - use tempfile::tempdir; - - use super::*; - - #[tokio::test] - async fn test_connect() { - let tmp_dir = tempdir().unwrap(); - let uri = tmp_dir.path().to_str().unwrap(); - let db = Database::connect(uri).await.unwrap(); - - assert_eq!(db.uri, uri); - } - - #[cfg(not(windows))] - #[tokio::test] - async fn test_connect_relative() { - let tmp_dir = tempdir().unwrap(); - let uri = std::fs::canonicalize(tmp_dir.path().to_str().unwrap()).unwrap(); - - let current_dir = std::env::current_dir().unwrap(); - let ancestors = current_dir.ancestors(); - let relative_ancestors = vec![".."; ancestors.count()]; - - let relative_root = std::path::PathBuf::from(relative_ancestors.join("/")); - let relative_uri = relative_root.join(&uri); - - let db = Database::connect(relative_uri.to_str().unwrap()) - .await - .unwrap(); - - assert_eq!(db.uri, relative_uri.to_str().unwrap().to_string()); - } - - #[tokio::test] - async fn test_table_names() { - let tmp_dir = tempdir().unwrap(); - create_dir_all(tmp_dir.path().join("table1.lance")).unwrap(); - create_dir_all(tmp_dir.path().join("table2.lance")).unwrap(); - create_dir_all(tmp_dir.path().join("invalidlance")).unwrap(); - - let uri = tmp_dir.path().to_str().unwrap(); - let db = Database::connect(uri).await.unwrap(); - let tables = db.table_names().await.unwrap(); - assert_eq!(tables.len(), 2); - assert!(tables[0].eq(&String::from("table1"))); - assert!(tables[1].eq(&String::from("table2"))); - } - - #[tokio::test] - async fn test_connect_s3() { - // let db = Database::connect("s3://bucket/path/to/database").await.unwrap(); - } - - #[tokio::test] - async fn drop_table() { - let tmp_dir = tempdir().unwrap(); - create_dir_all(tmp_dir.path().join("table1.lance")).unwrap(); - - let uri = tmp_dir.path().to_str().unwrap(); - let db = Database::connect(uri).await.unwrap(); - db.drop_table("table1").await.unwrap(); - - let tables = db.table_names().await.unwrap(); - assert_eq!(tables.len(), 0); - } -}
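End to end, the new remote path is reached with a `db://` URI. Below is a condensed sketch modeled on the `cloud_integration_test` above; the project name, region, table name, and schema are illustrative, and error handling is collapsed into `Box<dyn Error>`:

```rust
use std::sync::Arc;

use arrow_array::{Int64Array, RecordBatchIterator, StringArray};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A `db://` URI routes the connection to RemoteDatabase (and RemoteTable)
    // instead of the local NativeTable path.
    let db = lancedb::connect("db://my-project")
        .api_key(&std::env::var("LANCEDB_API_KEY")?)
        .region("us-east-1")
        .execute()
        .await?;

    let schema = Arc::new(arrow_schema::Schema::new(vec![
        arrow_schema::Field::new("id", arrow_schema::DataType::Int64, false),
        arrow_schema::Field::new("name", arrow_schema::DataType::Utf8, false),
    ]));
    let data = arrow::record_batch::RecordBatch::try_new(
        schema.clone(),
        vec![
            Arc::new(Int64Array::from(vec![1, 2, 3])),
            Arc::new(StringArray::from(vec!["a", "b", "c"])),
        ],
    );
    let reader = RecordBatchIterator::new(vec![data], schema);

    // Internally this serializes the batches to Arrow IPC bytes and POSTs them
    // to /v1/table/{name}/create, as in RemoteDatabase::do_create_table.
    let tbl = db.create_table("example", Box::new(reader)).execute().await?;
    println!("created remote table: {}", tbl.name());
    Ok(())
}
```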