make node available to all users

mac debug info
add dbg prints
2025-12-24 13:59:58 +00:00 · 2023-05-25 17:50:07 -07:00 · 2023-05-25 13:33:54 -07:00 · 2023-05-25 09:44:02 -07:00 · 2023-05-25 09:21:40 -07:00 · 2023-05-25 09:21:40 -07:00
35 changed files with 1574 additions and 100 deletions
--- a/.github/workflows/make_release_commit.yml
+++ b/.github/workflows/make_release_commit.yml
@@ -0,0 +1,70 @@
+name: Create release commit
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: 'Just create the local commit/tags but do not push it'
+        required: true
+        default: "false"
+        type: choice
+        options:
+          - "true"
+          - "false"
+      part:
+        description: 'What kind of release is this?'
+        required: true
+        default: 'patch'
+        type: choice
+        options:
+          - patch
+          - minor
+          - major
+
+jobs:
+  bump-version:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Check out main
+      uses: actions/checkout@v3
+      with:
+        ref: main
+        persist-credentials: false
+        fetch-depth: 0
+        lfs: true
+    - name: Install cargo utils
+      run: cargo install cargo-bump cargo-get
+    - name: Bump vectordb
+      working-directory: rust/vectordb
+      run: |
+        cargo bump ${{ inputs.part }}
+        echo "CRATE_VERSION=$(cargo get version)" >> $GITHUB_ENV
+    - name: Bump rust/ffi/node
+      working-directory: rust/ffi/node
+      run: |
+        cargo bump ${{ inputs.part }}
+        echo "FFI_CRATE_VERSION=$(cargo get version)" >> $GITHUB_ENV 
+    - name: Bump node
+      working-directory: node
+      run: |
+        npm version ${{ inputs.part }}
+        echo "NPM_PACKAGE_VERSION=$(cat package.json | jq -r '.version')" >> $GITHUB_ENV 
+    - name: Create tag
+      run: |
+        if [ "$CRATE_VERSION" != "$FFI_CRATE_VERSION" ]; then
+          echo "Version mismatch between rust/vectordb and rust/ffi/node"
+          exit 1
+        fi
+        if [ "$CRATE_VERSION" != "$NPM_PACKAGE_VERSION" ]; then
+          echo "Version mismatch between rust/vectordb and node"
+          exit 1
+        fi
+        # NOTE(review): no step in this job runs `git commit`; confirm the bumps are committed before tagging.
+        git tag "v$CRATE_VERSION"
+    - name: Push new version and tag
+      if: ${{ inputs.dry_run == 'false' }}  # dry_run is a "true"/"false" choice string; compare inside the expression
+      uses: ad-m/github-push-action@master
+      with:
+        github_token: ${{ secrets.RELEASE_TOKEN }}
+        branch: main
+        tags: true
--- a/.github/workflows/node.yml
+++ b/.github/workflows/node.yml
@@ -67,8 +67,10 @@ jobs:
    - name: Build
      run: |
        npm ci
-        npm run build
        npm run tsc
+        npm run build
+        npm run pack-build
+        npm install --no-save ./dist/vectordb-*.tgz
    - name: Test
      run: npm run test
  macos:
@@ -94,8 +96,10 @@ jobs:
    - name: Build
      run: |
        npm ci
-        npm run build
        npm run tsc
+        npm run build
+        npm run pack-build
+        npm install --no-save ./dist/vectordb-*.tgz
    - name: Test
      run: |
        npm run test
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -30,7 +30,7 @@ jobs:
        python-version: 3.${{ matrix.python-minor-version }}
    - name: Install lancedb
      run: |
-        pip install -e .
+        pip install -e ".[fts]"
        pip install pytest
    - name: Run tests
      run: pytest -x -v --durations=30 tests
@@ -49,10 +49,10 @@ jobs:
    - name: Set up Python
      uses: actions/setup-python@v4
      with:
-        python-version: "3.10"
+        python-version: "3.11"
    - name: Install lancedb
      run: |
-        pip install -e .
+        pip install -e ".[fts]"
        pip install pytest
    - name: Run tests
      run: pytest -x -v --durations=30 tests
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -0,0 +1,174 @@
+name: Prepare Release
+
+# TODO: bump versions in CI
+# NOTE: Python is a separate release for now.
+
+on:
+  push:
+    tags:
+      - v*
+
+jobs:
+  draft-release:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: softprops/action-gh-release@v1
+        with:
+          draft: true
+          prerelease: true # hardcoded on for now
+          generate_release_notes: true
+  
+  rust:
+    runs-on: ubuntu-latest
+    needs: draft-release
+    defaults:
+      run:
+        shell: bash
+        working-directory: rust/vectordb
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y protobuf-compiler libssl-dev
+      - name: Package Rust
+        run: cargo package --all-features
+      - uses: softprops/action-gh-release@v1
+        with:
+          draft: true
+          files: target/package/vectordb-*.crate
+          fail_on_unmatched_files: true
+
+  node:
+    runs-on: ubuntu-latest
+    needs: draft-release
+    defaults:
+      run:
+        shell: bash
+        working-directory: node
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - uses: actions/setup-node@v3
+        with:
+          node-version: 20
+          cache: 'npm'
+          cache-dependency-path: node/package-lock.json
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y protobuf-compiler libssl-dev
+      - name: Build
+        run: |
+          npm ci
+          npm run tsc
+          npm pack
+      - uses: softprops/action-gh-release@v1
+        with:
+          draft: true
+          files: node/vectordb-*.tgz
+          fail_on_unmatched_files: true
+
+  node-macos:
+    runs-on: macos-12
+    needs: draft-release
+    strategy:
+      fail-fast: false
+      matrix:
+        target: [x86_64-apple-darwin, aarch64-apple-darwin]
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+    - name: Install system dependencies
+      run: brew install protobuf
+    - name: Install npm dependencies
+      run: |
+        cd node
+        npm ci
+    - name: Install rustup target
+      if: ${{ matrix.target == 'aarch64-apple-darwin' }}
+      run: rustup target add aarch64-apple-darwin
+    - name: Build MacOS native node modules
+      run: bash ci/build_macos_artifacts.sh ${{ matrix.target }}
+    - uses: softprops/action-gh-release@v1
+      with:
+        draft: true
+        files: node/dist/vectordb-darwin*.tgz
+        fail_on_unmatched_files: true
+
+  node-linux:
+    name: node-linux (${{ matrix.arch}}-unknown-linux-${{ matrix.libc }})
+    runs-on: ubuntu-latest
+    needs: draft-release
+    strategy:
+      fail-fast: false
+      matrix:
+        libc:
+          - gnu
+          # TODO: re-enable musl once we have refactored to pre-built containers
+          # Right now we have to build node from source which is too expensive.
+          # - musl
+        arch:
+          - x86_64
+          - aarch64
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+    - name: Change owner to root (for npm)
+      # The docker container is run as root, so we need the files to be owned by root
+      # Otherwise npm is a nightmare: https://github.com/npm/cli/issues/3773
+      run: sudo chown -R root:root .
+    - name: Set up QEMU
+      if: ${{ matrix.arch == 'aarch64' }}
+      uses: docker/setup-qemu-action@v2  
+      with:
+        platforms: arm64
+    - name: Build Linux GNU native node modules
+      if: ${{ matrix.libc == 'gnu' }}
+      run: |
+        docker run \
+          -v $(pwd):/io -w /io \
+          quay.io/pypa/manylinux2014_${{ matrix.arch }} \
+          bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-gnu
+    - name: Build musl Linux native node modules
+      if: ${{ matrix.libc == 'musl' }}  # the per-arch image name selects the platform, as in the gnu step
+      run: |
+        docker run \
+          -v $(pwd):/io -w /io \
+          quay.io/pypa/musllinux_1_1_${{ matrix.arch }} \
+          bash ci/build_linux_artifacts.sh ${{ matrix.arch }}-unknown-linux-musl
+    - uses: softprops/action-gh-release@v1
+      with:
+        draft: true
+        files: node/dist/vectordb-linux*.tgz
+        fail_on_unmatched_files: true
+
+  release:
+    needs: [rust, node, node-macos, node-linux]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v3
+      - name: Publish to PyPI
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          python -m twine upload --non-interactive \
+            --skip-existing \
+            --repository testpypi python/dist/*
+      - name: Publish to NPM
+        run: |
+          for filename in node/dist/*.tgz; do
+            npm publish --dry-run $filename
+          done
+      - name: Publish to crates.io
+        env:
+          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+        run: |
+          cargo publish --dry-run --no-verify rust/target/vectordb-*.crate
+      # - uses: softprops/action-gh-release@v1
+      #   with:
+      #     draft: false
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@
 **/__pycache__
 .DS_Store

+.vscode
+
 rust/target
 rust/Cargo.lock

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3356,18 +3356,20 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"

 [[package]]
 name = "vectordb"
-version = "0.0.1"
+version = "0.1.2"
 dependencies = [
 "arrow-array",
+ "arrow-data",
 "arrow-schema",
 "lance",
+ "rand",
 "tempfile",
 "tokio",
 ]

 [[package]]
 name = "vectordb-node"
-version = "0.1.0"
+version = "0.1.2"
 dependencies = [
 "arrow-array",
 "arrow-ipc",
--- a/ci/build_linux_artifacts.sh
+++ b/ci/build_linux_artifacts.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# Builds the Linux artifacts (node binaries).
+# Usage: ./build_linux_artifacts.sh [target]
+# Targets supported: 
+# - x86_64-unknown-linux-gnu:centos
+# - aarch64-unknown-linux-gnu:centos
+# - aarch64-unknown-linux-musl
+# - x86_64-unknown-linux-musl
+
+# TODO: refactor this into a Docker container we can pull
+
+set -e
+
+setup_dependencies() {
+    echo "Installing system dependencies..."
+    if [[ $1 == *musl ]]; then
+        # musllinux
+        apk add openssl-dev
+    else
+        # manylinux2014
+        yum install -y openssl-devel unzip
+    fi
+
+    if [[ $1 == x86_64* ]]; then
+        ARCH=x86_64
+    else
+        # aarch64 target (protobuf release assets spell it aarch_64)
+        ARCH=aarch_64
+    fi
+    
+    # Install new enough protobuf (yum-provided is old)
+    PB_REL=https://github.com/protocolbuffers/protobuf/releases
+    PB_VERSION=23.1
+    curl -LO $PB_REL/download/v$PB_VERSION/protoc-$PB_VERSION-linux-$ARCH.zip
+    unzip protoc-$PB_VERSION-linux-$ARCH.zip -d /usr/local
+}
+
+install_node() {
+    echo "Installing node..."
+    curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.34.0/install.sh | bash
+    source "$HOME"/.bashrc
+
+    if [[ $1 == *musl ]]; then
+        # This node version is 15, we need 16 or higher:
+        # apk add nodejs-current npm 
+        # So instead we install from source (nvm doesn't provide binaries for musl):
+        nvm install -s --no-progress 17
+    else
+        nvm install --no-progress 17 # latest that supports glibc 2.17
+    fi
+}
+
+install_rust() {
+    echo "Installing rust..."
+    curl https://sh.rustup.rs -sSf | bash -s -- -y
+    export PATH="$PATH:/root/.cargo/bin"
+}
+
+build_node_binary() {
+    echo "Building node library for $1..."
+    pushd node
+
+    npm ci
+    
+    if [[ $1 == *musl ]]; then
+        # This is needed for cargo to allow build cdylibs with musl
+        export RUSTFLAGS="-C target-feature=-crt-static"
+    fi
+
+    # Cargo can run out of memory while pulling dependencies, especially when running
+    # in QEMU. This is a workaround for that.
+    export CARGO_NET_GIT_FETCH_WITH_CLI=true
+
+    # We don't pass in target, since the native target here already matches
+    # and openblas-src doesn't do well with cross-compilation.
+    npm run build-release
+    npm run pack-build
+
+    popd
+}
+
+TARGET=${1:-x86_64-unknown-linux-gnu}
+# Others:
+# aarch64-unknown-linux-gnu
+# x86_64-unknown-linux-musl
+# aarch64-unknown-linux-musl
+
+setup_dependencies $TARGET
+install_node $TARGET
+install_rust
+build_node_binary $TARGET
--- a/ci/build_macos_artifacts.sh
+++ b/ci/build_macos_artifacts.sh
@@ -0,0 +1,38 @@
+# Builds the macOS artifacts (node binaries).
+# Usage: ./build_macos_artifacts.sh [target]
+# Targets supported: x86_64-apple-darwin aarch64-apple-darwin
+
+prebuild_rust() {
+    # Building here for the sake of easier debugging.
+    pushd rust/ffi/node
+
+    for target in $1
+    do
+        echo "Building rust library for $target"
+        export RUST_BACKTRACE=1
+        cargo build --release --target $target
+    done
+
+    popd
+}
+
+build_node_binaries() {
+    pushd node
+    
+    for target in $1
+    do
+        echo "Building node library for $target"
+        npm run build-release -- --target $target
+        npm run pack-build -- --target $target
+    done
+    popd
+}
+
+if [ -n "$1" ]; then
+    targets=$1
+else
+    targets="x86_64-apple-darwin aarch64-apple-darwin"
+fi
+
+prebuild_rust $targets
+build_node_binaries $targets
--- a/ci/release_process.md
+++ b/ci/release_process.md
@@ -0,0 +1,90 @@
+# How to release
+
+This is for the Rust crate and Node module. For now, the Python module is
+released separately.
+
+The release is started by bumping the versions and pushing a new tag. To do this
+automatically, use the `make_release_commit` GitHub action.
+
+When the tag is pushed, GitHub actions will start building the libraries and
+will upload them to a draft release.
+
+While those jobs are running, edit the release notes as needed. For example, 
+bring relevant new features and bugfixes to the top of the notes and the testing
+and CI changes to the bottom.
+
+Once the jobs have finished, the release will be marked as not draft and the
+artifacts will be released to crates.io, NPM, and PyPI.
+
+## Manual process
+
+You can also build the artifacts locally on a MacOS machine.
+
+### Build the MacOS release libraries
+
+One-time setup:
+
+```shell
+rustup target add x86_64-apple-darwin aarch64-apple-darwin
+```
+
+To build:
+
+```shell
+bash ci/build_macos_artifacts.sh
+```
+
+### Build the Linux release libraries
+
+To build a Linux library, we need to use docker with a different build script:
+
+```shell
+ARCH=aarch64
+docker run \
+    -v $(pwd):/io -w /io \
+    quay.io/pypa/manylinux2014_$ARCH \
+    bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-gnu
+```
+
+You can change `ARCH` to `x86_64`.
+
+Similar script for musl binaries (not yet working):
+
+```shell
+ARCH=aarch64
+docker run \
+    --user $(id -u) \
+    -v $(pwd):/io -w /io \
+    quay.io/pypa/musllinux_1_1_$ARCH \
+    bash ci/build_linux_artifacts.sh $ARCH-unknown-linux-musl
+```
+
+<!--
+
+For debugging, use these snippets:
+
+```shell
+ARCH=aarch64
+docker run -it \
+    -v $(pwd):/io -w /io \
+    quay.io/pypa/manylinux2014_$ARCH \
+    bash
+```
+
+```shell
+ARCH=aarch64
+docker run -it \
+    -v $(pwd):/io -w /io \
+    quay.io/pypa/musllinux_1_1_$ARCH \
+    bash
+```
+
+Note: musllinux_1_1 is Alpine Linux 3.12
+-->
+
+```
+docker run \
+    -v $(pwd):/io -w /io \
+    quay.io/pypa/musllinux_1_1_aarch64 \
+    bash alpine_repro.sh
+```
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -19,6 +19,7 @@ nav:
 - Basics: basic.md
 - Embeddings: embedding.md
 - Indexing: ann_indexes.md
+- Full-text search: fts.md
 - Integrations: integrations.md
 - Python API: python.md

--- a/docs/src/fts.md
+++ b/docs/src/fts.md
@@ -0,0 +1,50 @@
+# [EXPERIMENTAL] Full text search
+
+LanceDB now provides experimental support for full text search.
+This is currently Python only. We plan to push the integration down to Rust in the future
+to make this available for JS as well.
+
+## Installation
+
+To use full text search, you must install the fts optional dependencies:
+
+`pip install lancedb[fts]`
+
+
+## Quickstart
+
+Assume:
+1. `table` is a LanceDB Table
+2. `text` is the name of the Table column that we want to index
+
+To create the index:
+
+```python
+table.create_fts_index("text")
+```
+
+To search:
+
+```python
+df = table.search("puppy").limit(10).select(["text"]).to_df()
+```
+
+LanceDB automatically looks for an FTS index if the input is str.
+
+## Multiple text columns
+
+If you have multiple columns to index, pass them all as a list to `create_fts_index`:
+
+```python
+table.create_fts_index(["text1", "text2"])
+```
+
+Note that the search API call does not change - you can search over all indexed columns at once.
+
+## Current limitations
+
+1. Currently we do not yet support incremental writes.
+If you add data after fts index creation, it won't be reflected
+in search results until you do a full reindex.
+
+2. We currently only support local filesystem paths for the fts index.
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -45,5 +45,6 @@ We will be adding completed demo apps built using LanceDB.
 * [`Basic Operations`](basic.md) - basic functionality of LanceDB.
 * [`Embedding Functions`](embedding.md) - functions for working with embeddings.
 * [`Indexing`](ann_indexes.md) - create vector indexes to speed up queries.
+* [`Full text search`](fts.md) - [EXPERIMENTAL] full-text search API
 * [`Ecosystem Integrations`](integrations.md) - integrating LanceDB with python data tooling ecosystem.
 * [`API Reference`](python.md) - detailed documentation for the LanceDB Python SDK.
--- a/node/.npmignore
+++ b/node/.npmignore
@@ -0,0 +1,2 @@
+gen_test_data.py
+index.node
--- a/node/README.md
+++ b/node/README.md
@@ -8,6 +8,10 @@ A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb)
 npm install vectordb
 ```

+This will download the appropriate native library for your platform. We currently
+support x86_64 Linux, aarch64 Linux, Intel MacOS, and ARM (M1/M2) MacOS. We do not
+yet support Windows or musl-based Linux (such as Alpine Linux).
+
 ## Usage

 ### Basic Example
@@ -24,6 +28,19 @@ The [examples](./examples) folder contains complete examples.

 ## Development

+Build and install the rust library with:
+
+```bash
+npm run build
+npm run pack-build
+npm install --no-save ./dist/vectordb-*.tgz
+```
+
+`npm run build` builds the Rust library, `npm run pack-build` packages the Rust
+binary into an npm module called `@vectordb/<platform>` (for example,
+`@vectordb/darwin-arm64`), and then `npm install ...` installs that
+module.
+
 The LanceDB javascript is built with npm:

 ```bash
--- a/node/native.js
+++ b/node/native.js
@@ -12,29 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+const { currentTarget } = require('@neon-rs/load');
+
 let nativeLib;

-function getPlatformLibrary() {
-    if (process.platform === "darwin" && process.arch == "arm64") {
-        return require('./aarch64-apple-darwin.node');
-    } else if (process.platform === "darwin" && process.arch == "x64") {
-        return require('./x86_64-apple-darwin.node');
-    } else if (process.platform === "linux" && process.arch == "x64") {
-        return require('./x86_64-unknown-linux-gnu.node');
-    } else {
-        throw new Error(`vectordb: unsupported platform ${process.platform}_${process.arch}. Please file a bug report at https://github.com/lancedb/lancedb/issues`)
-    }
-}
-
 try {
-    nativeLib = require('./index.node')
+    nativeLib = require(`@vectordb/${currentTarget()}`);
 } catch (e) {
-    if (e.code === "MODULE_NOT_FOUND") {
-        nativeLib = getPlatformLibrary();
-    } else {
-        throw new Error('vectordb: failed to load native library. Please file a bug report at https://github.com/lancedb/lancedb/issues');
-    }
+    throw new Error(`vectordb: failed to load native library.
+  You may need to run \`npm install @vectordb/${currentTarget()}\`.
+
+  If that does not work, please file a bug report at https://github.com/lancedb/lancedb/issues
+
+  Source error: ${e}`);
 }

-module.exports = nativeLib
-
+// Dynamic require for runtime.
+module.exports = nativeLib;
--- a/node/package-lock.json
+++ b/node/package-lock.json
@@ -7,12 +7,26 @@
    "": {
      "name": "vectordb",
      "version": "0.1.1",
+      "cpu": [
+        "x64",
+        "arm64"
+      ],
      "license": "Apache-2.0",
+      "os": [
+        "darwin",
+        "linux"
+      ],
      "dependencies": {
        "@apache-arrow/ts": "^12.0.0",
+        "@neon-rs/load": "^0.0.74",
+        "@vectordb/darwin-arm64": "0.1.1",
+        "@vectordb/darwin-x64": "0.1.1",
+        "@vectordb/linux-x64-gnu": "0.1.1",
+        "@vectordb/linux-x64-musl": "0.1.1",
        "apache-arrow": "^12.0.0"
      },
      "devDependencies": {
+        "@neon-rs/cli": "^0.0.74",
        "@types/chai": "^4.3.4",
        "@types/mocha": "^10.0.1",
        "@types/node": "^18.16.2",
@@ -30,6 +44,12 @@
        "ts-node": "^10.9.1",
        "ts-node-dev": "^2.0.0",
        "typescript": "*"
+      },
+      "optionalDependencies": {
+        "@vectordb/darwin-arm64": "0.1.1",
+        "@vectordb/darwin-x64": "0.1.1",
+        "@vectordb/linux-x64-gnu": "0.1.1",
+        "@vectordb/linux-x64-musl": "0.1.1"
      }
    },
    "node_modules/@apache-arrow/ts": {
@@ -197,6 +217,20 @@
        "@jridgewell/sourcemap-codec": "^1.4.10"
      }
    },
+    "node_modules/@neon-rs/cli": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
+      "integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
+      "dev": true,
+      "bin": {
+        "neon": "index.js"
+      }
+    },
+    "node_modules/@neon-rs/load": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
+      "integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
+    },
    "node_modules/@nodelib/fs.scandir": {
      "version": "2.1.5",
      "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -4191,6 +4225,17 @@
        "@jridgewell/sourcemap-codec": "^1.4.10"
      }
    },
+    "@neon-rs/cli": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.74.tgz",
+      "integrity": "sha512-9lPmNmjej5iKKOTMPryOMubwkgMRyTWRuaq1yokASvI5mPhr2kzPN7UVjdCOjQvpunNPngR9yAHoirpjiWhUHw==",
+      "dev": true
+    },
+    "@neon-rs/load": {
+      "version": "0.0.74",
+      "resolved": "https://registry.npmjs.org/@neon-rs/load/-/load-0.0.74.tgz",
+      "integrity": "sha512-/cPZD907UNz55yrc/ud4wDgQKtU1TvkD9jeqZWG6J4IMmZkp6zgjkQcKA8UvpkZlcpPHvc8J17sGzLFbP/LUYg=="
+    },
    "@nodelib/fs.scandir": {
      "version": "2.1.5",
      "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
--- a/node/package.json
+++ b/node/package.json
@@ -1,15 +1,18 @@
 {
  "name": "vectordb",
-  "version": "0.1.1",
+  "version": "0.1.2",
  "description": " Serverless, low-latency vector database for AI applications",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "tsc": "tsc -b",
-    "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json-render-diagnostics",
+    "build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
    "build-release": "npm run build -- --release",
+    "cross-release": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cross build --message-format=json --release -p vectordb-node",
    "test": "mocha -recursive dist/test",
-    "lint": "eslint src --ext .js,.ts"
+    "lint": "eslint src --ext .js,.ts",
+    "pack-build": "neon pack-build",
+    "check-npm": "printenv && which node && which npm && npm --version"
  },
  "repository": {
    "type": "git",
@@ -24,6 +27,7 @@
  "author": "Lance Devs",
  "license": "Apache-2.0",
  "devDependencies": {
+    "@neon-rs/cli": "^0.0.74",
    "@types/chai": "^4.3.4",
    "@types/mocha": "^10.0.1",
    "@types/node": "^18.16.2",
@@ -44,6 +48,33 @@
  },
  "dependencies": {
    "@apache-arrow/ts": "^12.0.0",
+    "@neon-rs/load": "^0.0.74",
    "apache-arrow": "^12.0.0"
+  },
+  "os": [
+    "darwin",
+    "linux"
+  ],
+  "cpu": [
+    "x64",
+    "arm64"
+  ],
+  "neon": {
+    "targets": {
+      "x86_64-apple-darwin": "@vectordb/darwin-x64",
+      "aarch64-apple-darwin": "@vectordb/darwin-arm64",
+      "x86_64-unknown-linux-gnu": "@vectordb/linux-x64-gnu",
+      "x86_64-unknown-linux-musl": "@vectordb/linux-x64-musl",
+      "aarch64-unknown-linux-gnu": "@vectordb/linux-arm64-gnu",
+      "aarch64-unknown-linux-musl": "@vectordb/linux-arm64-musl"
+    }
+  },
+  "optionalDependencies": {
+    "@vectordb/darwin-arm64": "0.1.2",
+    "@vectordb/darwin-x64": "0.1.2",
+    "@vectordb/linux-x64-gnu": "0.1.2",
+    "@vectordb/linux-x64-musl": "0.1.2",
+    "@vectordb/linux-arm64-gnu": "0.1.2",
+    "@vectordb/linux-arm64-musl": "0.1.2"
  }
 }
--- a/node/src/arrow.ts
+++ b/node/src/arrow.ts
@@ -18,7 +18,7 @@ import {
  List,
  makeBuilder,
  RecordBatchFileWriter,
-  Table,
+  Table, Utf8,
  type Vector,
  vectorFromArray
 } from 'apache-arrow'
@@ -52,7 +52,12 @@ export function convertToTable (data: Array<Record<string, unknown>>): Table {
      for (const datum of data) {
        values.push(datum[columnsKey])
      }
-      records[columnsKey] = vectorFromArray(values)
+      if (typeof values[0] === 'string') {
+        // `vectorFromArray` converts strings into dictionary vectors, forcing it back to a string column
+        records[columnsKey] = vectorFromArray(values, new Utf8())
+      } else {
+        records[columnsKey] = vectorFromArray(values)
+      }
    }
  }

--- a/node/src/index.ts
+++ b/node/src/index.ts
@@ -21,7 +21,7 @@ import {
 import { fromRecordsToBuffer } from './arrow'

 // eslint-disable-next-line @typescript-eslint/no-var-requires
-const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd } = require('../native.js')
+const { databaseNew, databaseTableNames, databaseOpenTable, tableCreate, tableSearch, tableAdd, tableCreateVectorIndex } = require('../native.js')

 /**
 * Connect to a LanceDB instance at the given URI
@@ -100,64 +100,154 @@ export class Table {
  }

  /**
-   * Insert records into this Table
-   * @param data Records to be inserted into the Table
+   * Insert records into this Table.
   *
-   * @param mode Append / Overwrite existing records. Default: Append
+   * @param data Records to be inserted into the Table
   * @return The number of rows added to the table
   */
  async add (data: Array<Record<string, unknown>>): Promise<number> {
    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Append.toString())
  }

+  /**
+   * Insert records into this Table, replacing its contents.
+   *
+   * @param data Records to be inserted into the Table
+   * @return The number of rows added to the table
+   */
  async overwrite (data: Array<Record<string, unknown>>): Promise<number> {
    return tableAdd.call(this._tbl, await fromRecordsToBuffer(data), WriteMode.Overwrite.toString())
  }
+
+  async create_index (indexParams: VectorIndexParams): Promise<any> {
+    return tableCreateVectorIndex.call(this._tbl, indexParams)
+  }
 }

+interface IvfPQIndexConfig {
+  /**
+   * The column to be indexed
+   */
+  column?: string
+
+  /**
+   * A unique name for the index
+   */
+  index_name?: string
+
+  /**
+   * Metric type, L2 or Cosine
+   */
+  metric_type?: MetricType
+
+  /**
+   * The number of partitions in this index
+   */
+  num_partitions?: number
+
+  /**
+   * The max number of iterations for kmeans training.
+   */
+  max_iters?: number
+
+  /**
+   * Train as optimized product quantization.
+   */
+  use_opq?: boolean
+
+  /**
+   * Number of subvectors to build PQ code
+   */
+  num_sub_vectors?: number
+  /**
+   * The number of bits used to represent one PQ centroid.
+   */
+  num_bits?: number
+
+  /**
+   * Max number of iterations to train OPQ, if `use_opq` is true.
+   */
+  max_opq_iters?: number
+
+  type: 'ivf_pq'
+}
+
+export type VectorIndexParams = IvfPQIndexConfig
+
 /**
 * A builder for nearest neighbor queries for LanceDB.
 */
 export class Query {
  private readonly _tbl: any
-  private readonly _query_vector: number[]
+  private readonly _queryVector: number[]
  private _limit: number
-  private readonly _refine_factor?: number
-  private readonly _nprobes: number
+  private _refineFactor?: number
+  private _nprobes: number
  private readonly _columns?: string[]
  private _filter?: string
-  private readonly _metric = 'L2'
+  private _metricType?: MetricType

  constructor (tbl: any, queryVector: number[]) {
    this._tbl = tbl
-    this._query_vector = queryVector
+    this._queryVector = queryVector
    this._limit = 10
    this._nprobes = 20
-    this._refine_factor = undefined
+    this._refineFactor = undefined
    this._columns = undefined
    this._filter = undefined
+    this._metricType = undefined
  }

+  /**
+   * Sets the number of results that will be returned
+   * @param value number of results
+   */
  limit (value: number): Query {
    this._limit = value
    return this
  }

+  /**
+   * Refine the results by reading extra elements and re-ranking them in memory.
+   * @param value refine factor to use in this query.
+   */
+  refineFactor (value: number): Query {
+    this._refineFactor = value
+    return this
+  }
+
+  /**
+   * The number of probes used. A higher number makes search more accurate but also slower.
+   * @param value The number of probes used.
+   */
+  nprobes (value: number): Query {
+    this._nprobes = value
+    return this
+  }
+
+  /**
+   * A filter statement to be applied to this query.
+   * @param value A filter in the same format used by a sql WHERE clause.
+   */
  filter (value: string): Query {
    this._filter = value
    return this
  }

  /**
-     * Execute the query and return the results as an Array of Objects
-     */
+   * The MetricType used for this Query.
+   * @param value The metric to use. @see MetricType for the different options
+   */
+  metricType (value: MetricType): Query {
+    this._metricType = value
+    return this
+  }
+
+  /**
+   * Execute the query and return the results as an Array of Objects
+   */
  async execute<T = Record<string, unknown>> (): Promise<T[]> {
-    let buffer
-    if (this._filter != null) {
-      buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit, this._filter)
-    } else {
-      buffer = await tableSearch.call(this._tbl, this._query_vector, this._limit)
-    }
+    const buffer = await tableSearch.call(this._tbl, this)
    const data = tableFromIPC(buffer)
    return data.toArray().map((entry: Record<string, unknown>) => {
      const newObject: Record<string, unknown> = {}
@@ -177,3 +267,18 @@ export enum WriteMode {
  Overwrite = 'overwrite',
  Append = 'append'
 }
+
+/**
+ * Distance metrics type.
+ */
+export enum MetricType {
+  /**
+   * Euclidean distance
+   */
+  L2 = 'l2',
+
+  /**
+   * Cosine distance
+   */
+  Cosine = 'cosine'
+}
--- a/node/src/test/test.ts
+++ b/node/src/test/test.ts
@@ -17,6 +17,7 @@ import { assert } from 'chai'
 import { track } from 'temp'

 import * as lancedb from '../index'
+import { MetricType, Query } from '../index'

 describe('LanceDB client', function () {
  describe('when creating a connection to lancedb', function () {
@@ -67,7 +68,7 @@ describe('LanceDB client', function () {
      const uri = await createTestDB()
      const con = await lancedb.connect(uri)
      const table = await con.openTable('vectors')
-      const results = await table.search([0.1, 0.3]).filter('id == 2').execute()
+      const results = await table.search([0.1, 0.1]).filter('id == 2').execute()
      assert.equal(results.length, 1)
      assert.equal(results[0].id, 2)
    })
@@ -96,8 +97,8 @@ describe('LanceDB client', function () {
      const con = await lancedb.connect(dir)

      const data = [
-        { id: 1, vector: [0.1, 0.2], price: 10 },
-        { id: 2, vector: [1.1, 1.2], price: 50 }
+        { id: 1, vector: [0.1, 0.2], price: 10, name: 'a' },
+        { id: 2, vector: [1.1, 1.2], price: 50, name: 'b' }
      ]

      const table = await con.createTable('vectors', data)
@@ -105,8 +106,8 @@ describe('LanceDB client', function () {
      assert.equal(results.length, 2)

      const dataAdd = [
-        { id: 3, vector: [2.1, 2.2], price: 10 },
-        { id: 4, vector: [3.1, 3.2], price: 50 }
+        { id: 3, vector: [2.1, 2.2], price: 10, name: 'c' },
+        { id: 4, vector: [3.1, 3.2], price: 50, name: 'd' }
      ]
      await table.add(dataAdd)
      const resultsAdd = await table.search([0.1, 0.3]).execute()
@@ -130,16 +131,43 @@ describe('LanceDB client', function () {
      assert.equal(resultsAdd.length, 2)
    })
  })
+
+  describe('when creating a vector index', function () {
+    // Smoke test: passes as long as create_index resolves without throwing.
+    it('creates an ivf_pq index on the vector column', async function () {
+      const uri = await createTestDB(32, 300)
+      const con = await lancedb.connect(uri)
+      const table = await con.openTable('vectors')
+      await table.create_index({ type: 'ivf_pq', column: 'vector', num_partitions: 2, max_iters: 2 })
+    }).timeout(10_000) // Timeout is high partially because GH macos runner is pretty slow
+  })
 })

-async function createTestDB (): Promise<string> {
+describe('Query object', function () {
+  it('sets custom parameters', async function () {
+    const query = new Query(undefined, [0.1, 0.3])
+      .limit(1)
+      .metricType(MetricType.Cosine)
+      .refineFactor(100)
+      .nprobes(20) as Record<string, any>
+    assert.equal(query._limit, 1)
+    assert.equal(query._metricType, MetricType.Cosine)
+    assert.equal(query._refineFactor, 100)
+    assert.equal(query._nprobes, 20)
+  })
+})
+
+async function createTestDB (numDimensions: number = 2, numRows: number = 2): Promise<string> {
  const dir = await track().mkdir('lancejs')
  const con = await lancedb.connect(dir)

-  const data = [
-    { id: 1, vector: [0.1, 0.2], name: 'foo', price: 10, is_active: true },
-    { id: 2, vector: [1.1, 1.2], name: 'bar', price: 50, is_active: false }
-  ]
+  const data = []
+  for (let i = 0; i < numRows; i++) {
+    const vector = []
+    for (let j = 0; j < numDimensions; j++) {
+      vector.push(i + (j * 0.1))
+    }
+    data.push({ id: i + 1, name: `name_${i}`, price: i + 10, is_active: (i % 2 === 0), vector })
+  }

  await con.createTable('vectors', data)
  return dir
--- a/python/lancedb/fts.py
+++ b/python/lancedb/fts.py
@@ -0,0 +1,122 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""Full text search index using tantivy-py"""
+import os
+from typing import List, Tuple
+
+import pyarrow as pa
+import tantivy
+
+from .table import LanceTable
+
+
+def create_index(index_path: str, text_fields: List[str]) -> tantivy.Index:
+    """
+    Create a new Index (not populated)
+
+    Parameters
+    ----------
+    index_path : str
+        Path to the index directory
+    text_fields : List[str]
+        List of text fields to index
+
+    Returns
+    -------
+    index : tantivy.Index
+        The index object (not yet populated)
+    """
+    # Declaring our schema.
+    schema_builder = tantivy.SchemaBuilder()
+    # special field that we'll populate with row_id
+    schema_builder.add_integer_field("doc_id", stored=True)
+    # data fields
+    for name in text_fields:
+        schema_builder.add_text_field(name, stored=True)
+    schema = schema_builder.build()
+    os.makedirs(index_path, exist_ok=True)
+    index = tantivy.Index(schema, path=index_path)
+    return index
+
+
+def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -> int:
+    """
+    Populate an index with data from a LanceTable
+
+    Parameters
+    ----------
+    index : tantivy.Index
+        The index object
+    table : LanceTable
+        The table to index
+    fields : List[str]
+        List of fields to index
+
+    Returns
+    -------
+    int
+        The number of rows written to the index
+
+    Raises
+    ------
+    TypeError
+        If any of `fields` is not a string or large string column
+    """
+    # first check the fields exist and are string or large string type
+    for name in fields:
+        f = table.schema.field(name)  # raises KeyError if not found
+        if not pa.types.is_string(f.type) and not pa.types.is_large_string(f.type):
+            raise TypeError(f"Field {name} is not a string type")
+
+    # create a tantivy writer
+    writer = index.writer()
+    # write data into index
+    dataset = table.to_lance()
+    # doc_id is the running row position in scan order
+    row_id = 0
+    for b in dataset.to_batches(columns=fields):
+        for i in range(b.num_rows):
+            doc = tantivy.Document()
+            doc.add_integer("doc_id", row_id)
+            for name in fields:
+                text = b[name][i].as_py()
+                # null cells have no text to index; the document is still
+                # written so doc_id stays aligned with the row position
+                if text is not None:
+                    doc.add_text(name, text)
+            writer.add_document(doc)
+            row_id += 1
+    # commit changes
+    writer.commit()
+    return row_id
+
+
+def search_index(
+    index: tantivy.Index, query: str, limit: int = 10
+) -> Tuple[Tuple[int, ...], Tuple[float, ...]]:
+    """
+    Search an index for a query
+
+    Parameters
+    ----------
+    index : tantivy.Index
+        The index object
+    query : str
+        The query string
+    limit : int
+        The maximum number of results to return
+
+    Returns
+    -------
+    ids_and_score: tuple[tuple[int, ...], tuple[float, ...]]
+        A tuple of two tuples, the first containing the document ids
+        and the second containing the scores. Both are empty when the
+        query has no hits.
+    """
+    searcher = index.searcher()
+    parsed_query = index.parse_query(query)
+    # get top results
+    results = searcher.search(parsed_query, limit)
+    if not results.hits:
+        # zip(*[]) would collapse to (), which callers cannot unpack
+        # into (ids, scores); keep the two-tuple shape instead.
+        return (), ()
+    doc_ids, scores = zip(
+        *(
+            (searcher.doc(doc_address)["doc_id"][0], score)
+            for score, doc_address in results.hits
+        )
+    )
+    return doc_ids, scores
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -14,6 +14,7 @@ from __future__ import annotations

 import numpy as np
 import pandas as pd
+import pyarrow as pa

 from .common import VECTOR_COLUMN_NAME

@@ -131,7 +132,6 @@ class LanceQueryBuilder:
        vector and the returned vector.
        """
        ds = self._table.to_lance()
-        # TODO indexed search
        tbl = ds.to_table(
            columns=self._columns,
            filter=self._where,
@@ -145,3 +145,26 @@ class LanceQueryBuilder:
            },
        )
        return tbl.to_pandas()
+
+
+class LanceFtsQueryBuilder(LanceQueryBuilder):
+    def to_df(self) -> pd.DataFrame:
+        try:
+            import tantivy
+        except ImportError:
+            raise ImportError(
+                "You need to install the `lancedb[fts]` extra to use this method."
+            )
+
+        from .fts import search_index
+
+        # get the index path
+        index_path = self._table._get_fts_index_path()
+        # open the index
+        index = tantivy.Index.open(index_path)
+        # get the scores and doc ids
+        row_ids, scores = search_index(index, self._query, self._limit)
+        scores = pa.array(scores)
+        output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
+        output_tbl = output_tbl.append_column("score", scores)
+        return output_tbl.to_pandas()
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -14,7 +14,9 @@
 from __future__ import annotations

 import os
+import shutil
 from functools import cached_property
+from typing import List, Union

 import lance
 import numpy as np
@@ -24,7 +26,8 @@ from lance import LanceDataset
 from lance.vector import vec_to_table

 from .common import DATA, VEC, VECTOR_COLUMN_NAME
-from .query import LanceQueryBuilder
+from .query import LanceFtsQueryBuilder, LanceQueryBuilder
+from .util import get_uri_scheme


 def _sanitize_data(data, schema):
@@ -130,6 +133,27 @@ class LanceTable:
        )
        self._reset_dataset()

+    def create_fts_index(self, field_names: Union[str, List[str]]):
+        """Create a full-text search index on the table.
+
+        Warning - this API is highly experimental and is highly likely to change
+        in the future.
+
+        Parameters
+        ----------
+        field_names: str or list of str
+            The name(s) of the field to index.
+        """
+        from .fts import create_index, populate_index
+
+        if isinstance(field_names, str):
+            field_names = [field_names]
+        index = create_index(self._get_fts_index_path(), field_names)
+        populate_index(index, self, field_names)
+
+    def _get_fts_index_path(self):
+        return os.path.join(self._dataset_uri, "_indices", "tantivy")
+
    @cached_property
    def _dataset(self) -> LanceDataset:
        return lance.dataset(self._dataset_uri, version=self._version)
@@ -158,7 +182,7 @@ class LanceTable:
        self._reset_dataset()
        return len(self)

-    def search(self, query: VEC) -> LanceQueryBuilder:
+    def search(self, query: Union[VEC, str]) -> LanceQueryBuilder:
        """Create a search query to find the nearest neighbors
        of the given query vector.

@@ -174,6 +198,10 @@ class LanceTable:
        and also the "score" column which is the distance between the query
        vector and the returned vector.
        """
+        if isinstance(query, str):
+            # fts
+            return LanceFtsQueryBuilder(self, query)
+
        if isinstance(query, list):
            query = np.array(query)
        if isinstance(query, np.ndarray):
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -45,6 +45,10 @@ dev = [
 docs = [
    "mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"
 ]
+fts = [
+    # tantivy 0.19.2, pinned to a commit: pip takes the revision after "@";
+    # a "#..." suffix is a URL fragment and does not select a commit.
+    "tantivy@git+https://github.com/quickwit-oss/tantivy-py@164adc87e1a033117001cf70e38c82a53014d985"
+]

 [build-system]
 requires = [
--- a/python/tests/test_fts.py
+++ b/python/tests/test_fts.py
@@ -0,0 +1,84 @@
+# Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import os
+import random
+
+import numpy as np
+import pandas as pd
+import pytest
+import tantivy
+
+import lancedb as ldb
+import lancedb.fts
+
+
+@pytest.fixture
+def table(tmp_path) -> ldb.table.LanceTable:
+    """A 100-row table with a vector column and two identical text columns.
+
+    The noun cycles through `nouns`, so every noun (including "puppy", which
+    the tests search for) appears in exactly 20 rows. Picking it at random
+    could, rarely, leave fewer than the 10 matches the tests expect.
+    """
+    db = ldb.connect(tmp_path)
+    vectors = [np.random.randn(128) for _ in range(100)]
+
+    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
+    verbs = ("runs", "hits", "jumps", "drives", "barfs")
+    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
+    adj = ("adorable", "clueless", "dirty", "odd", "stupid")
+    text = [
+        " ".join(
+            [
+                nouns[i % len(nouns)],
+                verbs[random.randrange(0, 5)],
+                adv[random.randrange(0, 5)],
+                adj[random.randrange(0, 5)],
+            ]
+        )
+        for i in range(100)
+    ]
+    table = db.create_table(
+        "test", data=pd.DataFrame({"vector": vectors, "text": text, "text2": text})
+    )
+    return table
+
+
+def test_create_index(tmp_path):
+    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
+    assert isinstance(index, tantivy.Index)
+    assert os.path.exists(str(tmp_path / "index"))
+
+
+def test_populate_index(tmp_path, table):
+    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
+    assert ldb.fts.populate_index(index, table, ["text"]) == len(table)
+
+
+def test_search_index(tmp_path, table):
+    index = ldb.fts.create_index(str(tmp_path / "index"), ["text"])
+    ldb.fts.populate_index(index, table, ["text"])
+    index.reload()
+    results = ldb.fts.search_index(index, query="puppy", limit=10)
+    assert len(results) == 2
+    assert len(results[0]) == 10  # row_ids
+    assert len(results[1]) == 10  # scores
+
+
+def test_create_index_from_table(tmp_path, table):
+    table.create_fts_index("text")
+    df = table.search("puppy").limit(10).select(["text"]).to_df()
+    assert len(df) == 10
+    assert "text" in df.columns
+
+
+def test_create_index_multiple_columns(tmp_path, table):
+    table.create_fts_index(["text", "text2"])
+    df = table.search("puppy").limit(10).to_df()
+    assert len(df) == 10
+    assert "text" in df.columns
+    assert "text2" in df.columns
--- a/rust/ffi/node/Cargo.toml
+++ b/rust/ffi/node/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vectordb-node"
-version = "0.1.0"
+version = "0.1.2"
 description = "Serverless, low-latency vector database for AI applications"
 license = "Apache-2.0"
 edition = "2018"
--- a/rust/ffi/node/src/index.rs
+++ b/rust/ffi/node/src/index.rs
@@ -0,0 +1,15 @@
+// Copyright 2023 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod vector;
--- a/rust/ffi/node/src/index/vector.rs
+++ b/rust/ffi/node/src/index/vector.rs
@@ -0,0 +1,128 @@
+// Copyright 2023 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::convert::TryFrom;
+
+use lance::index::vector::ivf::IvfBuildParams;
+use lance::index::vector::pq::PQBuildParams;
+use lance::index::vector::MetricType;
+use neon::context::FunctionContext;
+use neon::prelude::*;
+
+use vectordb::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
+
+use crate::{runtime, JsTable};
+
+/// Build a vector index on the table bound to `this`.
+///
+/// Takes one JS argument, an options object read by
+/// `get_index_params_builder`, and returns a promise that resolves to
+/// `undefined` on success or rejects with the error message from
+/// `create_idx`. An options object that is rejected is thrown as a JS error.
+pub(crate) fn table_create_vector_index(mut cx: FunctionContext) -> JsResult<JsPromise> {
+    let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
+    let index_params = cx.argument::<JsObject>(0)?;
+    // Report bad options to the caller as a JS exception instead of panicking.
+    let index_params_builder = get_index_params_builder(&mut cx, index_params)
+        .or_else(|err| cx.throw_error(err))?;
+
+    let rt = runtime(&mut cx)?;
+    let channel = cx.channel();
+
+    let (deferred, promise) = cx.promise();
+    let table = js_table.table.clone();
+
+    // NOTE(review): block_on runs the index build to completion on the
+    // calling thread before the promise is returned — confirm this is intended.
+    rt.block_on(async move {
+        let create_result = table
+            .lock()
+            .unwrap()
+            .create_idx(&index_params_builder)
+            .await;
+
+        deferred.settle_with(&channel, move |mut cx| {
+            create_result
+                .map(|_| cx.undefined())
+                .or_else(|err| cx.throw_error(err.to_string()))
+        });
+    });
+    Ok(promise)
+}
+
+/// Translate a JS options object into an index builder.
+///
+/// `type` is required and must be `"ivf_pq"`. Optional keys: `column`,
+/// `index_name`, `metric_type`, `num_partitions`, `max_iters`, `use_opq`,
+/// `num_sub_vectors`, `num_bits` and `max_opq_iters`. `max_iters` is applied
+/// to the PQ params and, when `num_partitions` is given, to the IVF params
+/// (where it falls back to 50).
+///
+/// Returns `Err` with a message when a property cannot be read, or when
+/// `type` or `metric_type` is not a recognised value.
+fn get_index_params_builder(
+    cx: &mut FunctionContext,
+    obj: Handle<JsObject>,
+) -> Result<impl VectorIndexBuilder, String> {
+    let idx_type = obj
+        .get::<JsString, _, _>(cx, "type")
+        .map_err(|t| t.to_string())?
+        .value(cx);
+
+    match idx_type.as_str() {
+        "ivf_pq" => {
+            let mut index_builder: IvfPQIndexBuilder = IvfPQIndexBuilder::new();
+            let mut pq_params = PQBuildParams::default();
+
+            if let Some(s) = obj
+                .get_opt::<JsString, _, _>(cx, "column")
+                .map_err(|t| t.to_string())?
+            {
+                index_builder.column(s.value(cx));
+            }
+
+            if let Some(s) = obj
+                .get_opt::<JsString, _, _>(cx, "index_name")
+                .map_err(|t| t.to_string())?
+            {
+                index_builder.index_name(s.value(cx));
+            }
+
+            if let Some(s) = obj
+                .get_opt::<JsString, _, _>(cx, "metric_type")
+                .map_err(|t| t.to_string())?
+            {
+                let name = s.value(cx);
+                // An unknown metric name is a caller error, not a reason to panic.
+                let metric_type = MetricType::try_from(name.as_str())
+                    .map_err(|_| format!("{} is not a valid metric type", name))?;
+                index_builder.metric_type(metric_type);
+                pq_params.metric_type = metric_type;
+            }
+
+            let num_partitions = obj
+                .get_opt::<JsNumber, _, _>(cx, "num_partitions")
+                .map_err(|t| t.to_string())?
+                .map(|n| n.value(cx) as usize);
+
+            // Shared by the IVF and PQ stages.
+            let max_iters = obj
+                .get_opt::<JsNumber, _, _>(cx, "max_iters")
+                .map_err(|t| t.to_string())?
+                .map(|n| n.value(cx) as usize);
+
+            // IVF params are only overridden when a partition count is given.
+            if let Some(np) = num_partitions {
+                let ivf_params = IvfBuildParams {
+                    num_partitions: np,
+                    max_iters: max_iters.unwrap_or(50),
+                };
+                index_builder.ivf_params(ivf_params);
+            }
+
+            if let Some(b) = obj
+                .get_opt::<JsBoolean, _, _>(cx, "use_opq")
+                .map_err(|t| t.to_string())?
+            {
+                pq_params.use_opq = b.value(cx);
+            }
+
+            if let Some(n) = obj
+                .get_opt::<JsNumber, _, _>(cx, "num_sub_vectors")
+                .map_err(|t| t.to_string())?
+            {
+                pq_params.num_sub_vectors = n.value(cx) as usize;
+            }
+
+            if let Some(n) = obj
+                .get_opt::<JsNumber, _, _>(cx, "num_bits")
+                .map_err(|t| t.to_string())?
+            {
+                pq_params.num_bits = n.value(cx) as usize;
+            }
+
+            if let Some(iters) = max_iters {
+                pq_params.max_iters = iters;
+            }
+
+            if let Some(n) = obj
+                .get_opt::<JsNumber, _, _>(cx, "max_opq_iters")
+                .map_err(|t| t.to_string())?
+            {
+                pq_params.max_opq_iters = n.value(cx) as usize;
+            }
+
+            // Hand the collected PQ settings to the builder; without this
+            // every PQ option read above would be discarded.
+            index_builder.pq_params(pq_params);
+
+            Ok(index_builder)
+        }
+        t => Err(format!("{} is not a valid index type", t)),
+    }
+}
--- a/rust/ffi/node/src/lib.rs
+++ b/rust/ffi/node/src/lib.rs
@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::collections::HashMap;
+use std::convert::TryFrom;
 use std::ops::Deref;
 use std::sync::{Arc, Mutex};

@@ -21,6 +22,7 @@ use arrow_ipc::writer::FileWriter;
 use futures::{TryFutureExt, TryStreamExt};
 use lance::arrow::RecordBatchBuffer;
 use lance::dataset::WriteMode;
+use lance::index::vector::MetricType;
 use neon::prelude::*;
 use neon::types::buffer::TypedArray;
 use once_cell::sync::OnceCell;
@@ -34,17 +36,18 @@ use crate::arrow::arrow_buffer_to_record_batch;

 mod arrow;
 mod convert;
+mod index;

 struct JsDatabase {
    database: Arc<Database>,
 }

+impl Finalize for JsDatabase {}
+
 struct JsTable {
    table: Arc<Mutex<Table>>,
 }

-impl Finalize for JsDatabase {}
-
 impl Finalize for JsTable {}

 fn runtime<'a, C: Context<'a>>(cx: &mut C) -> NeonResult<&'static Runtime> {
@@ -87,7 +90,9 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {
        let table_rst = database.open_table(table_name).await;

        deferred.settle_with(&channel, move |mut cx| {
-            let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
+            let table = Arc::new(Mutex::new(
+                table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
+            ));
            Ok(cx.boxed(JsTable { table }))
        });
    });
@@ -96,15 +101,32 @@ fn database_open_table(mut cx: FunctionContext) -> JsResult<JsPromise> {

 fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
    let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
-    let query_vector = cx.argument::<JsArray>(0)?; //. .as_value(&mut cx);
-    let limit = cx.argument::<JsNumber>(1)?.value(&mut cx);
-    let filter = cx.argument_opt(2).map(|f| f.downcast_or_throw::<JsString, _>(&mut cx).unwrap().value(&mut cx));
+    let query_obj = cx.argument::<JsObject>(0)?;
+
+    let limit = query_obj
+        .get::<JsNumber, _, _>(&mut cx, "_limit")?
+        .value(&mut cx);
+    let filter = query_obj
+        .get_opt::<JsString, _, _>(&mut cx, "_filter")?
+        .map(|s| s.value(&mut cx));
+    let refine_factor = query_obj
+        .get_opt::<JsNumber, _, _>(&mut cx, "_refineFactor")?
+        .map(|s| s.value(&mut cx))
+        .map(|i| i as u32);
+    let nprobes = query_obj
+        .get::<JsNumber, _, _>(&mut cx, "_nprobes")?
+        .value(&mut cx) as usize;
+    let metric_type = query_obj
+        .get_opt::<JsString, _, _>(&mut cx, "_metricType")?
+        .map(|s| s.value(&mut cx))
+        .map(|s| MetricType::try_from(s.as_str()).unwrap());

    let rt = runtime(&mut cx)?;
    let channel = cx.channel();

    let (deferred, promise) = cx.promise();
    let table = js_table.table.clone();
+    let query_vector = query_obj.get::<JsArray, _, _>(&mut cx, "_queryVector")?;
    let query = convert::js_array_to_vec(query_vector.deref(), &mut cx);

    rt.spawn(async move {
@@ -113,7 +135,10 @@ fn table_search(mut cx: FunctionContext) -> JsResult<JsPromise> {
            .unwrap()
            .search(Float32Array::from(query))
            .limit(limit as usize)
-            .filter(filter);
+            .refine_factor(refine_factor)
+            .nprobes(nprobes)
+            .filter(filter)
+            .metric_type(metric_type);
        let record_batch_stream = builder.execute();
        let results = record_batch_stream
            .and_then(|stream| stream.try_collect::<Vec<_>>().map_err(Error::from))
@@ -164,7 +189,9 @@ fn table_create(mut cx: FunctionContext) -> JsResult<JsPromise> {
        let table_rst = database.create_table(table_name, batch_reader).await;

        deferred.settle_with(&channel, move |mut cx| {
-            let table = Arc::new(Mutex::new(table_rst.or_else(|err| cx.throw_error(err.to_string()))?));
+            let table = Arc::new(Mutex::new(
+                table_rst.or_else(|err| cx.throw_error(err.to_string()))?,
+            ));
            Ok(cx.boxed(JsTable { table }))
        });
    });
@@ -178,9 +205,7 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
        ("overwrite", WriteMode::Overwrite),
    ]);

-    let js_table = cx
-        .this()
-        .downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
+    let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
    let buffer = cx.argument::<JsBuffer>(0)?;
    let write_mode = cx.argument::<JsString>(1)?.value(&mut cx);
    let batches = arrow_buffer_to_record_batch(buffer.as_slice(&mut cx));
@@ -204,7 +229,6 @@ fn table_add(mut cx: FunctionContext) -> JsResult<JsPromise> {
    Ok(promise)
 }

-
 #[neon::main]
 fn main(mut cx: ModuleContext) -> NeonResult<()> {
    cx.export_function("databaseNew", database_new)?;
@@ -213,5 +237,9 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
    cx.export_function("tableSearch", table_search)?;
    cx.export_function("tableCreate", table_create)?;
    cx.export_function("tableAdd", table_add)?;
+    cx.export_function(
+        "tableCreateVectorIndex",
+        index::vector::table_create_vector_index,
+    )?;
    Ok(())
 }
--- a/rust/vectordb/Cargo.toml
+++ b/rust/vectordb/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vectordb"
-version = "0.0.1"
+version = "0.1.2"
 edition = "2021"
 description = "Serverless, low-latency vector database for AI applications"
 license = "Apache-2.0"
@@ -10,9 +10,11 @@ repository = "https://github.com/lancedb/lancedb"

 [dependencies]
 arrow-array = "37.0"
+arrow-data = "37.0"
 arrow-schema = "37.0"
 lance = "0.4.3"
 tokio = { version = "1.23", features = ["rt-multi-thread"] }

 [dev-dependencies]
 tempfile = "3.5.0"
+rand = { version = "0.8.3", features = ["small_rng"] }
--- a/rust/vectordb/src/index.rs
+++ b/rust/vectordb/src/index.rs
@@ -0,0 +1,15 @@
+// Copyright 2023 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod vector;
--- a/rust/vectordb/src/index/vector.rs
+++ b/rust/vectordb/src/index/vector.rs
@@ -0,0 +1,163 @@
+// Copyright 2023 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use lance::index::vector::ivf::IvfBuildParams;
+use lance::index::vector::pq::PQBuildParams;
+use lance::index::vector::{MetricType, VectorIndexParams};
+
+pub trait VectorIndexBuilder {
+    fn get_column(&self) -> Option<String>;
+    fn get_index_name(&self) -> Option<String>;
+    fn build(&self) -> VectorIndexParams;
+}
+
+pub struct IvfPQIndexBuilder {
+    column: Option<String>,
+    index_name: Option<String>,
+    metric_type: Option<MetricType>,
+    ivf_params: Option<IvfBuildParams>,
+    pq_params: Option<PQBuildParams>,
+}
+
+impl IvfPQIndexBuilder {
+    pub fn new() -> IvfPQIndexBuilder {
+        IvfPQIndexBuilder {
+            column: None,
+            index_name: None,
+            metric_type: None,
+            ivf_params: None,
+            pq_params: None,
+        }
+    }
+}
+
+impl IvfPQIndexBuilder {
+    pub fn column(&mut self, column: String) -> &mut IvfPQIndexBuilder {
+        self.column = Some(column);
+        self
+    }
+
+    pub fn index_name(&mut self, index_name: String) -> &mut IvfPQIndexBuilder {
+        self.index_name = Some(index_name);
+        self
+    }
+
+    pub fn metric_type(&mut self, metric_type: MetricType) -> &mut IvfPQIndexBuilder {
+        self.metric_type = Some(metric_type);
+        self
+    }
+
+    pub fn ivf_params(&mut self, ivf_params: IvfBuildParams) -> &mut IvfPQIndexBuilder {
+        self.ivf_params = Some(ivf_params);
+        self
+    }
+
+    pub fn pq_params(&mut self, pq_params: PQBuildParams) -> &mut IvfPQIndexBuilder {
+        self.pq_params = Some(pq_params);
+        self
+    }
+}
+
+impl VectorIndexBuilder for IvfPQIndexBuilder {
+    /// Column to index, if one was set.
+    fn get_column(&self) -> Option<String> {
+        self.column.clone()
+    }
+
+    /// Name for the index, if one was set.
+    fn get_index_name(&self) -> Option<String> {
+        self.index_name.clone()
+    }
+
+    /// Assemble IVF_PQ index params; unset IVF / PQ params use their defaults.
+    ///
+    /// A metric set through `metric_type()` takes precedence and is also
+    /// written into the PQ params so both agree; otherwise the PQ params'
+    /// own metric is used.
+    fn build(&self) -> VectorIndexParams {
+        let ivf_params = self
+            .ivf_params
+            .clone()
+            .unwrap_or_else(IvfBuildParams::default);
+        let mut pq_params = self
+            .pq_params
+            .clone()
+            .unwrap_or_else(PQBuildParams::default);
+
+        if let Some(metric_type) = self.metric_type {
+            pq_params.metric_type = metric_type;
+        }
+
+        VectorIndexParams::with_ivf_pq_params(pq_params.metric_type, ivf_params, pq_params)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use lance::index::vector::ivf::IvfBuildParams;
+    use lance::index::vector::pq::PQBuildParams;
+    use lance::index::vector::{MetricType, StageParams};
+
+    use crate::index::vector::{IvfPQIndexBuilder, VectorIndexBuilder};
+
+    #[test]
+    fn test_builder_no_params() {
+        let index_builder = IvfPQIndexBuilder::new();
+        assert!(index_builder.get_column().is_none());
+        assert!(index_builder.get_index_name().is_none());
+
+        let index_params = index_builder.build();
+        assert_eq!(index_params.stages.len(), 2);
+        if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
+            let default = IvfBuildParams::default();
+            assert_eq!(ivf_params.num_partitions, default.num_partitions);
+            assert_eq!(ivf_params.max_iters, default.max_iters);
+        } else {
+            panic!("Expected first stage to be ivf")
+        }
+
+        if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
+            assert_eq!(pq_params.use_opq, false);
+        } else {
+            panic!("Expected second stage to be pq")
+        }
+    }
+
+    #[test]
+    fn test_builder_all_params() {
+        let mut index_builder = IvfPQIndexBuilder::new();
+
+        index_builder
+            .column("c".to_owned())
+            .metric_type(MetricType::Cosine)
+            .index_name("index".to_owned());
+
+        assert_eq!(index_builder.column.clone().unwrap(), "c");
+        assert_eq!(index_builder.metric_type.unwrap(), MetricType::Cosine);
+        assert_eq!(index_builder.index_name.clone().unwrap(), "index");
+
+        let ivf_params = IvfBuildParams::new(500);
+        let mut pq_params = PQBuildParams::default();
+        pq_params.use_opq = true;
+        pq_params.max_iters = 1;
+        pq_params.num_bits = 8;
+        pq_params.num_sub_vectors = 50;
+        pq_params.metric_type = MetricType::Cosine;
+        pq_params.max_opq_iters = 2;
+        index_builder.ivf_params(ivf_params);
+        index_builder.pq_params(pq_params);
+
+        let index_params = index_builder.build();
+        assert_eq!(index_params.stages.len(), 2);
+        if let StageParams::Ivf(ivf_params) = index_params.stages.get(0).unwrap() {
+            assert_eq!(ivf_params.num_partitions, 500);
+        } else {
+            assert!(false, "Expected first stage to be ivf")
+        }
+
+        if let StageParams::PQ(pq_params) = index_params.stages.get(1).unwrap() {
+            assert_eq!(pq_params.use_opq, true);
+            assert_eq!(pq_params.max_iters, 1);
+            assert_eq!(pq_params.num_bits, 8);
+            assert_eq!(pq_params.num_sub_vectors, 50);
+            assert_eq!(pq_params.metric_type, MetricType::Cosine);
+            assert_eq!(pq_params.max_opq_iters, 2);
+        } else {
+            assert!(false, "Expected second stage to be pq")
+        }
+    }
+}
--- a/rust/vectordb/src/lib.rs
+++ b/rust/vectordb/src/lib.rs
@@ -14,5 +14,6 @@

 pub mod database;
 pub mod error;
+pub mod index;
 pub mod query;
 pub mod table;
--- a/rust/vectordb/src/query.rs
+++ b/rust/vectordb/src/query.rs
@@ -29,7 +29,7 @@ pub struct Query {
    pub filter: Option<String>,
    pub nprobes: usize,
    pub refine_factor: Option<u32>,
-    pub metric_type: MetricType,
+    pub metric_type: Option<MetricType>,
    pub use_index: bool,
 }

@@ -51,9 +51,9 @@ impl Query {
            limit: 10,
            nprobes: 20,
            refine_factor: None,
-            metric_type: MetricType::L2,
+            metric_type: None,
            use_index: false,
-            filter: None
+            filter: None,
        }
    }

@@ -71,10 +71,10 @@ impl Query {
            self.limit,
        )?;
        scanner.nprobs(self.nprobes);
-        scanner.distance_metric(self.metric_type);
        scanner.use_index(self.use_index);
        self.filter.as_ref().map(|f| scanner.filter(f));
        self.refine_factor.map(|rf| scanner.refine(rf));
+        self.metric_type.map(|mt| scanner.distance_metric(mt));
        Ok(scanner.try_into_stream().await?)
    }

@@ -123,7 +123,7 @@ impl Query {
    /// # Arguments
    ///
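-    /// * `metric_type` - The distance metric to use. By default [MetricType::L2] is used.
+    /// * `metric_type` - The distance metric to use. When `None`, the scanner's default metric is used.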
-    pub fn metric_type(mut self, metric_type: MetricType) -> Query {
+    pub fn metric_type(mut self, metric_type: Option<MetricType>) -> Query {
        self.metric_type = metric_type;
        self
    }
@@ -174,14 +174,14 @@ mod tests {
            .limit(100)
            .nprobes(1000)
            .use_index(true)
-            .metric_type(MetricType::Cosine)
+            .metric_type(Some(MetricType::Cosine))
            .refine_factor(Some(999));

        assert_eq!(query.query_vector, new_vector);
        assert_eq!(query.limit, 100);
        assert_eq!(query.nprobes, 1000);
        assert_eq!(query.use_index, true);
-        assert_eq!(query.metric_type, MetricType::Cosine);
+        assert_eq!(query.metric_type, Some(MetricType::Cosine));
        assert_eq!(query.refine_factor, Some(999));
    }

--- a/rust/vectordb/src/table.rs
+++ b/rust/vectordb/src/table.rs
@@ -17,8 +17,10 @@ use std::sync::Arc;

 use arrow_array::{Float32Array, RecordBatchReader};
 use lance::dataset::{Dataset, WriteMode, WriteParams};
+use lance::index::IndexType;

 use crate::error::{Error, Result};
+use crate::index::vector::VectorIndexBuilder;
 use crate::query::Query;

 pub const VECTOR_COLUMN_NAME: &str = "vector";
@@ -80,7 +82,30 @@ impl Table {

        let dataset =
            Arc::new(Dataset::write(&mut batches, path, Some(WriteParams::default())).await?);
-        Ok(Table { name, path: path.to_string(), dataset })
+        Ok(Table {
+            name,
+            path: path.to_string(),
+            dataset,
+        })
+    }
+
+    /// Builds a vector index on this table's dataset.
+    ///
+    /// # Arguments
+    ///
+    /// * `index_builder` - Supplies the column to index, the index name and the
+    ///   index parameters. When it names no column, [VECTOR_COLUMN_NAME] is used.
+    ///
+    /// On success the table's dataset handle is replaced with the dataset
+    /// returned by `create_index`; any error from `create_index` is propagated.
+    pub async fn create_idx(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
+        use lance::index::DatasetIndexExt;
+
+        // Build the fallback name lazily so nothing is allocated when the
+        // builder already names a column.
+        let column = index_builder
+            .get_column()
+            .unwrap_or_else(|| VECTOR_COLUMN_NAME.to_string());
+        let dataset = self
+            .dataset
+            .create_index(
+                &[column.as_str()],
+                IndexType::Vector,
+                index_builder.get_index_name(),
+                &index_builder.build(),
+            )
+            .await?;
+        self.dataset = Arc::new(dataset);
+        Ok(())
    }

    /// Insert records into this Table
@@ -95,12 +120,13 @@ impl Table {
    pub async fn add(
        &mut self,
        mut batches: Box<dyn RecordBatchReader>,
-        write_mode: Option<WriteMode>
+        write_mode: Option<WriteMode>,
    ) -> Result<usize> {
        let mut params = WriteParams::default();
        params.mode = write_mode.unwrap_or(WriteMode::Append);

-        self.dataset = Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
+        self.dataset =
+            Arc::new(Dataset::write(&mut batches, self.path.as_str(), Some(params)).await?);
        Ok(batches.count())
    }

@@ -125,13 +151,21 @@ impl Table {

 #[cfg(test)]
 mod tests {
-    use arrow_array::{Float32Array, Int32Array, RecordBatch, RecordBatchReader};
+    use arrow_array::{
+        Array, FixedSizeListArray, Float32Array, Int32Array, RecordBatch, RecordBatchReader,
+    };
+    use arrow_data::ArrayDataBuilder;
    use arrow_schema::{DataType, Field, Schema};
    use lance::arrow::RecordBatchBuffer;
    use lance::dataset::{Dataset, WriteMode};
+    use lance::index::vector::ivf::IvfBuildParams;
+    use lance::index::vector::pq::PQBuildParams;
+    use rand::Rng;
    use std::sync::Arc;
    use tempfile::tempdir;

+    use crate::error::Result;
+    use crate::index::vector::IvfPQIndexBuilder;
    use crate::table::Table;

    #[tokio::test]
@@ -171,14 +205,17 @@ mod tests {

        let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
        let schema = batches.schema().clone();
-        let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
+        let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
+            .await
+            .unwrap();
        assert_eq!(table.count_rows().await.unwrap(), 10);

-        let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
-            schema,
-            vec![Arc::new(Int32Array::from_iter_values(100..110))],
-        )
-       .unwrap()]));
+        let new_batches: Box<dyn RecordBatchReader> =
+            Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
+                schema,
+                vec![Arc::new(Int32Array::from_iter_values(100..110))],
+            )
+            .unwrap()]));

        table.add(new_batches, None).await.unwrap();
        assert_eq!(table.count_rows().await.unwrap(), 20);
@@ -192,15 +229,22 @@ mod tests {

        let batches: Box<dyn RecordBatchReader> = Box::new(make_test_batches());
        let schema = batches.schema().clone();
-        let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches).await.unwrap();
+        let mut table = Table::create(Arc::new(path_buf), "test".to_string(), batches)
+            .await
+            .unwrap();
        assert_eq!(table.count_rows().await.unwrap(), 10);

-        let new_batches: Box<dyn RecordBatchReader> = Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
-            schema,
-            vec![Arc::new(Int32Array::from_iter_values(100..110))],
-        ).unwrap()]));
+        let new_batches: Box<dyn RecordBatchReader> =
+            Box::new(RecordBatchBuffer::new(vec![RecordBatch::try_new(
+                schema,
+                vec![Arc::new(Int32Array::from_iter_values(100..110))],
+            )
+            .unwrap()]));

-        table.add(new_batches, Some(WriteMode::Overwrite)).await.unwrap();
+        table
+            .add(new_batches, Some(WriteMode::Overwrite))
+            .await
+            .unwrap();
        assert_eq!(table.count_rows().await.unwrap(), 10);
        assert_eq!(table.name, "test");
    }
@@ -236,4 +280,74 @@ mod tests {
        )
        .unwrap()])
    }
+
+    /// Creates a 512-row table with a 16-dimensional `embeddings` column,
+    /// builds an IVF-PQ index on it, and checks that exactly one index is
+    /// registered while the row count and table name are unchanged.
+    #[tokio::test]
+    async fn test_create_index() {
+        // Everything else used here is already imported at the top of this
+        // test module.
+        use std::iter::repeat_with;
+
+        let tmp_dir = tempdir().unwrap();
+        let path_buf = tmp_dir.into_path();
+
+        let dimension = 16;
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "embeddings",
+            DataType::FixedSizeList(
+                Arc::new(Field::new("item", DataType::Float32, true)),
+                dimension,
+            ),
+            false,
+        )]));
+
+        // 512 random vectors, flattened into one contiguous f32 buffer.
+        let mut rng = rand::thread_rng();
+        let float_arr = Float32Array::from(
+            repeat_with(|| rng.gen::<f32>())
+                .take(512 * dimension as usize)
+                .collect::<Vec<f32>>(),
+        );
+
+        let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
+        let batches =
+            RecordBatchBuffer::new(vec![RecordBatch::try_new(schema, vec![vectors]).unwrap()]);
+
+        let reader: Box<dyn RecordBatchReader + Send> = Box::new(batches);
+        let mut table = Table::create(Arc::new(path_buf), "test".to_string(), reader)
+            .await
+            .unwrap();
+
+        let mut builder = IvfPQIndexBuilder::new();
+
+        let index_builder = builder
+            .column("embeddings".to_string())
+            .index_name("my_index".to_string())
+            .ivf_params(IvfBuildParams::new(256))
+            .pq_params(PQBuildParams::default());
+
+        table.create_idx(index_builder).await.unwrap();
+
+        assert_eq!(table.dataset.load_indices().await.unwrap().len(), 1);
+        assert_eq!(table.count_rows().await.unwrap(), 512);
+        assert_eq!(table.name, "test");
+    }
+
+    /// Wraps a flat array of `values` into a `FixedSizeListArray` whose lists
+    /// each hold `list_size` items.
+    ///
+    /// The number of lists is `values.len() / list_size` (integer division).
+    /// Panics if the underlying array data fails to build.
+    fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
+        let item_field = Field::new("item", values.data_type().clone(), true);
+        let num_lists = values.len() / list_size as usize;
+
+        let builder = ArrayDataBuilder::new(DataType::FixedSizeList(
+            Arc::new(item_field),
+            list_size,
+        ));
+        let array_data = builder
+            .len(num_lists)
+            .add_child_data(values.into_data())
+            .build()
+            .unwrap();
+
+        Ok(FixedSizeListArray::from(array_data))
+    }
 }
Author	SHA1	Message	Date
Will Jones	a9dcfe7535	make node available to all users	2023-05-25 17:50:07 -07:00
Will Jones	0028b95fd8	mac debug info	2023-05-25 13:33:54 -07:00
Will Jones	102f1d7404	add dbg prints	2023-05-25 09:44:02 -07:00
Will Jones	500aa7b002	give up on musl for now	2023-05-25 09:21:40 -07:00
Will Jones	8aa0f6b4ba	use manylinux containers locally	2023-05-25 09:21:40 -07:00
Will Jones	140aa32e08	try manylinux again	2023-05-25 09:21:40 -07:00
Will Jones	a067c3dc85	fixes for action	2023-05-25 09:21:40 -07:00
Will Jones	e762a4db4b	cleanup	2023-05-25 09:21:40 -07:00
Will Jones	5e0ff01879	match versions	2023-05-25 09:21:40 -07:00
Will Jones	84356220dd	fill out rest of release script	2023-05-25 09:21:40 -07:00
Will Jones	6c03662c68	more progress on release workflow	2023-05-25 09:21:40 -07:00
Will Jones	5e098f4fe5	wip: see if we can build the lib in ci	2023-05-25 09:21:40 -07:00
Chang She	f485378ea4	Basic full text search capabilities (#62 ) This is v1 of integrating full text search index into LanceDB. # API The query API is roughly the same as before, except if the input is text instead of a vector we assume that its fts search. ## Example If `table` is a LanceDB LanceTable, then: Build index: `table.create_fts_index("text")` Query: `df = table.search("puppy").limit(10).select(["text"]).to_df()` # Implementation Here we use the tantivy-py package to build the index. We then use the row id's as the full-text-search index's doc id then we just do a Take operation to fetch the rows. # Limitations 1. don't support incremental row appends yet. New data won't show up in search 2. local filesystem only 3. requires building tantivy explicitly --------- Co-authored-by: Chang She <chang@lancedb.com>	2023-05-24 22:25:31 -06:00
gsilvestrin	f923cfe47f	add create index to nodejs client (#89 )	2023-05-24 16:45:58 -06:00
gsilvestrin	06cb7b6458	add query params to to nodejs client (#87 )	2023-05-24 15:48:31 -06:00
gsilvestrin	bdef634954	bugfix: string columns should be converted to Utf8Array (#94 )	2023-05-23 14:58:49 -07:00