Compare commits

...

17 Commits

| Author | SHA1 | Message | Date |
| ------ | ---- | ------- | ---- |
| Lei Xu | 2704a4522c | Bump to 0.1.11 | 2023-07-17 12:45:17 -07:00 |
| Lei Xu | 030f07e7f0 | Bump minimal lance version to 0.5.8 (#318) | 2023-07-17 12:41:29 -07:00 |
| gsilvestrin | 72afa06b7a | feat(node): Add Windows support (#294) | 2023-07-17 08:48:24 -07:00 |
| Lei Xu | 088e745e1d | [Python] Create table with Iterator[RecordBatch] and add docs (#316) | 2023-07-16 21:45:55 -07:00 |
| Lei Xu | 7a57cddb2c | [Python] Add records to remote (#315) | 2023-07-16 13:24:38 -07:00 |
| Lei Xu | 8ff5f88916 | [Python] Bug fixes in remote API (#314) | 2023-07-16 11:09:19 -07:00 |
| Lei Xu | 028a6e433d | [Python] Get table schema (#313) | 2023-07-15 17:39:37 -07:00 |
| Lei Xu | 04c6814fb1 | [Rust] Expose Table schema and version in Rust (#312) | 2023-07-14 22:01:23 -07:00 |
| Lei Xu | c62e4ca1eb | Bump lance version to 0.5.7 (#311) | 2023-07-14 17:17:31 -07:00 |
| gsilvestrin | aecc5fc42b | feat(node): Fix npm publish task (#298) | 2023-07-14 13:39:15 -07:00 |
| Chang She | 2fdcb307eb | [python] Fix a few minor bugs (#304) | 2023-07-15 03:47:42 +08:00 |
| Tevin Wang | ad18826579 | [Documentation Code Testing] build node sdk in release (#307) | 2023-07-14 12:46:48 -07:00 |
| Leon Yee | a8a50591d7 | [docs] small fixes (#308) (Closes #288 and #287) | 2023-07-14 12:46:31 -07:00 |
| gsilvestrin | 6dfe7fabc2 | pin half (#310) | 2023-07-14 12:45:05 -07:00 |
| gsilvestrin | 2b108e1c80 | Updating package-lock.json file (#301) | 2023-07-13 17:50:01 -07:00 |
| Lei Xu | 8c9edafccc | [Doc] Add more Python integrations documents (#299) | 2023-07-13 17:09:39 -07:00 |
| Leon Yee | 0590413b96 | Added transformersJS example to docs and node/examples (#297) | 2023-07-13 17:01:36 -07:00 |
33 changed files with 719 additions and 528 deletions

View File

@@ -81,7 +81,7 @@ jobs:
run: |
cd docs/test/node_modules/vectordb
npm ci
npm run build
npm run build-release
npm run tsc
- name: Create test files
run: |

View File

@@ -116,6 +116,39 @@ jobs:
path: |
node/dist/vectordb-linux*.tgz
node-windows:
runs-on: windows-2022
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
target: [x86_64-pc-windows-msvc]
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Install npm dependencies
run: |
cd node
npm ci
- name: Build Windows native node modules
run: .\ci\build_windows_artifacts.ps1 ${{ matrix.target }}
- name: Upload Windows Artifacts
uses: actions/upload-artifact@v3
with:
name: windows-native
path: |
node/dist/vectordb-win32*.tgz
release:
needs: [node, node-macos, node-linux]
runs-on: ubuntu-latest
@@ -132,6 +165,7 @@ jobs:
env:
NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
run: |
for filename in */*.tgz; do
mv */*.tgz .
for filename in *.tgz; do
npm publish $filename
done

View File

@@ -66,3 +66,24 @@ jobs:
run: cargo build --all-features
- name: Run tests
run: cargo test --all-features
windows:
runs-on: windows-2022
steps:
- uses: actions/checkout@v3
- uses: Swatinem/rust-cache@v2
with:
workspaces: rust
- name: Install Protoc v21.12
working-directory: C:\
run: |
New-Item -Path 'C:\protoc' -ItemType Directory
Set-Location C:\protoc
Invoke-WebRequest https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip -OutFile C:\protoc\protoc.zip
7z x protoc.zip
Add-Content $env:GITHUB_PATH "C:\protoc\bin"
shell: powershell
- name: Run tests
run: |
$env:VCPKG_ROOT = $env:VCPKG_INSTALLATION_ROOT
cargo build
cargo test

View File

@@ -6,9 +6,11 @@ members = [
resolver = "2"
[workspace.dependencies]
lance = "=0.5.5"
lance = "=0.5.8"
arrow-array = "42.0"
arrow-data = "42.0"
arrow-schema = "42.0"
arrow-ipc = "42.0"
half = { "version" = "2.2.1", default-features = false }
object_store = "0.6.1"

View File

@@ -0,0 +1,41 @@
# Builds the Windows artifacts (node binaries).
# Usage: .\ci\build_windows_artifacts.ps1 [target]
# Targets supported:
# - x86_64-pc-windows-msvc
# - i686-pc-windows-msvc
function Prebuild-Rust {
param (
[string]$target
)
# Building here for the sake of easier debugging.
Push-Location -Path "rust/ffi/node"
Write-Host "Building rust library for $target"
$env:RUST_BACKTRACE=1
cargo build --release --target $target
Pop-Location
}
function Build-NodeBinaries {
param (
[string]$target
)
Push-Location -Path "node"
Write-Host "Building node library for $target"
npm run build-release -- --target $target
npm run pack-build -- --target $target
Pop-Location
}
$targets = $args[0]
if (-not $targets) {
$targets = "x86_64-pc-windows-msvc"
}
Write-Host "Building artifacts for targets: $targets"
foreach ($target in $targets) {
Prebuild-Rust $target
Build-NodeBinaries $target
}

View File

@@ -60,6 +60,9 @@ nav:
- Python integrations:
- Pandas and PyArrow: python/arrow.md
- DuckDB: python/duckdb.md
- LangChain 🦜️🔗: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/lancedb.html
- LlamaIndex 🦙: https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/LanceDBIndexDemo.html
- Pydantic: python/pydantic.md
- Python examples:
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
@@ -68,6 +71,7 @@ nav:
- Serverless QA Bot with Modal: examples/serverless_qa_bot_with_modal_and_langchain.md
- Javascript examples:
- YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
- TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
- References:
- Vector Search: search.md
- SQL filters: sql.md

View File

@@ -46,7 +46,7 @@ You can also use an external API like OpenAI to generate embeddings
def embed_func(c):
rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
return [record["embedding"] for record in rs["data"]]
return [record["embedding"] for record in rs["data"]]
```
=== "Javascript"
@@ -126,7 +126,7 @@ belong in the same latent space and your results will be nonsensical.
=== "Javascript"
```javascript
const results = await table
.search('What's the best pizza topping?')
.search("What's the best pizza topping?")
.limit(10)
.execute()
```

View File

@@ -0,0 +1,121 @@
# Vector embedding search using TransformersJS
## Embed and query data from LanceDB using TransformersJS
<img id="splash" width="400" alt="transformersjs" src="https://github.com/lancedb/lancedb/assets/43097991/88a31e30-3d6f-4eef-9216-4b7c688f1b4f">
This example shows how to use the [transformers.js](https://github.com/xenova/transformers.js) library to perform vector embedding search using LanceDB's Javascript API.
### Setting up
First, install the dependencies:
```bash
npm install vectordb
npm i @xenova/transformers
```
We will also use the [all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) model, which is compatible with Transformers.js.
Within our `index.js` file we will import the necessary libraries and define our model and database:
```javascript
const lancedb = require('vectordb')
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
```
### Creating the embedding function
Next, we will create a function that takes in a string and returns its vector embedding, using the `pipe` function we defined earlier.
```javascript
// Define the function. `sourceColumn` is required for LanceDB to know
// which column to use as input.
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
let result = []
// Given a batch of strings, we will use the `pipe` function to get
// the vector embedding of each string.
for (let text of batch) {
// 'mean' pooling averages the per-token embeddings into a single
// fixed-length vector; normalizing makes them suitable for cosine similarity.
const res = await pipe(text, { pooling: 'mean', normalize: true })
result.push(Array.from(res['data']))
}
return (result)
}
```
### Creating the database
Now, we will create the LanceDB database and a table that uses the embedding function we defined earlier.
```javascript
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')
// You can also import any other data, but make sure that you have a column
// for the embedding function to use.
const data = [
{ id: 1, text: 'Cherry', type: 'fruit' },
{ id: 2, text: 'Carrot', type: 'vegetable' },
{ id: 3, text: 'Potato', type: 'vegetable' },
{ id: 4, text: 'Apple', type: 'fruit' },
{ id: 5, text: 'Banana', type: 'fruit' }
]
// Create the table with the embedding function
const table = await db.createTable('food_table', data, "create", embed_fun)
```
### Performing the search
Now, we can perform the search using the `search` function. LanceDB automatically uses the embedding function we defined earlier to get the vector embedding of the query string.
```javascript
// Query the table
const results = await table
.search("a sweet fruit to eat")
.metricType("cosine")
.limit(2)
.execute()
console.log(results.map(r => r.text))
```
```bash
[ 'Banana', 'Cherry' ]
```
Output of `results`:
```bash
[
{
vector: Float32Array(384) [
-0.057455405592918396,
0.03617725893855095,
-0.0367760956287384,
... 381 more items
],
id: 5,
text: 'Banana',
type: 'fruit',
score: 0.4919965863227844
},
{
vector: Float32Array(384) [
0.0009714411571621895,
0.008223623037338257,
0.009571489877998829,
... 381 more items
],
id: 1,
text: 'Cherry',
type: 'fruit',
score: 0.5540297031402588
}
]
```
### Wrapping it up
In this example, we showed how to use the `transformers.js` library to perform vector embedding search with LanceDB's Javascript API. You can find the full code for this example on [GitHub](https://github.com/lancedb/lancedb/blob/main/node/examples/js-transformers/index.js)!

View File

@@ -5,6 +5,8 @@ Built on top of [Apache Arrow](https://arrow.apache.org/),
`LanceDB` is easy to integrate with the Python ecosystem, including [Pandas](https://pandas.pydata.org/)
and PyArrow.
## Create dataset
First, we need to connect to a `LanceDB` database.
```py
@@ -27,10 +29,42 @@ data = pd.DataFrame({
table = db.create_table("pd_table", data=data)
```
You will find detailed instructions for creating datasets and indices in
[Basic Operations](basic.md) and [Indexing](ann_indexes.md)
Similar to [`pyarrow.write_dataset()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html),
[db.create_table()](../python/#lancedb.db.DBConnection.create_table) accepts a wide range of data forms.
For example, if you have a dataset that is larger than memory, you can create the table with an `Iterator[pyarrow.RecordBatch]`
to generate the data lazily:
```py
from typing import Iterable
import pyarrow as pa
import lancedb
def make_batches() -> Iterable[pa.RecordBatch]:
for i in range(5):
yield pa.RecordBatch.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]]),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"])
schema=pa.schema([
pa.field("vector", pa.list_(pa.float32())),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
])
table = db.create_table("iterable_table", data=make_batches(), schema=schema)
```
You will find detailed instructions for creating datasets in the
[Basic Operations](../basic.md) and [API](../python/#lancedb.db.DBConnection.create_table)
sections.
## Vector Search
We can now perform similarity search via the `LanceDB` Python API.
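For instance, here is a minimal sketch against the `pd_table` created above (it assumes the query builder's `to_df()` helper, which returns a pandas DataFrame):

```py
import lancedb

db = lancedb.connect("./.lancedb")
table = db.open_table("pd_table")

# Find the row whose vector is closest to the 2-dimensional query vector.
df = table.search([0.5, 1.5]).limit(1).to_df()
print(df)
```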

View File

@@ -0,0 +1,35 @@
# Pydantic
[Pydantic](https://docs.pydantic.dev/latest/) is a data validation library in Python.
## Schema
LanceDB supports creating an Apache Arrow Schema from a
[Pydantic BaseModel](https://docs.pydantic.dev/latest/api/main/#pydantic.main.BaseModel)
via the [pydantic_to_schema()](python.md#lancedb.pydantic.pydantic_to_schema) method.
::: lancedb.pydantic.pydantic_to_schema
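A minimal sketch of the conversion (required `int`/`str` fields map to non-nullable `int64`/`utf8` Arrow fields, following the mapping table below):

```py
import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema

class Item(pydantic.BaseModel):
    id: int
    text: str

# Required fields become non-nullable Arrow fields.
schema = pydantic_to_schema(Item)
assert schema == pa.schema([
    pa.field("id", pa.int64(), False),
    pa.field("text", pa.utf8(), False),
])
```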
## Vector Field
LanceDB provides a [`vector(dim)`](python.md#lancedb.pydantic.vector) method to define a
vector field in a Pydantic model.
::: lancedb.pydantic.vector
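For example, a sketch declaring a 768-dimensional vector field, mirroring the doctest shipped with `vector()`:

```py
import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema, vector

class MyModel(pydantic.BaseModel):
    id: int
    embeddings: vector(768)

# vector(n) maps to a fixed-size list of float32 with n elements.
schema = pydantic_to_schema(MyModel)
assert schema.field("embeddings").type == pa.list_(pa.float32(), 768)
```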
## Type Conversion
LanceDB automatically converts Pydantic fields to
[Apache Arrow DataTypes](https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html#pyarrow.DataType).
Currently supported type conversions:
| Pydantic Field Type | PyArrow Data Type |
| ------------------- | ----------------- |
| `int` | `pyarrow.int64` |
| `float` | `pyarrow.float64` |
| `bool` | `pyarrow.bool_` |
| `str` | `pyarrow.utf8` |
| `list` | `pyarrow.List` |
| `BaseModel` | `pyarrow.Struct` |
| `vector(n)` | `pyarrow.FixedSizeList(float32, n)` |
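Nested models and typed lists follow the same mapping; this sketch mirrors the `pydantic_to_schema` doctest:

```py
from typing import List, Optional

import pydantic
import pyarrow as pa
from lancedb.pydantic import pydantic_to_schema

class Inner(pydantic.BaseModel):
    a: str
    b: Optional[float]

class Outer(pydantic.BaseModel):
    vec: List[float]
    inner: Inner

# List[float] -> list<float64>; a nested BaseModel -> struct;
# Optional fields become nullable Arrow fields.
schema = pydantic_to_schema(Outer)
assert schema == pa.schema([
    pa.field("vec", pa.list_(pa.float64()), False),
    pa.field("inner", pa.struct([
        pa.field("a", pa.utf8(), False),
        pa.field("b", pa.float64(), True),
    ]), False),
])
```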

View File

@@ -46,10 +46,6 @@ pip install lancedb
## Utilities
::: lancedb.schema.schema_to_dict
::: lancedb.schema.dict_to_schema
::: lancedb.vector
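A quick sketch of what `lancedb.vector` produces; it is shorthand for a fixed-size list type:

```py
import pyarrow as pa
import lancedb

# vector(dim) defaults to float32 values.
assert lancedb.vector(512) == pa.list_(pa.float32(), 512)
```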
## Integrations

View File

@@ -7,6 +7,7 @@ const excludedFiles = [
"../src/embedding.md",
"../src/examples/serverless_lancedb_with_s3_and_lambda.md",
"../src/examples/serverless_qa_bot_with_modal_and_langchain.md",
"../src/examples/transformerjs_embedding_search_nodejs.md",
"../src/examples/youtube_transcript_bot_with_nodejs.md",
];
const nodePrefix = "javascript";
@@ -48,4 +49,4 @@ for (const file of files.filter((file) => !excludedFiles.includes(file))) {
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, asyncPrefix + "\n" + lines.join("\n") + asyncSuffix);
}
}
}

View File

@@ -0,0 +1,66 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict'
async function example() {
const lancedb = require('vectordb')
// Import transformers and the all-MiniLM-L6-v2 model (https://huggingface.co/Xenova/all-MiniLM-L6-v2)
const { pipeline } = await import('@xenova/transformers')
const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
// Create embedding function from pipeline which returns a list of vectors from batch
// sourceColumn is the name of the column in the data to be embedded
//
// Output of pipe is a Tensor { data: Float32Array(384) }, so filter for the vector
const embed_fun = {}
embed_fun.sourceColumn = 'text'
embed_fun.embed = async function (batch) {
let result = []
for (let text of batch) {
const res = await pipe(text, { pooling: 'mean', normalize: true })
result.push(Array.from(res['data']))
}
return (result)
}
// Link a folder and create a table with data
const db = await lancedb.connect('data/sample-lancedb')
const data = [
{ id: 1, text: 'Cherry', type: 'fruit' },
{ id: 2, text: 'Carrot', type: 'vegetable' },
{ id: 3, text: 'Potato', type: 'vegetable' },
{ id: 4, text: 'Apple', type: 'fruit' },
{ id: 5, text: 'Banana', type: 'fruit' }
]
const table = await db.createTable('food_table', data, "create", embed_fun)
// Query the table
const results = await table
.search("a sweet fruit to eat")
.metricType("cosine")
.limit(2)
.execute()
console.log(results.map(r => r.text))
}
example().then(_ => { console.log("Done!") })

View File

@@ -0,0 +1,16 @@
{
"name": "vectordb-example-js-transformers",
"version": "1.0.0",
"description": "Example for using transformers.js with lancedb",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Lance Devs",
"license": "Apache-2.0",
"dependencies": {
"@xenova/transformers": "^2.4.1",
"vectordb": "^0.1.12"
}
}

node/package-lock.json (generated)
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.1.12",
"version": "0.1.13",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.1.12",
"version": "0.1.13",
"cpu": [
"x64",
"arm64"
@@ -14,7 +14,8 @@
"license": "Apache-2.0",
"os": [
"darwin",
"linux"
"linux",
"win32"
],
"dependencies": {
"@apache-arrow/ts": "^12.0.0",
@@ -49,10 +50,11 @@
"typescript": "*"
},
"optionalDependencies": {
"vectordb-darwin-arm64": "0.1.12",
"vectordb-darwin-x64": "0.1.12",
"vectordb-linux-arm64-gnu": "0.1.12",
"vectordb-linux-x64-gnu": "0.1.12"
"vectordb-darwin-arm64": "0.1.13",
"vectordb-darwin-x64": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13",
"vectordb-linux-x64-gnu": "0.1.13",
"vectordb-win32-x64-msvc": "0.1.13"
}
},
"node_modules/@apache-arrow/ts": {
@@ -4286,6 +4288,42 @@
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
"dev": true
},
"node_modules/vectordb-darwin-arm64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
"integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/vectordb-darwin-x64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
"integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/vectordb-linux-x64-gnu": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
"integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/vscode-oniguruma": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",
@@ -7581,6 +7619,24 @@
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
"dev": true
},
"vectordb-darwin-arm64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.1.13.tgz",
"integrity": "sha512-9lLuX5P8m75EfP85pfC4LxO9J7Tzu4LngX55BVAdFe6qPRHu+iHmLw0QYYSVDqNm3GtDr2qFJlL2ILlsApyYyg==",
"optional": true
},
"vectordb-darwin-x64": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-darwin-x64/-/vectordb-darwin-x64-0.1.13.tgz",
"integrity": "sha512-5mkhBJlcfAqcty7Ww2csgYogq+b0NhtllAbag9IIznvqfcrvITU0H0vm5LGWbRuE/BUUxC25MJhm93YWBzqEVA==",
"optional": true
},
"vectordb-linux-x64-gnu": {
"version": "0.1.13",
"resolved": "https://registry.npmjs.org/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.1.13.tgz",
"integrity": "sha512-fU+sIHUkXyMdrWjggT93p0blKD+pbgr+x01tn9d2/pbA1ePo2AwuE86rYPA+BjyCUE1QifPgKadzGVVpqWYmnQ==",
"optional": true
},
"vscode-oniguruma": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz",

View File

@@ -8,7 +8,7 @@
"tsc": "tsc -b",
"build": "cargo-cp-artifact --artifact cdylib vectordb-node index.node -- cargo build --message-format=json",
"build-release": "npm run build -- --release",
"test": "npm run tsc; mocha -recursive dist/test",
"test": "npm run tsc && mocha -recursive dist/test",
"lint": "eslint src --ext .js,.ts",
"clean": "rm -rf node_modules *.node dist/",
"pack-build": "neon pack-build",
@@ -60,7 +60,8 @@
},
"os": [
"darwin",
"linux"
"linux",
"win32"
],
"cpu": [
"x64",
@@ -71,13 +72,15 @@
"x86_64-apple-darwin": "vectordb-darwin-x64",
"aarch64-apple-darwin": "vectordb-darwin-arm64",
"x86_64-unknown-linux-gnu": "vectordb-linux-x64-gnu",
"aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu"
"aarch64-unknown-linux-gnu": "vectordb-linux-arm64-gnu",
"x86_64-pc-windows-msvc": "vectordb-win32-x64-msvc"
}
},
"optionalDependencies": {
"vectordb-darwin-arm64": "0.1.13",
"vectordb-darwin-x64": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13",
"vectordb-linux-x64-gnu": "0.1.13",
"vectordb-linux-arm64-gnu": "0.1.13"
"vectordb-win32-x64-msvc": "0.1.13"
}
}

View File

@@ -13,11 +13,12 @@
from __future__ import annotations
import functools
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union
import pandas as pd
import pyarrow as pa
from pyarrow import fs
@@ -38,8 +39,10 @@ class DBConnection(ABC):
def create_table(
self,
name: str,
data: DATA = None,
schema: pa.Schema = None,
data: Optional[
Union[List[dict], dict, pd.DataFrame, pa.Table, Iterable[pa.RecordBatch]],
] = None,
schema: Optional[pa.Schema] = None,
mode: str = "create",
on_bad_vectors: str = "error",
fill_value: float = 0.0,
@@ -51,7 +54,7 @@ class DBConnection(ABC):
name: str
The name of the table.
data: list, tuple, dict, pd.DataFrame; optional
The data to insert into the table.
The data to initialize the table. User must provide at least one of `data` or `schema`.
schema: pyarrow.Schema; optional
The schema of the table.
mode: str; default "create"
@@ -64,16 +67,16 @@ class DBConnection(ABC):
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Note
----
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Returns
-------
LanceTable
A reference to the newly created table.
!!! note
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Examples
--------
@@ -119,7 +122,7 @@ class DBConnection(ABC):
Data is converted to Arrow before being written to disk. For maximum
control over how data is saved, either provide the PyArrow schema to
convert to or else provide a PyArrow table directly.
convert to or else provide a [PyArrow Table](pyarrow.Table) directly.
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
@@ -138,6 +141,30 @@ class DBConnection(ABC):
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
It is also possible to create a table from an `Iterable[pa.RecordBatch]`:
>>> import pyarrow as pa
>>> def make_batches():
... for i in range(5):
... yield pa.RecordBatch.from_arrays(
... [
... pa.array([[3.1, 4.1], [5.9, 26.5]]),
... pa.array(["foo", "bar"]),
... pa.array([10.0, 20.0]),
... ],
... ["vector", "item", "price"],
... )
>>> schema=pa.schema([
... pa.field("vector", pa.list_(pa.float32())),
... pa.field("item", pa.utf8()),
... pa.field("price", pa.float32()),
... ])
>>> db.create_table("table4", make_batches(), schema=schema)
LanceTable(table4)
"""
raise NotImplementedError
@@ -252,7 +279,7 @@ class LanceDBConnection(DBConnection):
def create_table(
self,
name: str,
data: DATA = None,
data: Optional[Union[List[dict], dict, pd.DataFrame]] = None,
schema: pa.Schema = None,
mode: str = "create",
on_bad_vectors: str = "error",
@@ -260,114 +287,22 @@ class LanceDBConnection(DBConnection):
) -> LanceTable:
"""Create a table in the database.
Parameters
----------
name: str
The name of the table.
data: list, tuple, dict, pd.DataFrame; optional
The data to insert into the table.
schema: pyarrow.Schema; optional
The schema of the table.
mode: str; default "create"
The mode to use when creating the table. Can be either "create" or "overwrite".
By default, if the table already exists, an exception is raised.
If you want to overwrite the table, use mode="overwrite".
on_bad_vectors: str, default "error"
What to do if any of the vectors are not the same size or contains NaNs.
One of "error", "drop", "fill".
fill_value: float
The value to use when filling vectors. Only used if on_bad_vectors="fill".
Note
----
The vector index won't be created by default.
To create the index, call the `create_index` method on the table.
Returns
-------
LanceTable
A reference to the newly created table.
Examples
--------
Can create with list of tuples or dictionaries:
>>> import lancedb
>>> db = lancedb.connect("./.lancedb")
>>> data = [{"vector": [1.1, 1.2], "lat": 45.5, "long": -122.7},
... {"vector": [0.2, 1.8], "lat": 40.1, "long": -74.1}]
>>> db.create_table("my_table", data)
LanceTable(my_table)
>>> db["my_table"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: double
long: double
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
You can also pass a pandas DataFrame:
>>> import pandas as pd
>>> data = pd.DataFrame({
... "vector": [[1.1, 1.2], [0.2, 1.8]],
... "lat": [45.5, 40.1],
... "long": [-122.7, -74.1]
... })
>>> db.create_table("table2", data)
LanceTable(table2)
>>> db["table2"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: double
long: double
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
Data is converted to Arrow before being written to disk. For maximum
control over how data is saved, either provide the PyArrow schema to
convert to or else provide a PyArrow table directly.
>>> custom_schema = pa.schema([
... pa.field("vector", pa.list_(pa.float32(), 2)),
... pa.field("lat", pa.float32()),
... pa.field("long", pa.float32())
... ])
>>> db.create_table("table3", data, schema = custom_schema)
LanceTable(table3)
>>> db["table3"].head()
pyarrow.Table
vector: fixed_size_list<item: float>[2]
child 0, item: float
lat: float
long: float
----
vector: [[[1.1,1.2],[0.2,1.8]]]
lat: [[45.5,40.1]]
long: [[-122.7,-74.1]]
See
---
DBConnection.create_table
"""
if mode.lower() not in ["create", "overwrite"]:
raise ValueError("mode must be either 'create' or 'overwrite'")
if data is not None:
tbl = LanceTable.create(
self,
name,
data,
schema,
mode=mode,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
else:
tbl = LanceTable.open(self, name)
tbl = LanceTable.create(
self,
name,
data,
schema,
mode=mode,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
return tbl
def open_table(self, name: str) -> LanceTable:

View File

@@ -18,7 +18,7 @@ from __future__ import annotations
import inspect
import sys
import types
from abc import ABC, abstractstaticmethod
from abc import ABC, abstractmethod
from typing import Any, List, Type, Union, _GenericAlias
import pyarrow as pa
@@ -27,11 +27,13 @@ from pydantic_core import CoreSchema, core_schema
class FixedSizeListMixin(ABC):
@abstractstaticmethod
@staticmethod
@abstractmethod
def dim() -> int:
raise NotImplementedError
@abstractstaticmethod
@staticmethod
@abstractmethod
def value_arrow_type() -> pa.DataType:
raise NotImplementedError
@@ -41,9 +43,15 @@ def vector(
) -> Type[FixedSizeListMixin]:
"""Pydantic Vector Type.
Note
----
Experimental feature.
!!! warning
Experimental feature.
Parameters
----------
dim : int
The dimension of the vector.
value_type : pyarrow.DataType, optional
The value type of the vector, by default pa.float32()
Examples
--------
@@ -52,9 +60,15 @@ def vector(
>>> from lancedb.pydantic import vector
...
>>> class MyModel(pydantic.BaseModel):
... vector: vector(756)
... id: int
... description: str
... url: str
... embeddings: vector(768)
>>> schema = pydantic_to_schema(MyModel)
>>> assert schema == pa.schema([
... pa.field("id", pa.int64(), False),
... pa.field("description", pa.utf8(), False),
... pa.field("url", pa.utf8(), False),
... pa.field("embeddings", pa.list_(pa.float32(), 768), False)
... ])
"""
# TODO: make a public parameterized type.
@@ -163,7 +177,36 @@ def pydantic_to_schema(model: Type[pydantic.BaseModel]) -> pa.Schema:
Returns
-------
A PyArrow Schema.
pyarrow.Schema
Examples
--------
>>> from typing import List, Optional
>>> import pydantic
>>> from lancedb.pydantic import pydantic_to_schema
...
>>> class InnerModel(pydantic.BaseModel):
... a: str
... b: Optional[float]
>>>
>>> class FooModel(pydantic.BaseModel):
... id: int
... s: Optional[str] = None
... vec: List[float]
... li: List[int]
... inner: InnerModel
>>> schema = pydantic_to_schema(FooModel)
>>> assert schema == pa.schema([
... pa.field("id", pa.int64(), False),
... pa.field("s", pa.utf8(), True),
... pa.field("vec", pa.list_(pa.float64()), False),
... pa.field("li", pa.list_(pa.int64()), False),
... pa.field("inner", pa.struct([
... pa.field("a", pa.utf8(), False),
... pa.field("b", pa.float64(), True),
... ]), False),
... ])
"""
fields = _pydantic_model_to_fields(model)
return pa.schema(fields)

View File

@@ -226,6 +226,7 @@ class LanceQueryBuilder:
columns=self._columns,
nprobes=self._nprobes,
refine_factor=self._refine_factor,
vector_column=self._vector_column,
)
return self._table._execute_query(query)

View File

@@ -0,0 +1,22 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pyarrow as pa
def to_ipc_binary(table: pa.Table) -> bytes:
"""Serialize a PyArrow Table to IPC binary."""
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
writer.write_table(table)
return sink.getvalue().to_pybytes()
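# A hypothetical round-trip sketch (not part of this module): the bytes
# produced above can be read back with pyarrow's IPC stream reader.
#
#   table = pa.table({"id": [1, 2], "text": ["a", "b"]})
#   payload = to_ipc_binary(table)
#   with pa.ipc.open_stream(payload) as reader:
#       assert reader.read_all() == table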

View File

@@ -13,7 +13,7 @@
import functools
from typing import Any, Callable, Dict, Union
from typing import Any, Callable, Dict, Optional, Union
import aiohttp
import attr
@@ -24,6 +24,8 @@ from lancedb.common import Credential
from lancedb.remote import VectorQuery, VectorQueryResult
from lancedb.remote.errors import LanceDBClientError
ARROW_STREAM_CONTENT_TYPE = "application/vnd.apache.arrow.stream"
def _check_not_closed(f):
@functools.wraps(f)
@@ -59,9 +61,12 @@ class RestfulLanceDBClient:
@functools.cached_property
def headers(self) -> Dict[str, str]:
return {
headers = {
"x-api-key": self.api_key,
}
if self.region == "local": # Local test mode
headers["Host"] = f"{self.db_name}.{self.region}.api.lancedb.com"
return headers
@staticmethod
async def _check_status(resp: aiohttp.ClientResponse):
@@ -93,7 +98,9 @@ class RestfulLanceDBClient:
async def post(
self,
uri: str,
data: Union[Dict[str, Any], BaseModel],
data: Union[Dict[str, Any], BaseModel, bytes],
params: Optional[Dict[str, Any]] = None,
content_type: Optional[str] = None,
deserialize: Callable = lambda resp: resp.json(),
) -> Dict[str, Any]:
"""Send a POST request and returns the deserialized response payload.
@@ -107,10 +114,19 @@ class RestfulLanceDBClient:
"""
if isinstance(data, BaseModel):
data: Dict[str, Any] = data.dict(exclude_none=True)
if isinstance(data, bytes):
req_kwargs = {"data": data}
else:
req_kwargs = {"json": data}
headers = self.headers.copy()
if content_type is not None:
headers["content-type"] = content_type
async with self.session.post(
uri,
json=data,
headers=self.headers,
headers=headers,
params=params,
**req_kwargs,
) as resp:
resp: aiohttp.ClientResponse = resp
await self._check_status(resp)
@@ -119,11 +135,11 @@ class RestfulLanceDBClient:
@_check_not_closed
async def list_tables(self):
"""List all tables in the database."""
json = await self.get("/1/table/", {})
json = await self.get("/v1/table/", {})
return json["tables"]
@_check_not_closed
async def query(self, table_name: str, query: VectorQuery) -> VectorQueryResult:
"""Query a table."""
tbl = await self.post(f"/1/table/{table_name}/", query, deserialize=_read_ipc)
tbl = await self.post(f"/v1/table/{table_name}/", query, deserialize=_read_ipc)
return VectorQueryResult(tbl)

View File

@@ -12,6 +12,7 @@
# limitations under the License.
import asyncio
import uuid
from typing import List
from urllib.parse import urlparse
@@ -19,9 +20,11 @@ import pyarrow as pa
from lancedb.common import DATA
from lancedb.db import DBConnection
from lancedb.table import Table
from lancedb.schema import schema_to_json
from lancedb.table import Table, _sanitize_data
from .client import RestfulLanceDBClient
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
class RemoteDBConnection(DBConnection):
@@ -71,8 +74,31 @@ class RemoteDBConnection(DBConnection):
name: str,
data: DATA = None,
schema: pa.Schema = None,
mode: str = "create",
on_bad_vectors: str = "error",
fill_value: float = 0.0,
) -> Table:
raise NotImplementedError
if data is None and schema is None:
raise ValueError("Either data or schema must be provided.")
if data is not None:
data = _sanitize_data(
data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
else:
if schema is None:
raise ValueError("Either data or schema must be provided")
data = pa.Table.from_pylist([], schema=schema)
from .table import RemoteTable
data = to_ipc_binary(data)
request_id = uuid.uuid4().hex
self._loop.run_until_complete(
self._client.post(
f"/v1/table/{name}/create",
data=data,
params={"request_id": request_id},
content_type=ARROW_STREAM_CONTENT_TYPE,
)
)
return RemoteTable(self, name)

View File

@@ -11,6 +11,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import uuid
from functools import cached_property
from typing import Union
import pyarrow as pa
@@ -18,7 +20,10 @@ import pyarrow as pa
from lancedb.common import DATA, VEC, VECTOR_COLUMN_NAME
from ..query import LanceQueryBuilder, Query
from ..table import Query, Table
from ..schema import json_to_schema
from ..table import Query, Table, _sanitize_data
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE
from .db import RemoteDBConnection
@@ -30,8 +35,14 @@ class RemoteTable(Table):
def __repr__(self) -> str:
return f"RemoteTable({self._conn.db_name}.{self.name})"
@cached_property
def schema(self) -> pa.Schema:
raise NotImplementedError
"""Return the schema of the table."""
resp = self._conn._loop.run_until_complete(
self._conn._client.get(f"/v1/table/{self._name}/describe")
)
schema = json_to_schema(resp["schema"])
return schema
def to_arrow(self) -> pa.Table:
raise NotImplementedError
@@ -53,7 +64,22 @@ class RemoteTable(Table):
on_bad_vectors: str = "error",
fill_value: float = 0.0,
) -> int:
raise NotImplementedError
data = _sanitize_data(
data, self.schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
payload = to_ipc_binary(data)
request_id = uuid.uuid4().hex
self._conn._loop.run_until_complete(
self._conn._client.post(
f"/v1/table/{self._name}/insert",
data=payload,
params={"request_id": request_id, "mode": mode},
content_type=ARROW_STREAM_CONTENT_TYPE,
)
)
return len(data)
def search(
self, query: Union[VEC, str], vector_column: str = VECTOR_COLUMN_NAME

View File

@@ -13,10 +13,10 @@
"""Schema related utilities."""
import json
from typing import Any, Dict, Type
import pyarrow as pa
from lance import json_to_schema, schema_to_json
def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:
@@ -43,247 +43,3 @@ def vector(dimension: int, value_type: pa.DataType = pa.float32()) -> pa.DataType:
... ])
"""
return pa.list_(value_type, dimension)
def _type_to_dict(dt: pa.DataType) -> Dict[str, Any]:
if pa.types.is_boolean(dt):
return {"type": "boolean"}
elif pa.types.is_int8(dt):
return {"type": "int8"}
elif pa.types.is_int16(dt):
return {"type": "int16"}
elif pa.types.is_int32(dt):
return {"type": "int32"}
elif pa.types.is_int64(dt):
return {"type": "int64"}
elif pa.types.is_uint8(dt):
return {"type": "uint8"}
elif pa.types.is_uint16(dt):
return {"type": "uint16"}
elif pa.types.is_uint32(dt):
return {"type": "uint32"}
elif pa.types.is_uint64(dt):
return {"type": "uint64"}
elif pa.types.is_float16(dt):
return {"type": "float16"}
elif pa.types.is_float32(dt):
return {"type": "float32"}
elif pa.types.is_float64(dt):
return {"type": "float64"}
elif pa.types.is_date32(dt):
return {"type": f"date32"}
elif pa.types.is_date64(dt):
return {"type": f"date64"}
elif pa.types.is_time32(dt):
return {"type": f"time32:{dt.unit}"}
elif pa.types.is_time64(dt):
return {"type": f"time64:{dt.unit}"}
elif pa.types.is_timestamp(dt):
return {"type": f"timestamp:{dt.unit}:{dt.tz if dt.tz is not None else ''}"}
elif pa.types.is_string(dt):
return {"type": "string"}
elif pa.types.is_binary(dt):
return {"type": "binary"}
elif pa.types.is_large_string(dt):
return {"type": "large_string"}
elif pa.types.is_large_binary(dt):
return {"type": "large_binary"}
elif pa.types.is_fixed_size_binary(dt):
return {"type": "fixed_size_binary", "width": dt.byte_width}
elif pa.types.is_fixed_size_list(dt):
return {
"type": "fixed_size_list",
"width": dt.list_size,
"value_type": _type_to_dict(dt.value_type),
}
elif pa.types.is_list(dt):
return {
"type": "list",
"value_type": _type_to_dict(dt.value_type),
}
elif pa.types.is_struct(dt):
return {
"type": "struct",
"fields": [_field_to_dict(dt.field(i)) for i in range(dt.num_fields)],
}
elif pa.types.is_dictionary(dt):
return {
"type": "dictionary",
"index_type": _type_to_dict(dt.index_type),
"value_type": _type_to_dict(dt.value_type),
}
# TODO: support extension types
raise TypeError(f"Unsupported type: {dt}")
def _field_to_dict(field: pa.field) -> Dict[str, Any]:
ret = {
"name": field.name,
"type": _type_to_dict(field.type),
"nullable": field.nullable,
}
if field.metadata is not None:
ret["metadata"] = field.metadata
return ret
def schema_to_dict(schema: pa.Schema) -> Dict[str, Any]:
"""Convert a PyArrow [Schema](pyarrow.Schema) to a dictionary.
Parameters
----------
schema : pa.Schema
The PyArrow Schema to convert
Returns
-------
A dict of the data type.
Examples
--------
>>> import pyarrow as pa
>>> import lancedb
>>> schema = pa.schema(
... [
... pa.field("id", pa.int64()),
... pa.field("vector", lancedb.vector(512), nullable=False),
... pa.field(
... "struct",
... pa.struct(
... [
... pa.field("a", pa.utf8()),
... pa.field("b", pa.float32()),
... ]
... ),
... True,
... ),
... ],
... metadata={"key": "value"},
... )
>>> json_schema = schema_to_dict(schema)
>>> assert json_schema == {
... "fields": [
... {"name": "id", "type": {"type": "int64"}, "nullable": True},
... {
... "name": "vector",
... "type": {
... "type": "fixed_size_list",
... "value_type": {"type": "float32"},
... "width": 512,
... },
... "nullable": False,
... },
... {
... "name": "struct",
... "type": {
... "type": "struct",
... "fields": [
... {"name": "a", "type": {"type": "string"}, "nullable": True},
... {"name": "b", "type": {"type": "float32"}, "nullable": True},
... ],
... },
... "nullable": True,
... },
... ],
... "metadata": {"key": "value"},
... }
"""
fields = []
for name in schema.names:
field = schema.field(name)
fields.append(_field_to_dict(field))
json_schema = {
"fields": fields,
"metadata": {
k.decode("utf-8"): v.decode("utf-8") for (k, v) in schema.metadata.items()
}
if schema.metadata is not None
else {},
}
return json_schema
def _dict_to_type(dt: Dict[str, Any]) -> pa.DataType:
type_name = dt["type"]
try:
return {
"boolean": pa.bool_(),
"int8": pa.int8(),
"int16": pa.int16(),
"int32": pa.int32(),
"int64": pa.int64(),
"uint8": pa.uint8(),
"uint16": pa.uint16(),
"uint32": pa.uint32(),
"uint64": pa.uint64(),
"float16": pa.float16(),
"float32": pa.float32(),
"float64": pa.float64(),
"string": pa.string(),
"binary": pa.binary(),
"large_string": pa.large_string(),
"large_binary": pa.large_binary(),
"date32": pa.date32(),
"date64": pa.date64(),
}[type_name]
except KeyError:
pass
if type_name == "fixed_size_binary":
return pa.binary(dt["width"])
elif type_name == "fixed_size_list":
return pa.list_(_dict_to_type(dt["value_type"]), dt["width"])
elif type_name == "list":
return pa.list_(_dict_to_type(dt["value_type"]))
elif type_name == "struct":
fields = []
for field in dt["fields"]:
fields.append(_dict_to_field(field))
return pa.struct(fields)
elif type_name == "dictionary":
return pa.dictionary(
_dict_to_type(dt["index_type"]), _dict_to_type(dt["value_type"])
)
elif type_name.startswith("time32:"):
return pa.time32(type_name.split(":")[1])
elif type_name.startswith("time64:"):
return pa.time64(type_name.split(":")[1])
elif type_name.startswith("timestamp:"):
fields = type_name.split(":")
unit = fields[1]
tz = fields[2] if len(fields) > 2 else None
return pa.timestamp(unit, tz)
raise TypeError(f"Unsupported type: {dt}")
def _dict_to_field(field: Dict[str, Any]) -> pa.Field:
name = field["name"]
nullable = field["nullable"] if "nullable" in field else True
dt = _dict_to_type(field["type"])
metadata = field.get("metadata", None)
return pa.field(name, dt, nullable, metadata)
def dict_to_schema(json: Dict[str, Any]) -> pa.Schema:
"""Reconstruct a PyArrow Schema from a JSON dict.
Parameters
----------
json : Dict[str, Any]
The JSON dict to reconstruct Schema from.
Returns
-------
A PyArrow Schema.
"""
fields = []
for field in json["fields"]:
fields.append(_dict_to_field(field))
metadata = {
k.encode("utf-8"): v.encode("utf-8")
for (k, v) in json.get("metadata", {}).items()
}
return pa.schema(fields, metadata)
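# The hand-rolled converters removed above are superseded by the `lance`
# helpers imported at the top of this module; a hypothetical round-trip sketch:
#
#   schema = pa.schema([pa.field("id", pa.int64())])
#   assert json_to_schema(schema_to_json(schema)) == schema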

View File

@@ -16,7 +16,7 @@ from __future__ import annotations
import os
from abc import ABC, abstractmethod
from functools import cached_property
from typing import List, Union
from typing import Iterable, List, Union
import lance
import numpy as np
@@ -44,7 +44,7 @@ def _sanitize_data(data, schema, on_bad_vectors, fill_value):
data = _sanitize_schema(
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
if not isinstance(data, pa.Table):
if not isinstance(data, (pa.Table, Iterable)):
raise TypeError(f"Unsupported data type: {type(data)}")
return data
@@ -483,7 +483,7 @@ class LanceTable(Table):
if schema is None:
raise ValueError("Either data or schema must be provided")
data = pa.Table.from_pylist([], schema=schema)
lance.write_dataset(data, tbl._dataset_uri, mode=mode)
lance.write_dataset(data, tbl._dataset_uri, schema=schema, mode=mode)
return LanceTable(db, name)
@classmethod

View File

@@ -1,7 +1,7 @@
[project]
name = "lancedb"
version = "0.1.10"
dependencies = ["pylance~=0.5.0", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
version = "0.1.11"
dependencies = ["pylance~=0.5.8", "ratelimiter", "retry", "tqdm", "aiohttp", "pydantic>=2", "attr"]
description = "lancedb"
authors = [
{ name = "LanceDB Devs", email = "dev@lancedb.com" },

View File

@@ -13,6 +13,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
import lancedb
@@ -75,6 +76,32 @@ def test_ingest_pd(tmp_path):
assert db.open_table("test").name == db["test"].name
def test_ingest_record_batch_iterator(tmp_path):
def batch_reader():
for i in range(5):
yield pa.RecordBatch.from_arrays(
[
pa.array([[3.1, 4.1], [5.9, 26.5]]),
pa.array(["foo", "bar"]),
pa.array([10.0, 20.0]),
],
["vector", "item", "price"],
)
db = lancedb.connect(tmp_path)
tbl = db.create_table(
"test",
batch_reader(),
schema=pa.schema(
[
pa.field("vector", pa.list_(pa.float32())),
pa.field("item", pa.utf8()),
pa.field("price", pa.float32()),
]
),
)
def test_create_mode(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(
@@ -131,6 +158,9 @@ def test_empty_or_nonexistent_table(tmp_path):
with pytest.raises(Exception):
db.open_table("does_not_exist")
schema = pa.schema([pa.field("a", pa.int32())])
db.create_table("test", schema=schema)
def test_replace_index(tmp_path):
db = lancedb.connect(uri=tmp_path)

View File

@@ -119,6 +119,7 @@ def test_query_builder_with_different_vector_column():
columns=["b"],
nprobes=20,
refine_factor=None,
vector_column="foo_vector",
)
)

View File

@@ -1,109 +0,0 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pyarrow as pa
import lancedb
from lancedb.schema import dict_to_schema, schema_to_dict
def test_schema_to_dict():
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", lancedb.vector(512), nullable=False),
pa.field(
"struct",
pa.struct(
[
pa.field("a", pa.utf8()),
pa.field("b", pa.float32()),
]
),
True,
),
pa.field("d", pa.dictionary(pa.int64(), pa.utf8()), False),
],
metadata={"key": "value"},
)
json_schema = schema_to_dict(schema)
assert json_schema == {
"fields": [
{"name": "id", "type": {"type": "int64"}, "nullable": True},
{
"name": "vector",
"type": {
"type": "fixed_size_list",
"value_type": {"type": "float32"},
"width": 512,
},
"nullable": False,
},
{
"name": "struct",
"type": {
"type": "struct",
"fields": [
{"name": "a", "type": {"type": "string"}, "nullable": True},
{"name": "b", "type": {"type": "float32"}, "nullable": True},
],
},
"nullable": True,
},
{
"name": "d",
"type": {
"type": "dictionary",
"index_type": {"type": "int64"},
"value_type": {"type": "string"},
},
"nullable": False,
},
],
"metadata": {"key": "value"},
}
actual_schema = dict_to_schema(json_schema)
assert actual_schema == schema
def test_temporal_types():
schema = pa.schema(
[
pa.field("t32", pa.time32("s")),
pa.field("t32ms", pa.time32("ms")),
pa.field("t64", pa.time64("ns")),
pa.field("ts", pa.timestamp("s")),
pa.field("ts_us_tz", pa.timestamp("us", tz="America/New_York")),
],
)
json_schema = schema_to_dict(schema)
assert json_schema == {
"fields": [
{"name": "t32", "type": {"type": "time32:s"}, "nullable": True},
{"name": "t32ms", "type": {"type": "time32:ms"}, "nullable": True},
{"name": "t64", "type": {"type": "time64:ns"}, "nullable": True},
{"name": "ts", "type": {"type": "timestamp:s:"}, "nullable": True},
{
"name": "ts_us_tz",
"type": {"type": "timestamp:us:America/New_York"},
"nullable": True,
},
],
"metadata": {},
}
actual_schema = dict_to_schema(json_schema)
assert actual_schema == schema

View File

@@ -15,6 +15,7 @@ arrow-ipc = { workspace = true }
arrow-schema = { workspace = true }
once_cell = "1"
futures = "0.3"
half = { workspace = true }
lance = { workspace = true }
vectordb = { path = "../../vectordb" }
tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -13,6 +13,7 @@ arrow-data = { workspace = true }
arrow-schema = { workspace = true }
object_store = { workspace = true }
snafu = "0.7.4"
half = { workspace = true }
lance = { workspace = true }
tokio = { version = "1.23", features = ["rt-multi-thread"] }

View File

@@ -27,6 +27,7 @@ pub struct Database {
object_store: ObjectStore,
pub(crate) uri: String,
pub(crate) base_path: object_store::path::Path,
}
const LANCE_EXTENSION: &str = "lance";
@@ -43,12 +44,13 @@ impl Database {
///
/// * A [Database] object.
pub async fn connect(uri: &str) -> Result<Database> {
let (object_store, _) = ObjectStore::from_uri(uri).await?;
let (object_store, base_path) = ObjectStore::from_uri(uri).await?;
if object_store.is_local() {
Self::try_create_dir(uri).context(CreateDirSnafu { path: uri })?;
}
Ok(Database {
uri: uri.to_string(),
base_path,
object_store,
})
}
@@ -70,7 +72,7 @@ impl Database {
pub async fn table_names(&self) -> Result<Vec<String>> {
let f = self
.object_store
.read_dir(self.uri.as_str())
.read_dir(self.base_path.clone())
.await?
.iter()
.map(|fname| Path::new(fname))
@@ -141,8 +143,9 @@ impl Database {
/// # Arguments
/// * `name` - The name of the table.
pub async fn drop_table(&self, name: &str) -> Result<()> {
let dir_name = format!("{}/{}.{}", self.uri, name, LANCE_EXTENSION);
self.object_store.remove_dir_all(dir_name).await?;
let dir_name = format!("{}.{}", name, LANCE_EXTENSION);
let full_path = self.base_path.child(dir_name.clone());
self.object_store.remove_dir_all(full_path).await?;
Ok(())
}
}

View File

@@ -16,6 +16,7 @@ use std::path::Path;
use std::sync::Arc;
use arrow_array::{Float32Array, RecordBatchReader};
use arrow_schema::SchemaRef;
use lance::dataset::{Dataset, ReadParams, WriteParams};
use lance::index::IndexType;
use snafu::prelude::*;
@@ -144,6 +145,16 @@ impl Table {
})
}
/// Schema of this Table.
pub fn schema(&self) -> SchemaRef {
Arc::new(self.dataset.schema().into())
}
/// Version of this Table
pub fn version(&self) -> u64 {
self.dataset.version().version
}
/// Create index on the table.
pub async fn create_index(&mut self, index_builder: &impl VectorIndexBuilder) -> Result<()> {
use lance::index::DatasetIndexExt;
@@ -274,6 +285,7 @@ mod tests {
}
#[test]
#[cfg(not(windows))]
fn test_object_store_path() {
use std::path::Path as StdPath;
let p = StdPath::new("s3://bucket/path/to/file");
@@ -350,10 +362,7 @@ mod tests {
..Default::default()
};
table
.add(new_batches, Some(param))
.await
.unwrap();
table.add(new_batches, Some(param)).await.unwrap();
assert_eq!(table.count_rows().await.unwrap(), 10);
assert_eq!(table.name, "test");
}