diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml
new file mode 100644
index 00000000..12d13522
--- /dev/null
+++ b/.github/workflows/java-publish.yml
@@ -0,0 +1,109 @@
+name: Build and publish Java packages
+on:
+ release:
+ types: [released]
+ pull_request:
+ paths:
+ - .github/workflows/java-publish.yml
+
+jobs:
+ macos-arm64:
+ name: Build on MacOS Arm64
+ runs-on: macos-14
+ timeout-minutes: 45
+ defaults:
+ run:
+ working-directory: ./java/core/lancedb-jni
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - uses: Swatinem/rust-cache@v2
+ - name: Install dependencies
+ run: |
+ brew install protobuf
+ - name: Build release
+ run: |
+ cargo build --release
+ - uses: actions/upload-artifact@v4
+ with:
+ name: liblancedb_jni_darwin_aarch64.zip
+ path: target/release/liblancedb_jni.dylib
+ retention-days: 1
+ if-no-files-found: error
+ linux-arm64:
+ name: Build on Linux Arm64
+ runs-on: warp-ubuntu-2204-arm64-8x
+ timeout-minutes: 45
+ defaults:
+ run:
+ working-directory: ./java/core/lancedb-jni
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - uses: Swatinem/rust-cache@v2
+ - uses: actions-rust-lang/setup-rust-toolchain@v1
+ with:
+ toolchain: "1.79.0"
+ cache-workspaces: "./java/core/lancedb-jni"
+ # Disable full debug symbol generation to speed up CI build and keep memory down
+ # "1" means line tables only, which is useful for panic tracebacks.
+ rustflags: "-C debuginfo=1"
+ - name: Install dependencies
+ run: |
+ sudo apt -y -qq update
+ sudo apt install -y protobuf-compiler libssl-dev pkg-config
+ - name: Build release
+ run: |
+ cargo build --release
+ - uses: actions/upload-artifact@v4
+ with:
+ name: liblancedb_jni_linux_aarch64.zip
+ path: target/release/liblancedb_jni.so
+ retention-days: 1
+ if-no-files-found: error
+ linux-x86:
+ runs-on: warp-ubuntu-2204-x64-8x
+ timeout-minutes: 30
+ needs: [macos-arm64, linux-arm64]
+ defaults:
+ run:
+ working-directory: ./java
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - uses: Swatinem/rust-cache@v2
+ - name: Set up Java 8
+ uses: actions/setup-java@v4
+ with:
+ distribution: temurin
+ java-version: 8
+ cache: "maven"
+ server-id: ossrh
+ server-username: SONATYPE_USER
+ server-password: SONATYPE_TOKEN
+ gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
+ gpg-passphrase: GPG_PASSPHRASE  # name of the env var holding the passphrase (same convention as server-username/password), never the secret value itself
+ - name: Install dependencies
+ run: |
+ sudo apt -y -qq update
+ sudo apt install -y protobuf-compiler libssl-dev pkg-config
+ - name: Download artifact
+ uses: actions/download-artifact@v4
+ - name: Copy native libs
+ run: |
+ mkdir -p ./core/target/classes/nativelib/darwin-aarch64 ./core/target/classes/nativelib/linux-aarch64
+ cp ../liblancedb_jni_darwin_aarch64.zip/liblancedb_jni.dylib ./core/target/classes/nativelib/darwin-aarch64/liblancedb_jni.dylib
+ cp ../liblancedb_jni_linux_aarch64.zip/liblancedb_jni.so ./core/target/classes/nativelib/linux-aarch64/liblancedb_jni.so
+ - name: Set github
+ run: |
+ git config --global user.email "dev+gha@lancedb.com"
+ git config --global user.name "LanceDB Github Runner"
+ - name: Publish with Java 8
+ run: |
+ printf 'use-agent\npinentry-mode loopback\n' >> ~/.gnupg/gpg.conf
+ export GPG_TTY=$(tty)
+ mvn --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase="$GPG_PASSPHRASE" deploy -P deploy-to-ossrh
+ env:
+ SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
+ SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
+ GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}  # read by the script as "$GPG_PASSPHRASE" so the secret is quoted and never pasted into the shell source
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 0230caef..bb0c456c 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -26,6 +26,7 @@ theme:
- content.code.copy
- content.tabs.link
- content.action.edit
+ - content.tooltips
- toc.follow
- navigation.top
- navigation.tabs
@@ -35,6 +36,7 @@ theme:
- navigation.instant
icon:
repo: fontawesome/brands/github
+ annotation: material/arrow-right-circle
custom_dir: overrides
plugins:
@@ -76,7 +78,12 @@ markdown_extensions:
- pymdownx.tabbed:
alternate_style: true
- md_in_html
+ - abbr
- attr_list
+ - pymdownx.snippets
+ - pymdownx.emoji:
+ emoji_index: !!python/name:material.extensions.emoji.twemoji
+ emoji_generator: !!python/name:material.extensions.emoji.to_svg
nav:
- Home:
diff --git a/docs/src/concepts/index_hnsw.md b/docs/src/concepts/index_hnsw.md
index 9e8dc948..8bfaf39c 100644
--- a/docs/src/concepts/index_hnsw.md
+++ b/docs/src/concepts/index_hnsw.md
@@ -15,11 +15,13 @@ HNSW also combines this with the ideas behind a classic 1-dimensional search dat
## k-Nearest Neighbor Graphs and k-approximate Nearest neighbor Graphs
The k-nearest neighbor graph actually predates its use for ANN search. Its construction is quite simple:
+
* Each vector in the dataset is given an associated vertex.
* Each vertex has outgoing edges to its k nearest neighbors. That is, the k closest other vertices by Euclidean distance between the two corresponding vectors. This can be thought of as a "friend list" for the vertex.
* For some applications (including nearest-neighbor search), the incoming edges are also added.
Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors:
+
* Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node).
* Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to that vertex.
* Repeat until a local optimum is found.
@@ -36,15 +38,18 @@ One downside of k-NN and k-ANN graphs alone is that one must typically build the
## HNSW: Hierarchical Navigable Small Worlds
HNSW builds on k-ANN in two main ways:
+
* Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing for the number of edges per vertex to be limited to a relatively small constant.
* The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list.
This recursive structure can be thought of as separating into layers:
+
* At the bottom-most layer, an k-ANN graph on the whole dataset is present.
* At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present.
* At the Lth layer, a k-ANN graph is present. It is over a (constant) fraction (e.g. 10%) of the vectors/vertices present in the L-1th layer.
Then the greedy search routine operates as follows:
+
* At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer.
* Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method.
* Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. top 10).
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
index 4aa8b3db..72a7e825 100644
--- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
@@ -17,7 +17,7 @@ from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
db = lancedb.connect(tmp_path)
-func = get_registry.get("imagebind").create()
+func = get_registry().get("imagebind").create()
class ImageBindModel(LanceModel):
text: str
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
index bf50dfd2..eb6139f5 100644
--- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
@@ -20,7 +20,7 @@ from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
db = lancedb.connect(tmp_path)
-func = get_registry.get("open-clip").create()
+func = get_registry().get("open-clip").create()
class Images(LanceModel):
label: str
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
index 39eba18c..fd99f2ca 100644
--- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
@@ -4,13 +4,14 @@ Using cohere API requires cohere package, which can be installed using `pip inst
You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API.
Supported models are:
-* embed-english-v3.0
-* embed-multilingual-v3.0
-* embed-english-light-v3.0
-* embed-multilingual-light-v3.0
-* embed-english-v2.0
-* embed-english-light-v2.0
-* embed-multilingual-v2.0
+
+- embed-english-v3.0
+- embed-multilingual-v3.0
+- embed-english-light-v3.0
+- embed-multilingual-light-v3.0
+- embed-english-v2.0
+- embed-english-light-v2.0
+- embed-multilingual-v2.0
Supported parameters (to be passed in `create` method) are:
diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md
index ced97048..5457dc9f 100644
--- a/docs/src/embeddings/default_embedding_functions.md
+++ b/docs/src/embeddings/default_embedding_functions.md
@@ -1,30 +1,84 @@
-There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models.
+# π Available Embedding Models
-## Text embedding functions
-Contains the text embedding functions registered by default.
+There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models. π
-* Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff.
-* Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
+Before jumping into the list of available models, let's see how to initialize and configure an embedding model for use in our code:
-**Available Text Embeddings**:
+!!! example "Example usage"
+ ```python
+ model = get_registry()
+ .get("openai")
+ .create(name="text-embedding-ada-002")
+ ```
-- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md)
-- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md)
-- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md)
-- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md)
-- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md)
-- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md)
-- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md)
-- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md)
-- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md)
-- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md)
+Now let's understand the above syntax:
+```python
+model = get_registry().get("model_id").create(...params)
+```
+**This line creates a configured, ready-to-use instance of an `embedding function` for the `model` of your choice.**
+
+- `get_registry()` : This function call returns an instance of an `EmbeddingFunctionRegistry` object. This registry manages the registration and retrieval of embedding functions.
+
+- `.get("model_id")` : This method is called on the registry object and retrieves the **embedding model function** associated with the `"model_id"` (1) .
+ { .annotate }
+
+ 1. Hover over the names in table below to find out the `model_id` of different embedding functions.
+
+- `.create(...params)` : This method call is on the object returned by the `get` method. It instantiates an embedding model function using the **specified parameters**.
+
+??? question "What parameters does the `.create(...params)` method accept?"
+ **Checkout the documentation of specific embedding models (links in the table belowπ) to know what parameters it takes**.
+
+!!! tip "Moving on"
+ Now that we know how to get the **desired embedding model** and use it in our code, let's explore the comprehensive **list** of embedding models **supported by LanceDB**, in the tables below.
+
+## Text Embedding Functions π
+These functions are registered by default to handle text embeddings.
+
+- π **Embedding functions** have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with **exponential backoff**.
+
+- π Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
+
+π **Available Text Embeddings**
+
+| **Embedding** :material-information-outline:{ title="Hover over the name to find out the model_id" } | **Description** | **Documentation** |
+|-----------|-------------|---------------|
+| [**Sentence Transformers**](available_embedding_models/text_embedding_functions/sentence_transformers.md "sentence-transformers") | π§ **SentenceTransformers** is a Python framework for state-of-the-art sentence, text, and image embeddings. | [
](available_embedding_models/text_embedding_functions/sentence_transformers.md)|
+| [**Huggingface Models**](available_embedding_models/text_embedding_functions/huggingface_embedding.md "huggingface") |π€ We offer support for all **Huggingface** models. The default model is `colbert-ir/colbertv2.0`. | [
](available_embedding_models/text_embedding_functions/huggingface_embedding.md) |
+| [**Ollama Embeddings**](available_embedding_models/text_embedding_functions/ollama_embedding.md "ollama") | π Generate embeddings via the **Ollama** python library. Ollama supports embedding models, making it possible to build RAG apps. | [
](available_embedding_models/text_embedding_functions/ollama_embedding.md)|
+| [**OpenAI Embeddings**](available_embedding_models/text_embedding_functions/openai_embedding.md "openai")| π **OpenAI's** text embeddings measure the relatedness of text strings. **LanceDB** supports state-of-the-art embeddings from OpenAI. | [
](available_embedding_models/text_embedding_functions/openai_embedding.md)|
+| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | π **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [
](available_embedding_models/text_embedding_functions/instructor_embedding.md) |
+| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | π Google's Gemini API generates state-of-the-art embeddings for words, phrases, and sentences. | [
](available_embedding_models/text_embedding_functions/gemini_embedding.md) |
+| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | π¬ This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [
](available_embedding_models/text_embedding_functions/cohere_embedding.md) |
+| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | π World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [
](available_embedding_models/text_embedding_functions/jina_embedding.md) |
+| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | βοΈ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [
](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |
+| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | π‘ Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [
](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
-## Multi-modal embedding functions
-Multi-modal embedding functions allow you to query your table using both images and text.
-**Available Multi-modal Embeddings** :
+[st-key]: "sentence-transformers"
+[hf-key]: "huggingface"
+[ollama-key]: "ollama"
+[openai-key]: "openai"
+[instructor-key]: "instructor"
+[gemini-key]: "gemini-text"
+[cohere-key]: "cohere"
+[jina-key]: "jina"
+[aws-key]: "bedrock-text"
+[watsonx-key]: "watsonx"
-- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md)
-- [Imagebind Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)
-- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md)
\ No newline at end of file
+
+## Multi-modal Embedding FunctionsπΌοΈ
+
+Multi-modal embedding functions allow you to query your table using both images and text. π¬πΌοΈ
+
+π **Available Multi-modal Embeddings**
+
+| Embedding :material-information-outline:{ title="Hover over the name to find out the model_id" } | Description | Documentation |
+|-----------|-------------|---------------|
+| [**OpenClip Embeddings**](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md "open-clip") | π¨ We support CLIP model embeddings using the open source alternative, **open-clip** which supports various customizations. | [
](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) |
+| [**Imagebind Embeddings**](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md "imagebind") | π We have support for **imagebind model embeddings**. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. | [
](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)|
+| [**Jina Multi-modal Embeddings**](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md "jina") | π **Jina embeddings** can also be used to embed both **text** and **image** data, only some of the models support image data and you can check the detailed documentation. π | [
](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) |
+
+!!! note
+ If you'd like to request support for additional **embedding functions**, please feel free to open an issue on our LanceDB [GitHub issue page](https://github.com/lancedb/lancedb/issues).
\ No newline at end of file
diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md
index 4be12858..9de2ba94 100644
--- a/docs/src/guides/tables.md
+++ b/docs/src/guides/tables.md
@@ -416,7 +416,6 @@ You can create an empty table for scenarios where you want to add data to the ta
=== "Python"
- ```python
An empty table can be initialized via a PyArrow schema.
diff --git a/java/core/pom.xml b/java/core/pom.xml
index a469c3ae..b6fedc19 100644
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -8,7 +8,7 @@
com.lancedb
lancedb-parent
- 0.1-SNAPSHOT
+ 0.0.3
../pom.xml
@@ -68,7 +68,7 @@
lancedb-jni
-
+ true
${project.build.directory}/classes/nativelib
true
diff --git a/java/pom.xml b/java/pom.xml
index 48a64c12..6a0a95a7 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -6,15 +6,28 @@
com.lancedb
lancedb-parent
- 0.1-SNAPSHOT
+ 0.0.3
pom
- Lance Parent
+ LanceDB Parent
+ LanceDB vector database Java API
+ http://lancedb.com/
+
+
+
+ Lance DB Dev Group
+ dev@lancedb.com
+
+
+
+
+ The Apache Software License, Version 2.0
+ http://www.apache.org/licenses/LICENSE-2.0.txt
+
+
UTF-8
- 11
- 11
15.0.0
@@ -22,6 +35,12 @@
core
+
+ scm:git:https://github.com/lancedb/lancedb.git
+ scm:git:ssh://git@github.com/lancedb/lancedb.git
+ https://github.com/lancedb/lancedb
+
+
@@ -62,8 +81,45 @@
+
+
+ ossrh
+ https://s01.oss.sonatype.org/content/repositories/snapshots
+
+
+ ossrh
+ https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/
+
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 2.2.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.9.1
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
org.apache.maven.plugins
maven-checkstyle-plugin
@@ -126,4 +182,82 @@
+
+
+
+ jdk8
+
+ [1.8,1.8.999]
+
+
+ 1.8
+ 1.8
+
+
+
+ jdk11+
+
+ [11,)
+
+
+ 11
+ 11
+
+
+
+
+ maven-surefire-plugin
+ 3.2.5
+
+ --add-opens=java.base/java.nio=ALL-UNNAMED
+
+ false
+
+
+
+
+
+
+ deploy-to-ossrh
+
+
+
+ org.sonatype.central
+ central-publishing-maven-plugin
+ 0.4.0
+ true
+
+ ossrh
+ true
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.13
+ true
+
+ ossrh
+ https://s01.oss.sonatype.org/
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.5
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 069fb4de..7d41d891 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
- "pylance==0.17.0-beta.2",
+ "pylance==0.17.0",
"ratelimiter~=1.0",
"requests>=2.31.0",
"retry>=0.9.2",
diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi
index c4642637..55c3db99 100644
--- a/python/python/lancedb/_lancedb.pyi
+++ b/python/python/lancedb/_lancedb.pyi
@@ -73,6 +73,7 @@ class Query:
def where(self, filter: str): ...
def select(self, columns: Tuple[str, str]): ...
def limit(self, limit: int): ...
+ def offset(self, offset: int): ...
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
def nearest_to_text(self, query: dict) -> Query: ...
async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
@@ -83,6 +84,7 @@ class VectorQuery:
def select(self, columns: List[str]): ...
def select_with_projection(self, columns: Tuple[str, str]): ...
def limit(self, limit: int): ...
+ def offset(self, offset: int): ...
def column(self, column: str): ...
def distance_type(self, distance_type: str): ...
def postfilter(self): ...
diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py
index 1c77b299..d2345e4a 100644
--- a/python/python/lancedb/db.py
+++ b/python/python/lancedb/db.py
@@ -14,7 +14,6 @@
from __future__ import annotations
import asyncio
-import inspect
import os
from abc import abstractmethod
from pathlib import Path
@@ -27,8 +26,13 @@ from pyarrow import fs
from lancedb.common import data_to_reader, validate_schema
from ._lancedb import connect as lancedb_connect
-from .pydantic import LanceModel
-from .table import AsyncTable, LanceTable, Table, _sanitize_data, _table_path
+from .table import (
+ AsyncTable,
+ LanceTable,
+ Table,
+ _table_path,
+ sanitize_create_table,
+)
from .util import (
fs_from_uri,
get_uri_location,
@@ -37,6 +41,7 @@ from .util import (
)
if TYPE_CHECKING:
+ from .pydantic import LanceModel
from datetime import timedelta
from ._lancedb import Connection as LanceDbConnection
@@ -722,12 +727,6 @@ class AsyncConnection(object):
... await db.create_table("table4", make_batches(), schema=schema)
>>> asyncio.run(iterable_example())
"""
- if inspect.isclass(schema) and issubclass(schema, LanceModel):
- # convert LanceModel to pyarrow schema
- # note that it's possible this contains
- # embedding function metadata already
- schema = schema.to_arrow_schema()
-
metadata = None
# Defining defaults here and not in function prototype. In the future
@@ -738,31 +737,9 @@ class AsyncConnection(object):
if fill_value is None:
fill_value = 0.0
- if data is not None:
- data, schema = _sanitize_data(
- data,
- schema,
- metadata=metadata,
- on_bad_vectors=on_bad_vectors,
- fill_value=fill_value,
- )
-
- if schema is None:
- if data is None:
- raise ValueError("Either data or schema must be provided")
- elif hasattr(data, "schema"):
- schema = data.schema
- elif isinstance(data, Iterable):
- if metadata:
- raise TypeError(
- (
- "Persistent embedding functions not yet "
- "supported for generator data input"
- )
- )
-
- if metadata:
- schema = schema.with_metadata(metadata)
+ data, schema = sanitize_create_table(
+ data, schema, metadata, on_bad_vectors, fill_value
+ )
validate_schema(schema)
if exist_ok is None:
diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 9da90987..13b0460c 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -42,9 +42,9 @@ if TYPE_CHECKING:
import PIL
import polars as pl
- from .common import VEC
from ._lancedb import Query as LanceQuery
from ._lancedb import VectorQuery as LanceVectorQuery
+ from .common import VEC
from .pydantic import LanceModel
from .table import Table
@@ -85,6 +85,8 @@ class Query(pydantic.BaseModel):
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
+ offset: int
+ The offset to start fetching results from
"""
vector_column: Optional[str] = None
@@ -119,6 +121,8 @@ class Query(pydantic.BaseModel):
with_row_id: bool = False
+ offset: int = 0
+
class LanceQueryBuilder(ABC):
"""An abstract query builder. Subclasses are defined for vector search,
@@ -233,6 +237,7 @@ class LanceQueryBuilder(ABC):
def __init__(self, table: "Table"):
self._table = table
self._limit = 10
+ self._offset = 0
self._columns = None
self._where = None
self._prefilter = False
@@ -371,6 +376,25 @@ class LanceQueryBuilder(ABC):
self._limit = limit
return self
+ def offset(self, offset: int) -> LanceQueryBuilder:
+ """Set the offset for the results.
+
+ Parameters
+ ----------
+ offset: int
+ The offset to start fetching results from. ``None`` or any value <= 0 is stored as 0.
+
+ Returns
+ -------
+ LanceQueryBuilder
+ The LanceQueryBuilder object (``self``), so calls can be chained.
+ """
+ if offset is None or offset <= 0:  # missing and non-positive offsets both collapse to 0
+ self._offset = 0
+ else:
+ self._offset = offset
+ return self
+
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
"""Set the columns to return.
@@ -649,6 +673,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
refine_factor=self._refine_factor,
vector_column=self._vector_column,
with_row_id=self._with_row_id,
+ offset=self._offset,
)
result_set = self._table._execute_query(query, batch_size)
if self._reranker is not None:
@@ -780,6 +805,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
"columns": self._fts_columns,
},
vector=[],
+ offset=self._offset,
)
results = self._table._execute_query(query)
results = results.read_all()
@@ -826,7 +852,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
)
if len(row_ids) == 0:
empty_schema = pa.schema([pa.field("_score", pa.float32())])
- return pa.Table.from_pylist([], schema=empty_schema)
+ return pa.Table.from_batches([], schema=empty_schema)
scores = pa.array(scores)
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
output_tbl = output_tbl.append_column("_score", scores)
@@ -939,6 +965,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._reranker = RRFReranker()
self._nprobes = None
self._refine_factor = None
+ self._phrase_query = False
def _validate_query(self, query, vector=None, text=None):
if query is not None and (vector is not None or text is not None):
@@ -960,6 +987,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
return vector_query, text_query
+ def phrase_query(self, phrase_query: bool = True) -> LanceHybridQueryBuilder:
+ """Set whether to use phrase query.
+
+ Parameters
+ ----------
+ phrase_query: bool, default True
+ If True, then the query will be wrapped in quotes and
+ double quotes replaced by single quotes.
+
+ Returns
+ -------
+ LanceHybridQueryBuilder
+ The LanceHybridQueryBuilder object.
+ """
+ self._phrase_query = phrase_query
+ return self
+
def to_arrow(self) -> pa.Table:
vector_query, fts_query = self._validate_query(
self._query, self._vector, self._text
@@ -986,6 +1030,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
if self._with_row_id:
self._vector_query.with_row_id(True)
self._fts_query.with_row_id(True)
+ if self._phrase_query:
+ self._fts_query.phrase_query(True)
if self._nprobes:
self._vector_query.nprobes(self._nprobes)
if self._refine_factor:
@@ -1220,6 +1266,18 @@ class AsyncQueryBase(object):
self._inner.limit(limit)
return self
+ def offset(self, offset: int) -> AsyncQuery:
+ """
+ Set the offset for the results.
+
+ Parameters
+ ----------
+ offset: int
+ The offset to start fetching results from; forwarded unchanged to the wrapped ``_inner`` query.
+ """
+ self._inner.offset(offset)
+ return self  # returns this builder; NOTE(review): annotated AsyncQuery but defined on AsyncQueryBase - confirm intended
+
async def to_batches(
self, *, max_batch_length: Optional[int] = None
) -> AsyncRecordBatchReader:
diff --git a/python/python/lancedb/remote/arrow.py b/python/python/lancedb/remote/arrow.py
index 753087cf..ac39e247 100644
--- a/python/python/lancedb/remote/arrow.py
+++ b/python/python/lancedb/remote/arrow.py
@@ -11,12 +11,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from typing import Iterable, Union
import pyarrow as pa
-def to_ipc_binary(table: pa.Table) -> bytes:
+def to_ipc_binary(table: Union[pa.Table, Iterable[pa.RecordBatch]]) -> bytes:
"""Serialize a PyArrow Table to IPC binary."""
sink = pa.BufferOutputStream()
+ if isinstance(table, Iterable):
+ table = pa.Table.from_batches(table)
with pa.ipc.new_stream(sink, table.schema) as writer:
writer.write_table(table)
return sink.getvalue().to_pybytes()
diff --git a/python/python/lancedb/remote/db.py b/python/python/lancedb/remote/db.py
index 0dd6bb6d..bb7554a4 100644
--- a/python/python/lancedb/remote/db.py
+++ b/python/python/lancedb/remote/db.py
@@ -11,7 +11,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import inspect
import logging
import uuid
from concurrent.futures import ThreadPoolExecutor
@@ -26,7 +25,7 @@ from ..common import DATA
from ..db import DBConnection
from ..embeddings import EmbeddingFunctionConfig
from ..pydantic import LanceModel
-from ..table import Table, _sanitize_data
+from ..table import Table, sanitize_create_table
from ..util import validate_table_name
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
@@ -228,8 +227,6 @@ class RemoteDBConnection(DBConnection):
"""
validate_table_name(name)
- if data is None and schema is None:
- raise ValueError("Either data or schema must be provided.")
if embedding_functions is not None:
logging.warning(
"embedding_functions is not yet supported on LanceDB Cloud."
@@ -239,24 +236,9 @@ class RemoteDBConnection(DBConnection):
if mode is not None:
logging.warning("mode is not yet supported on LanceDB Cloud.")
- if inspect.isclass(schema) and issubclass(schema, LanceModel):
- # convert LanceModel to pyarrow schema
- # note that it's possible this contains
- # embedding function metadata already
- schema = schema.to_arrow_schema()
-
- if data is not None:
- data, schema = _sanitize_data(
- data,
- schema,
- metadata=None,
- on_bad_vectors=on_bad_vectors,
- fill_value=fill_value,
- )
- else:
- if schema is None:
- raise ValueError("Either data or schema must be provided")
- data = pa.Table.from_pylist([], schema=schema)
+ data, schema = sanitize_create_table(
+ data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+ )
from .table import RemoteTable
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 46df91c2..53e624a0 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -117,15 +117,50 @@ def _sanitize_data(
data = _sanitize_schema(
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
+ if schema is None:
+ schema = data.schema
elif isinstance(data, Iterable):
data = _to_record_batch_generator(
data, schema, metadata, on_bad_vectors, fill_value
)
+ if schema is None:
+ data, schema = _generator_to_data_and_schema(data)
+ if schema is None:
+ raise ValueError("Cannot infer schema from generator data")
else:
raise TypeError(f"Unsupported data type: {type(data)}")
return data, schema
+def sanitize_create_table(
+    data, schema, metadata=None, on_bad_vectors="error", fill_value=0.0
+):
+    """Normalize the ``data``/``schema`` pair given to a create-table call.
+
+    Parameters
+    ----------
+    data
+        Initial table contents, or ``None`` to create the table from
+        ``schema`` alone. Non-``None`` data is passed through
+        ``_sanitize_data``.
+    schema
+        A schema object or a ``LanceModel`` subclass (converted with
+        ``to_arrow_schema()``), or ``None`` to take the schema from ``data``.
+    metadata
+        When truthy, applied to the resulting schema via ``with_metadata``.
+    on_bad_vectors, fill_value
+        Forwarded unchanged to ``_sanitize_data``.
+
+    Returns
+    -------
+    tuple
+        ``(data, schema)``; ``data`` is still ``None`` when none was supplied.
+
+    Raises
+    ------
+    ValueError
+        If both ``data`` and ``schema`` are ``None``.
+    """
+    # With neither input there is nothing to derive a schema from.
+    if data is None and schema is None:
+        raise ValueError("Either data or schema must be provided")
+
+    is_lance_model = inspect.isclass(schema) and issubclass(schema, LanceModel)
+    if is_lance_model:
+        # The converted schema may already carry embedding function metadata.
+        schema = schema.to_arrow_schema()
+
+    if data is not None:
+        sanitized = _sanitize_data(
+            data,
+            schema,
+            metadata=metadata,
+            on_bad_vectors=on_bad_vectors,
+            fill_value=fill_value,
+        )
+        data, schema = sanitized
+        # Fall back to the schema carried by the sanitized data itself.
+        if schema is None and hasattr(data, "schema"):
+            schema = data.schema
+
+    if metadata:
+        schema = schema.with_metadata(metadata)
+
+    return data, schema
+
+
def _schema_from_hf(data, schema):
"""
Extract pyarrow schema from HuggingFace DatasetDict
@@ -187,8 +222,30 @@ def _append_vector_col(data: pa.Table, metadata: dict, schema: Optional[pa.Schem
return data
+def _generator_to_data_and_schema(
+    data: Iterable,
+) -> Tuple[Iterable[pa.RecordBatch], Optional[pa.Schema]]:
+    """Peek at the first item of ``data`` to discover its Arrow schema.
+
+    Parameters
+    ----------
+    data
+        An iterable (or iterator) of ``pa.RecordBatch`` / ``pa.Table`` items.
+
+    Returns
+    -------
+    tuple
+        ``(data, schema)``. ``data`` is an iterator that still yields every
+        item of the input, including the one consumed while peeking; a
+        leading ``pa.Table`` is flattened into its record batches. ``schema``
+        is the schema of the first item, or ``None`` when the input is empty
+        or its first item is neither a ``pa.RecordBatch`` nor a ``pa.Table``.
+    """
+
+    def _prepend(head, rest):
+        # Re-emit the items consumed while peeking, then the remainder.
+        yield from head
+        yield from rest
+
+    # ``next`` needs an iterator; ``iter`` returns an iterator unchanged.
+    stream = iter(data)
+    first = next(stream, None)
+    if first is None:
+        return stream, None
+    if isinstance(first, pa.RecordBatch):
+        return _prepend([first], stream), first.schema
+    if isinstance(first, pa.Table):
+        # Yield the table's batches one by one rather than as a single list.
+        return _prepend(first.to_batches(), stream), first.schema
+    # Unrecognized item: keep it in the stream and report no schema.
+    return _prepend([first], stream), None
+
+
def _to_record_batch_generator(
- data: Iterable, schema, metadata, on_bad_vectors, fill_value
+ data: Iterable,
+ schema,
+ metadata,
+ on_bad_vectors,
+ fill_value,
):
for batch in data:
# always convert to table because we need to sanitize the data
@@ -1569,12 +1626,6 @@ class LanceTable(Table):
The embedding functions to use when creating the table.
"""
tbl = LanceTable(db, name)
- if inspect.isclass(schema) and issubclass(schema, LanceModel):
- # convert LanceModel to pyarrow schema
- # note that it's possible this contains
- # embedding function metadata already
- schema = schema.to_arrow_schema()
-
metadata = None
if embedding_functions is not None:
# If we passed in embedding functions explicitly
@@ -1583,33 +1634,11 @@ class LanceTable(Table):
registry = EmbeddingFunctionRegistry.get_instance()
metadata = registry.get_table_metadata(embedding_functions)
- if data is not None:
- data, schema = _sanitize_data(
- data,
- schema,
- metadata=metadata,
- on_bad_vectors=on_bad_vectors,
- fill_value=fill_value,
- )
+ data, schema = sanitize_create_table(
+ data, schema, metadata, on_bad_vectors, fill_value
+ )
- if schema is None:
- if data is None:
- raise ValueError("Either data or schema must be provided")
- elif hasattr(data, "schema"):
- schema = data.schema
- elif isinstance(data, Iterable):
- if metadata:
- raise TypeError(
- (
- "Persistent embedding functions not yet "
- "supported for generator data input"
- )
- )
-
- if metadata:
- schema = schema.with_metadata(metadata)
-
- empty = pa.Table.from_pylist([], schema=schema)
+ empty = pa.Table.from_batches([], schema=schema)
try:
lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
except OSError as err:
@@ -1708,6 +1737,7 @@ class LanceTable(Table):
full_text_query=query.full_text_query,
with_row_id=query.with_row_id,
batch_size=batch_size,
+ offset=query.offset,
).to_reader()
def _do_merge(
diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py
index 373ae2b6..5b7f3c42 100644
--- a/python/python/tests/test_db.py
+++ b/python/python/tests/test_db.py
@@ -233,6 +233,43 @@ def test_create_mode(tmp_path):
assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]
+def test_create_table_from_iterator(tmp_path):
+    """A sync connection can create a table from a generator of batches."""
+    column_names = ["vector", "item", "price"]
+
+    def gen_data():
+        # Ten identical single-row record batches.
+        for _ in range(10):
+            columns = [
+                pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)),
+                pa.array(["foo"]),
+                pa.array([10.0]),
+            ]
+            yield pa.RecordBatch.from_arrays(columns, column_names)
+
+    conn = lancedb.connect(tmp_path)
+    created = conn.create_table("test", data=gen_data())
+    assert created.count_rows() == 10
+
+
+@pytest.mark.asyncio
+async def test_create_table_from_iterator_async(tmp_path):
+    """An async connection can create a table from a generator of batches."""
+    column_names = ["vector", "item", "price"]
+
+    def gen_data():
+        # Ten identical single-row record batches.
+        for _ in range(10):
+            columns = [
+                pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)),
+                pa.array(["foo"]),
+                pa.array([10.0]),
+            ]
+            yield pa.RecordBatch.from_arrays(columns, column_names)
+
+    conn = await lancedb.connect_async(tmp_path)
+    created = await conn.create_table("test", data=gen_data())
+    row_count = await created.count_rows()
+    assert row_count == 10
+
+
def test_create_exist_ok(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(
diff --git a/python/python/tests/test_query.py b/python/python/tests/test_query.py
index ae50c991..11750e4d 100644
--- a/python/python/tests/test_query.py
+++ b/python/python/tests/test_query.py
@@ -51,6 +51,7 @@ class MockTable:
"refine_factor": query.refine_factor,
},
batch_size=batch_size,
+ offset=query.offset,
).to_reader()
@@ -106,6 +107,13 @@ def test_cast(table):
assert r0.float_field == 1.0
+def test_offset(table):
+    """``offset(1)`` drops one of the two rows the vector query returns."""
+    plain_query = LanceVectorQueryBuilder(table, [0, 0], "vector")
+    plain_rows = plain_query.to_pandas()
+    assert len(plain_rows) == 2
+
+    shifted_query = LanceVectorQueryBuilder(table, [0, 0], "vector").offset(1)
+    shifted_rows = shifted_query.to_pandas()
+    assert len(shifted_rows) == 1
+
+
def test_query_builder(table):
rs = (
LanceVectorQueryBuilder(table, [0, 0], "vector")
@@ -269,7 +277,10 @@ async def test_query_async(table_async: AsyncTable):
table_async.query().select({"foo": "id", "bar": "id + 1"}),
expected_columns=["foo", "bar"],
)
+
await check_query(table_async.query().limit(1), expected_num_rows=1)
+ await check_query(table_async.query().offset(1), expected_num_rows=1)
+
await check_query(
table_async.query().nearest_to(pa.array([1, 2])), expected_num_rows=2
)
diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py
index 6ca2f5f1..65cf0c9d 100644
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -2,13 +2,13 @@
# SPDX-FileCopyrightText: Copyright The Lance Authors
import functools
+import os
from copy import copy
from datetime import date, datetime, timedelta
from pathlib import Path
from time import sleep
from typing import List
from unittest.mock import PropertyMock, patch
-import os
import lance
import lancedb
@@ -907,6 +907,16 @@ def test_hybrid_search(db, tmp_path):
"Our father who art in heaven", query_type="hybrid"
).to_pydantic(MyTable)
+ # Test that double and single quote characters are handled with phrase_query()
+ (
+ table.search(
+ '"Aren\'t you a little short for a stormtrooper?" -- Leia',
+ query_type="hybrid",
+ )
+ .phrase_query(True)
+ .to_pydantic(MyTable)
+ )
+
assert result1 == result3
# with post filters
diff --git a/python/src/query.rs b/python/src/query.rs
index f88e60b4..42bd4a13 100644
--- a/python/src/query.rs
+++ b/python/src/query.rs
@@ -64,6 +64,10 @@ impl Query {
self.inner = self.inner.clone().limit(limit as usize);
}
+    /// Rebuild the wrapped query with its offset set to `offset`.
+    pub fn offset(&mut self, offset: u32) {
+        let skip = offset as usize;
+        let updated = self.inner.clone().offset(skip);
+        self.inner = updated;
+    }
+
pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult {
let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
let array = make_array(data);
@@ -138,6 +142,10 @@ impl VectorQuery {
self.inner = self.inner.clone().limit(limit as usize);
}
+    /// Rebuild the wrapped vector query with its offset set to `offset`.
+    pub fn offset(&mut self, offset: u32) {
+        let skip = offset as usize;
+        let updated = self.inner.clone().offset(skip);
+        self.inner = updated;
+    }
+
pub fn column(&mut self, column: String) {
self.inner = self.inner.clone().column(&column);
}
diff --git a/rust/ffi/node/src/table.rs b/rust/ffi/node/src/table.rs
index 10e7f19b..3e49d742 100644
--- a/rust/ffi/node/src/table.rs
+++ b/rust/ffi/node/src/table.rs
@@ -391,7 +391,7 @@ impl JsTable {
materialize_deletions_threshold.value(&mut cx) as f32;
}
if let Some(num_threads) = js_options.get_opt::(&mut cx, "numThreads")? {
- options.num_threads = num_threads.value(&mut cx) as usize;
+ options.num_threads = Some(num_threads.value(&mut cx) as usize);
}
rt.spawn(async move {
diff --git a/rust/lancedb/src/query.rs b/rust/lancedb/src/query.rs
index 714200ae..d2895668 100644
--- a/rust/lancedb/src/query.rs
+++ b/rust/lancedb/src/query.rs
@@ -338,6 +338,12 @@ pub trait QueryBase {
/// it will default to 10.
fn limit(self, limit: usize) -> Self;
+ /// Set the offset of the query.
+ ///
+ /// By default, the query returns rows starting with the first one.
+ /// Use this method to skip the first `offset` rows.
+ fn offset(self, offset: usize) -> Self;
+
/// Only return rows which match the filter.
///
/// The filter should be supplied as an SQL query string. For example:
@@ -408,6 +414,11 @@ impl QueryBase for T {
self
}
+    // Store the requested offset on the underlying base query.
+    fn offset(mut self, offset: usize) -> Self {
+        let base = self.mut_query();
+        base.offset = Some(offset);
+        self
+    }
+
fn only_if(mut self, filter: impl AsRef) -> Self {
self.mut_query().filter = Some(filter.as_ref().to_string());
self
@@ -520,6 +531,9 @@ pub struct Query {
/// limit the number of rows to return.
pub(crate) limit: Option,
+ /// Offset of the query.
+ pub(crate) offset: Option,
+
/// Apply filter to the returned rows.
pub(crate) filter: Option,
@@ -541,6 +555,7 @@ impl Query {
Self {
parent,
limit: None,
+ offset: None,
filter: None,
full_text_search: None,
select: Select::All,
@@ -858,6 +873,7 @@ mod tests {
let query = table
.query()
.limit(100)
+ .offset(1)
.nearest_to(&[9.8, 8.7])
.unwrap()
.nprobes(1000)
@@ -870,6 +886,7 @@ mod tests {
new_vector
);
assert_eq!(query.base.limit.unwrap(), 100);
+ assert_eq!(query.base.offset.unwrap(), 1);
assert_eq!(query.nprobes, 1000);
assert!(query.use_index);
assert_eq!(query.distance_type, Some(DistanceType::Cosine));
@@ -916,10 +933,26 @@ mod tests {
let result = query.execute().await;
let mut stream = result.expect("should have result");
// should only have one batch
+
while let Some(batch) = stream.next().await {
// pre filter should return 10 rows
assert!(batch.expect("should be Ok").num_rows() == 10);
}
+
+ let query = table
+ .query()
+ .limit(10)
+ .offset(1)
+ .only_if(String::from("id % 2 == 0"))
+ .nearest_to(&[0.1; 4])
+ .unwrap();
+ let result = query.execute().await;
+ let mut stream = result.expect("should have result");
+ // should only have one batch
+ while let Some(batch) = stream.next().await {
+ // the same query without an offset returns 10 rows; offset(1) leaves 9
+ assert!(batch.expect("should be Ok").num_rows() == 9);
+ }
}
#[tokio::test]
diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs
index 12254819..88c23533 100644
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -1852,9 +1852,16 @@ impl TableInternal for NativeTable {
query_vector,
query.base.limit.unwrap_or(DEFAULT_TOP_K),
)?;
+ scanner.limit(
+ query.base.limit.map(|limit| limit as i64),
+ query.base.offset.map(|offset| offset as i64),
+ )?;
} else {
// If there is no vector query, it's ok to not have a limit
- scanner.limit(query.base.limit.map(|limit| limit as i64), None)?;
+ scanner.limit(
+ query.base.limit.map(|limit| limit as i64),
+ query.base.offset.map(|offset| offset as i64),
+ )?;
}
scanner.nprobs(query.nprobes);
@@ -2781,7 +2788,7 @@ mod tests {
.get_index_type(index_uuid)
.await
.unwrap(),
- Some("IVF".to_string())
+ Some("IVF_PQ".to_string())
);
assert_eq!(
table