Merge branch 'main' of https://github.com/lancedb/lancedb into yang/relative-lance-dep

This commit is contained in:
BubbleCal
2024-09-09 08:09:13 +08:00
24 changed files with 602 additions and 135 deletions

109
.github/workflows/java-publish.yml vendored Normal file
View File

@@ -0,0 +1,109 @@
name: Build and publish Java packages
# Builds the JNI native library on the non-x86 targets, then copies those
# libraries into the Java module that is built (and, on release, deployed)
# from the linux-x86 job.
on:
  release:
    types: [released]
  # Also run when this workflow file itself changes so edits are exercised
  # before merging. The deploy step is skipped for these runs.
  pull_request:
    paths:
      - .github/workflows/java-publish.yml
jobs:
  macos-arm64:
    name: Build on MacOS Arm64
    runs-on: macos-14
    timeout-minutes: 45
    defaults:
      run:
        working-directory: ./java/core/lancedb-jni
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - uses: Swatinem/rust-cache@v2
      - name: Install dependencies
        run: |
          brew install protobuf
      - name: Build release
        run: |
          cargo build --release
      # NOTE(review): `defaults.run.working-directory` only affects `run`
      # steps, so this path is resolved from the repository root - confirm the
      # cargo target directory really lives there.
      - uses: actions/upload-artifact@v4
        with:
          name: liblancedb_jni_darwin_aarch64.zip
          path: target/release/liblancedb_jni.dylib
          retention-days: 1
          if-no-files-found: error
  linux-arm64:
    name: Build on Linux Arm64
    runs-on: warp-ubuntu-2204-arm64-8x
    timeout-minutes: 45
    defaults:
      run:
        working-directory: ./java/core/lancedb-jni
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: "1.79.0"
          cache-workspaces: "./java/core/lancedb-jni"
          # Disable full debug symbol generation to speed up CI build and keep memory down
          # "1" means line tables only, which is useful for panic tracebacks.
          rustflags: "-C debuginfo=1"
      - name: Install dependencies
        run: |
          sudo apt -y -qq update
          sudo apt install -y protobuf-compiler libssl-dev pkg-config
      - name: Build release
        run: |
          cargo build --release
      - uses: actions/upload-artifact@v4
        with:
          name: liblancedb_jni_linux_aarch64.zip
          path: target/release/liblancedb_jni.so
          retention-days: 1
          if-no-files-found: error
  linux-x86:
    runs-on: warp-ubuntu-2204-x64-8x
    timeout-minutes: 30
    # Wait for the native builds: their artifacts are copied into the Java
    # module below.
    needs: [macos-arm64, linux-arm64]
    defaults:
      run:
        working-directory: ./java
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - uses: Swatinem/rust-cache@v2
      - name: Set up Java 8
        uses: actions/setup-java@v4
        with:
          distribution: temurin
          java-version: 8
          cache: "maven"
          server-id: ossrh
          # These three inputs take the NAMES of environment variables, not the
          # secret values; the values are supplied via `env` on the publish step.
          server-username: SONATYPE_USER
          server-password: SONATYPE_TOKEN
          gpg-passphrase: GPG_PASSPHRASE
          gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
      - name: Install dependencies
        run: |
          sudo apt -y -qq update
          sudo apt install -y protobuf-compiler libssl-dev pkg-config
      # With no `name` input every artifact of the run is downloaded, each into
      # a directory named after the artifact, under the repository root.
      - name: Download artifact
        uses: actions/download-artifact@v4
      - name: Copy native libs
        run: |
          mkdir -p ./core/target/classes/nativelib/darwin-aarch64 ./core/target/classes/nativelib/linux-aarch64
          cp ../liblancedb_jni_darwin_aarch64.zip/liblancedb_jni.dylib ./core/target/classes/nativelib/darwin-aarch64/liblancedb_jni.dylib
          cp ../liblancedb_jni_linux_aarch64.zip/liblancedb_jni.so ./core/target/classes/nativelib/linux-aarch64/liblancedb_jni.so
      - name: Set github
        run: |
          git config --global user.email "dev+gha@lancedb.com"
          git config --global user.name "LanceDB Github Runner"
      # Pull-request runs only verify that the project still packages; nothing
      # is signed or uploaded.
      - name: Dry run
        if: github.event_name == 'pull_request'
        run: |
          mvn --batch-mode -DskipTests package
      - name: Publish with Java 8
        if: github.event_name == 'release'
        run: |
          echo "use-agent" >> ~/.gnupg/gpg.conf
          echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
          export GPG_TTY=$(tty)
          mvn --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase="${GPG_PASSPHRASE}" deploy -P deploy-to-ossrh
        env:
          SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
          SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
          # Read through the environment and quoted in the script, so the
          # secret is never spliced into the shell source by the template engine.
          GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}

View File

@@ -26,6 +26,7 @@ theme:
- content.code.copy
- content.tabs.link
- content.action.edit
- content.tooltips
- toc.follow
- navigation.top
- navigation.tabs
@@ -35,6 +36,7 @@ theme:
- navigation.instant
icon:
repo: fontawesome/brands/github
annotation: material/arrow-right-circle
custom_dir: overrides
plugins:
@@ -76,7 +78,12 @@ markdown_extensions:
- pymdownx.tabbed:
alternate_style: true
- md_in_html
- abbr
- attr_list
- pymdownx.snippets
- pymdownx.emoji:
emoji_index: !!python/name:material.extensions.emoji.twemoji
emoji_generator: !!python/name:material.extensions.emoji.to_svg
nav:
- Home:

View File

@@ -15,11 +15,13 @@ HNSW also combines this with the ideas behind a classic 1-dimensional search dat
## k-Nearest Neighbor Graphs and k-approximate Nearest neighbor Graphs
The k-nearest neighbor graph actually predates its use for ANN search. Its construction is quite simple:
* Each vector in the dataset is given an associated vertex.
* Each vertex has outgoing edges to its k nearest neighbors. That is, the k closest other vertices by Euclidean distance between the two corresponding vectors. This can be thought of as a "friend list" for the vertex.
* For some applications (including nearest-neighbor search), the incoming edges are also added.
Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors:
* Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node).
* Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to that vertex.
* Repeat until a local optimum is found.
@@ -36,15 +38,18 @@ One downside of k-NN and k-ANN graphs alone is that one must typically build the
## HNSW: Hierarchical Navigable Small Worlds
HNSW builds on k-ANN in two main ways:
* Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing for the number of edges per vertex to be limited to a relatively small constant.
* The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list.
This recursive structure can be thought of as separating into layers:
* At the bottom-most layer, a k-ANN graph on the whole dataset is present.
* At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present.
* At the Lth layer, a k-ANN graph is present. It is over a (constant) fraction (e.g. 10%) of the vectors/vertices present in the L-1th layer.
Then the greedy search routine operates as follows:
* At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer.
* Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method.
* Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. top 10).

View File

@@ -17,7 +17,7 @@ from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
db = lancedb.connect(tmp_path)
func = get_registry.get("imagebind").create()
func = get_registry().get("imagebind").create()
class ImageBindModel(LanceModel):
text: str

View File

@@ -20,7 +20,7 @@ from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
db = lancedb.connect(tmp_path)
func = get_registry.get("open-clip").create()
func = get_registry().get("open-clip").create()
class Images(LanceModel):
label: str

View File

@@ -4,13 +4,14 @@ Using cohere API requires cohere package, which can be installed using `pip inst
You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API.
Supported models are:
* embed-english-v3.0
* embed-multilingual-v3.0
* embed-english-light-v3.0
* embed-multilingual-light-v3.0
* embed-english-v2.0
* embed-english-light-v2.0
* embed-multilingual-v2.0
- embed-english-v3.0
- embed-multilingual-v3.0
- embed-english-light-v3.0
- embed-multilingual-light-v3.0
- embed-english-v2.0
- embed-english-light-v2.0
- embed-multilingual-v2.0
Supported parameters (to be passed in `create` method) are:

View File

@@ -1,30 +1,84 @@
There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models.
# 📚 Available Embedding Models
## Text embedding functions
Contains the text embedding functions registered by default.
There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models. 🚀
* Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff.
* Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
Before jumping on the list of available models, let's understand how to get an embedding model initialized and configured to use in our code:
**Available Text Embeddings**:
!!! example "Example usage"
```python
model = (
    get_registry()
    .get("openai")
    .create(name="text-embedding-ada-002")
)
```
- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md)
- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md)
- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md)
- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md)
- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md)
- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md)
- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md)
- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md)
- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md)
- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md)
Now let's understand the above syntax:
```python
model = get_registry().get("model_id").create(...params)
```
**This👆 line effectively creates a configured instance of an `embedding function` with `model` of choice that is ready for use.**
- `get_registry()` : This function call returns an instance of a `EmbeddingFunctionRegistry` object. This registry manages the registration and retrieval of embedding functions.
- `.get("model_id")` : This method is called on the registry object and retrieves the **embedding model function** associated with the `"model_id"` (1) .
{ .annotate }
1. Hover over the names in table below to find out the `model_id` of different embedding functions.
- `.create(...params)` : This method call is on the object returned by the `get` method. It instantiates an embedding model function using the **specified parameters**.
??? question "What parameters does the `.create(...params)` method accept?"
**Checkout the documentation of specific embedding models (links in the table below👇) to know what parameters it takes**.
!!! tip "Moving on"
Now that we know how to get the **desired embedding model** and use it in our code, let's explore the comprehensive **list** of embedding models **supported by LanceDB**, in the tables below.
## Text Embedding Functions 📝
These functions are registered by default to handle text embeddings.
- 🔄 **Embedding functions** have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with **exponential backoff**.
- 🌕 Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
🌟 **Available Text Embeddings**
| **Embedding** :material-information-outline:{ title="Hover over the name to find out the model_id" } | **Description** | **Documentation** |
|-----------|-------------|---------------|
| [**Sentence Transformers**](available_embedding_models/text_embedding_functions/sentence_transformers.md "sentence-transformers") | 🧠 **SentenceTransformers** is a Python framework for state-of-the-art sentence, text, and image embeddings. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/sbert_2.png" alt="Sentence Transformers Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/sentence_transformers.md)|
| [**Huggingface Models**](available_embedding_models/text_embedding_functions/huggingface_embedding.md "huggingface") |🤗 We offer support for all **Huggingface** models. The default model is `colbert-ir/colbertv2.0`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/hugging_face.png" alt="Huggingface Icon" width="130" height="35">](available_embedding_models/text_embedding_functions/huggingface_embedding.md) |
| [**Ollama Embeddings**](available_embedding_models/text_embedding_functions/ollama_embedding.md "ollama") | 🔍 Generate embeddings via the **Ollama** python library. Ollama supports embedding models, making it possible to build RAG apps. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/Ollama.png" alt="Ollama Icon" width="110" height="35">](available_embedding_models/text_embedding_functions/ollama_embedding.md)|
| [**OpenAI Embeddings**](available_embedding_models/text_embedding_functions/openai_embedding.md "openai")| 🔑 **OpenAI's** text embeddings measure the relatedness of text strings. **LanceDB** supports state-of-the-art embeddings from OpenAI. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/openai.png" alt="OpenAI Icon" width="100" height="35">](available_embedding_models/text_embedding_functions/openai_embedding.md)|
| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | 📚 **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/instructor_embedding.png" alt="Instructor Embedding Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/instructor_embedding.md) |
| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | 🌌 Google's Gemini API generates state-of-the-art embeddings for words, phrases, and sentences. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/gemini.png" alt="Gemini Icon" width="95" height="35">](available_embedding_models/text_embedding_functions/gemini_embedding.md) |
| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | 💬 This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/cohere.png" alt="Cohere Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/cohere_embedding.md) |
| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |
| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/aws_bedrock.png" alt="AWS Bedrock Icon" width="120" height="35">](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |
| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/watsonx.png" alt="Watsonx Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
## Multi-modal embedding functions
Multi-modal embedding functions allow you to query your table using both images and text.
**Available Multi-modal Embeddings** :
[st-key]: "sentence-transformers"
[hf-key]: "huggingface"
[ollama-key]: "ollama"
[openai-key]: "openai"
[instructor-key]: "instructor"
[gemini-key]: "gemini-text"
[cohere-key]: "cohere"
[jina-key]: "jina"
[aws-key]: "bedrock-text"
[watsonx-key]: "watsonx"
- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md)
- [Imagebind Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)
- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md)
## Multi-modal Embedding Functions🖼️
Multi-modal embedding functions allow you to query your table using both images and text. 💬🖼️
🌐 **Available Multi-modal Embeddings**
| Embedding :material-information-outline:{ title="Hover over the name to find out the model_id" } | Description | Documentation |
|-----------|-------------|---------------|
| [**OpenClip Embeddings**](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md "open-clip") | 🎨 We support CLIP model embeddings using the open source alternative, **open-clip** which supports various customizations. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/openclip_github.png" alt="openclip Icon" width="150" height="35">](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) |
| [**Imagebind Embeddings**](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md "imagebind") | 🌌 We have support for **imagebind model embeddings**. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/imagebind_meta.png" alt="imagebind Icon" width="150" height="35">](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)|
| [**Jina Multi-modal Embeddings**](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md "jina") | 🔗 **Jina embeddings** can also be used to embed both **text** and **image** data, only some of the models support image data and you can check the detailed documentation. 👉 | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="jina Icon" width="90" height="35">](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) |
!!! note
If you'd like to request support for additional **embedding functions**, please feel free to open an issue on our LanceDB [GitHub issue page](https://github.com/lancedb/lancedb/issues).

View File

@@ -416,7 +416,6 @@ You can create an empty table for scenarios where you want to add data to the ta
=== "Python"
```python
An empty table can be initialized via a PyArrow schema.

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.1-SNAPSHOT</version>
<version>0.0.3</version>
<relativePath>../pom.xml</relativePath>
</parent>
@@ -68,7 +68,7 @@
</goals>
<configuration>
<path>lancedb-jni</path>
<!--<release>true</release>-->
<release>true</release>
<!-- Copy native libraries to target/classes for runtime access -->
<copyTo>${project.build.directory}/classes/nativelib</copyTo>
<copyWithPlatformDir>true</copyWithPlatformDir>

View File

@@ -6,15 +6,28 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.1-SNAPSHOT</version>
<version>0.0.3</version>
<packaging>pom</packaging>
<name>Lance Parent</name>
<name>LanceDB Parent</name>
<description>LanceDB vector database Java API</description>
<url>http://lancedb.com/</url>
<developers>
<developer>
<name>Lance DB Dev Group</name>
<email>dev@lancedb.com</email>
</developer>
</developers>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
</license>
</licenses>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<arrow.version>15.0.0</arrow.version>
</properties>
@@ -22,6 +35,12 @@
<module>core</module>
</modules>
<scm>
<connection>scm:git:https://github.com/lancedb/lancedb.git</connection>
<developerConnection>scm:git:ssh://git@github.com/lancedb/lancedb.git</developerConnection>
<url>https://github.com/lancedb/lancedb</url>
</scm>
<dependencyManagement>
<dependencies>
<dependency>
@@ -62,8 +81,45 @@
</dependencies>
</dependencyManagement>
<distributionManagement>
<snapshotRepository>
<id>ossrh</id>
<url>https://s01.oss.sonatype.org/content/repositories/snapshots</url>
</snapshotRepository>
<repository>
<id>ossrh</id>
<url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>
</distributionManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
@@ -126,4 +182,82 @@
</plugins>
</pluginManagement>
</build>
<profiles>
<profile>
<id>jdk8</id>
<activation>
<jdk>[1.8,1.8.999]</jdk>
</activation>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
</profile>
<profile>
<id>jdk11+</id>
<activation>
<jdk>[11,)</jdk>
</activation>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<build>
<plugins>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.5</version>
<configuration>
<argLine>--add-opens=java.base/java.nio=ALL-UNNAMED</argLine>
<forkNode implementation="org.apache.maven.plugin.surefire.extensions.SurefireForkNodeFactory" />
<useSystemClassLoader>false</useSystemClassLoader>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>deploy-to-ossrh</id>
<build>
<plugins>
<plugin>
<groupId>org.sonatype.central</groupId>
<artifactId>central-publishing-maven-plugin</artifactId>
<version>0.4.0</version>
<extensions>true</extensions>
<configuration>
<publishingServerId>ossrh</publishingServerId>
<tokenAuth>true</tokenAuth>
</configuration>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://s01.oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.5</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@@ -3,7 +3,7 @@ name = "lancedb"
# version in Cargo.toml
dependencies = [
"deprecation",
"pylance==0.17.0-beta.2",
"pylance==0.17.0",
"ratelimiter~=1.0",
"requests>=2.31.0",
"retry>=0.9.2",

View File

@@ -73,6 +73,7 @@ class Query:
def where(self, filter: str): ...
def select(self, columns: Tuple[str, str]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
def nearest_to_text(self, query: dict) -> Query: ...
async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
@@ -83,6 +84,7 @@ class VectorQuery:
def select(self, columns: List[str]): ...
def select_with_projection(self, columns: Tuple[str, str]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
def column(self, column: str): ...
def distance_type(self, distance_type: str): ...
def postfilter(self): ...

View File

@@ -14,7 +14,6 @@
from __future__ import annotations
import asyncio
import inspect
import os
from abc import abstractmethod
from pathlib import Path
@@ -27,8 +26,13 @@ from pyarrow import fs
from lancedb.common import data_to_reader, validate_schema
from ._lancedb import connect as lancedb_connect
from .pydantic import LanceModel
from .table import AsyncTable, LanceTable, Table, _sanitize_data, _table_path
from .table import (
AsyncTable,
LanceTable,
Table,
_table_path,
sanitize_create_table,
)
from .util import (
fs_from_uri,
get_uri_location,
@@ -37,6 +41,7 @@ from .util import (
)
if TYPE_CHECKING:
from .pydantic import LanceModel
from datetime import timedelta
from ._lancedb import Connection as LanceDbConnection
@@ -722,12 +727,6 @@ class AsyncConnection(object):
... await db.create_table("table4", make_batches(), schema=schema)
>>> asyncio.run(iterable_example())
"""
if inspect.isclass(schema) and issubclass(schema, LanceModel):
# convert LanceModel to pyarrow schema
# note that it's possible this contains
# embedding function metadata already
schema = schema.to_arrow_schema()
metadata = None
# Defining defaults here and not in function prototype. In the future
@@ -738,31 +737,9 @@ class AsyncConnection(object):
if fill_value is None:
fill_value = 0.0
if data is not None:
data, schema = _sanitize_data(
data,
schema,
metadata=metadata,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
if schema is None:
if data is None:
raise ValueError("Either data or schema must be provided")
elif hasattr(data, "schema"):
schema = data.schema
elif isinstance(data, Iterable):
if metadata:
raise TypeError(
(
"Persistent embedding functions not yet "
"supported for generator data input"
)
)
if metadata:
schema = schema.with_metadata(metadata)
data, schema = sanitize_create_table(
data, schema, metadata, on_bad_vectors, fill_value
)
validate_schema(schema)
if exist_ok is None:

View File

@@ -42,9 +42,9 @@ if TYPE_CHECKING:
import PIL
import polars as pl
from .common import VEC
from ._lancedb import Query as LanceQuery
from ._lancedb import VectorQuery as LanceVectorQuery
from .common import VEC
from .pydantic import LanceModel
from .table import Table
@@ -85,6 +85,8 @@ class Query(pydantic.BaseModel):
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
tuning advice.
offset: int
The offset to start fetching results from
"""
vector_column: Optional[str] = None
@@ -119,6 +121,8 @@ class Query(pydantic.BaseModel):
with_row_id: bool = False
offset: int = 0
class LanceQueryBuilder(ABC):
"""An abstract query builder. Subclasses are defined for vector search,
@@ -233,6 +237,7 @@ class LanceQueryBuilder(ABC):
def __init__(self, table: "Table"):
self._table = table
self._limit = 10
self._offset = 0
self._columns = None
self._where = None
self._prefilter = False
@@ -371,6 +376,25 @@ class LanceQueryBuilder(ABC):
self._limit = limit
return self
def offset(self, offset: int) -> LanceQueryBuilder:
    """Set the offset for the results.

    Parameters
    ----------
    offset: int
        The offset to start fetching results from. ``None``, zero and
        negative values are all stored as 0.

    Returns
    -------
    LanceQueryBuilder
        The LanceQueryBuilder object.
    """
    # Anything that is not a strictly positive offset collapses to 0.
    use_default = offset is None or offset <= 0
    self._offset = 0 if use_default else offset
    return self
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
"""Set the columns to return.
@@ -649,6 +673,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
refine_factor=self._refine_factor,
vector_column=self._vector_column,
with_row_id=self._with_row_id,
offset=self._offset,
)
result_set = self._table._execute_query(query, batch_size)
if self._reranker is not None:
@@ -780,6 +805,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
"columns": self._fts_columns,
},
vector=[],
offset=self._offset,
)
results = self._table._execute_query(query)
results = results.read_all()
@@ -826,7 +852,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
)
if len(row_ids) == 0:
empty_schema = pa.schema([pa.field("_score", pa.float32())])
return pa.Table.from_pylist([], schema=empty_schema)
return pa.Table.from_batches([], schema=empty_schema)
scores = pa.array(scores)
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
output_tbl = output_tbl.append_column("_score", scores)
@@ -939,6 +965,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._reranker = RRFReranker()
self._nprobes = None
self._refine_factor = None
self._phrase_query = False
def _validate_query(self, query, vector=None, text=None):
if query is not None and (vector is not None or text is not None):
@@ -960,6 +987,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
return vector_query, text_query
def phrase_query(self, phrase_query: bool = True) -> LanceHybridQueryBuilder:
    """Toggle phrase-query mode for this hybrid query.

    Parameters
    ----------
    phrase_query: bool, default True
        If True, then the query will be wrapped in quotes and
        double quotes replaced by single quotes.

    Returns
    -------
    LanceHybridQueryBuilder
        The LanceHybridQueryBuilder object.
    """
    # Only the flag is recorded here; it is read later by the builder.
    enabled = phrase_query
    self._phrase_query = enabled
    return self
def to_arrow(self) -> pa.Table:
vector_query, fts_query = self._validate_query(
self._query, self._vector, self._text
@@ -986,6 +1030,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
if self._with_row_id:
self._vector_query.with_row_id(True)
self._fts_query.with_row_id(True)
if self._phrase_query:
self._fts_query.phrase_query(True)
if self._nprobes:
self._vector_query.nprobes(self._nprobes)
if self._refine_factor:
@@ -1220,6 +1266,18 @@ class AsyncQueryBase(object):
self._inner.limit(limit)
return self
def offset(self, offset: int) -> AsyncQuery:
    """
    Set the offset for the results.

    The value is passed unchanged to the wrapped inner query, and this
    object is returned so that calls can be chained.

    Parameters
    ----------
    offset: int
        The offset to start fetching results from.
    """
    inner_query = self._inner
    inner_query.offset(offset)
    return self
async def to_batches(
self, *, max_batch_length: Optional[int] = None
) -> AsyncRecordBatchReader:

View File

@@ -11,12 +11,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Iterable, Union
import pyarrow as pa
def to_ipc_binary(table: pa.Table) -> bytes:
def to_ipc_binary(table: Union[pa.Table, Iterable[pa.RecordBatch]]) -> bytes:
    """Serialize a PyArrow Table, or an iterable of record batches, to IPC bytes.

    Input that is an instance of ``typing.Iterable`` is first combined into a
    single table with ``pa.Table.from_batches``. The table is then written to
    an in-memory IPC stream using its own schema, and the stream contents are
    returned as ``bytes``.
    """
    if isinstance(table, Iterable):
        table = pa.Table.from_batches(table)
    buffer = pa.BufferOutputStream()
    with pa.ipc.new_stream(buffer, table.schema) as stream_writer:
        stream_writer.write_table(table)
    return buffer.getvalue().to_pybytes()

View File

@@ -11,7 +11,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import logging
import uuid
from concurrent.futures import ThreadPoolExecutor
@@ -26,7 +25,7 @@ from ..common import DATA
from ..db import DBConnection
from ..embeddings import EmbeddingFunctionConfig
from ..pydantic import LanceModel
from ..table import Table, _sanitize_data
from ..table import Table, sanitize_create_table
from ..util import validate_table_name
from .arrow import to_ipc_binary
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
@@ -228,8 +227,6 @@ class RemoteDBConnection(DBConnection):
"""
validate_table_name(name)
if data is None and schema is None:
raise ValueError("Either data or schema must be provided.")
if embedding_functions is not None:
logging.warning(
"embedding_functions is not yet supported on LanceDB Cloud."
@@ -239,24 +236,9 @@ class RemoteDBConnection(DBConnection):
if mode is not None:
logging.warning("mode is not yet supported on LanceDB Cloud.")
if inspect.isclass(schema) and issubclass(schema, LanceModel):
# convert LanceModel to pyarrow schema
# note that it's possible this contains
# embedding function metadata already
schema = schema.to_arrow_schema()
if data is not None:
data, schema = _sanitize_data(
data,
schema,
metadata=None,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
else:
if schema is None:
raise ValueError("Either data or schema must be provided")
data = pa.Table.from_pylist([], schema=schema)
data, schema = sanitize_create_table(
data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
from .table import RemoteTable

View File

@@ -117,15 +117,50 @@ def _sanitize_data(
data = _sanitize_schema(
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
if schema is None:
schema = data.schema
elif isinstance(data, Iterable):
data = _to_record_batch_generator(
data, schema, metadata, on_bad_vectors, fill_value
)
if schema is None:
data, schema = _generator_to_data_and_schema(data)
if schema is None:
raise ValueError("Cannot infer schema from generator data")
else:
raise TypeError(f"Unsupported data type: {type(data)}")
return data, schema
def sanitize_create_table(
    data, schema, metadata=None, on_bad_vectors="error", fill_value=0.0
):
    """Normalize the ``data`` / ``schema`` pair used when creating a table.

    Parameters
    ----------
    data:
        The initial table data, or None when only a schema is supplied.
    schema:
        A schema object, a ``LanceModel`` subclass, or None.
    metadata:
        Optional schema metadata; when truthy it is applied to the final
        schema with ``with_metadata``.
    on_bad_vectors, fill_value:
        Forwarded unchanged to ``_sanitize_data``.

    Returns
    -------
    tuple
        The ``(data, schema)`` pair after sanitization.

    Raises
    ------
    ValueError
        If both ``data`` and ``schema`` end up being None.
    """
    schema_is_model = inspect.isclass(schema) and issubclass(schema, LanceModel)
    if schema_is_model:
        # convert LanceModel to pyarrow schema
        # note that it's possible this contains
        # embedding function metadata already
        schema = schema.to_arrow_schema()

    if data is not None:
        sanitize_options = {
            "metadata": metadata,
            "on_bad_vectors": on_bad_vectors,
            "fill_value": fill_value,
        }
        data, schema = _sanitize_data(data, schema, **sanitize_options)

    if schema is None:
        if data is None:
            raise ValueError("Either data or schema must be provided")
        if hasattr(data, "schema"):
            # Fall back to whatever schema the sanitized data carries.
            schema = data.schema

    if metadata:
        schema = schema.with_metadata(metadata)

    return data, schema
def _schema_from_hf(data, schema):
"""
Extract pyarrow schema from HuggingFace DatasetDict
@@ -187,8 +222,30 @@ def _append_vector_col(data: pa.Table, metadata: dict, schema: Optional[pa.Schem
return data
def _generator_to_data_and_schema(
    data: Iterable,
) -> Tuple[Iterable[pa.RecordBatch], pa.Schema]:
    """Peek at the first item of ``data`` to discover its schema.

    The first item is pulled off the stream, inspected, and then put back in
    front of the remaining items so that the returned iterable still yields
    everything the input held.

    Parameters
    ----------
    data: Iterable
        An iterable (or iterator) of ``pa.RecordBatch`` / ``pa.Table`` items.

    Returns
    -------
    tuple
        ``(data, schema)``. ``schema`` is taken from the first item when that
        item is a ``pa.RecordBatch`` or ``pa.Table``; it is None when the
        input is empty or the first item is of any other type.
    """

    def _with_head(head, rest):
        # Replay the inspected head item(s), then continue with the stream.
        yield from head
        yield from rest

    # Accept plain iterables such as lists too: ``next`` needs an iterator.
    data = iter(data)

    # A private sentinel distinguishes "stream is empty" from a yielded None.
    exhausted = object()
    first = next(data, exhausted)
    if first is exhausted:
        return data, None

    schema = None
    if isinstance(first, pa.RecordBatch):
        schema = first.schema
        head = [first]
    elif isinstance(first, pa.Table):
        schema = first.schema
        # Flatten the table so the stream yields record batches, not a list
        # of record batches as a single item.
        head = first.to_batches()
    else:
        # Unknown item type: no schema can be inferred, but the item must not
        # be silently dropped from the stream.
        head = [first]

    return _with_head(head, data), schema
def _to_record_batch_generator(
data: Iterable, schema, metadata, on_bad_vectors, fill_value
data: Iterable,
schema,
metadata,
on_bad_vectors,
fill_value,
):
for batch in data:
# always convert to table because we need to sanitize the data
@@ -1569,12 +1626,6 @@ class LanceTable(Table):
The embedding functions to use when creating the table.
"""
tbl = LanceTable(db, name)
if inspect.isclass(schema) and issubclass(schema, LanceModel):
# convert LanceModel to pyarrow schema
# note that it's possible this contains
# embedding function metadata already
schema = schema.to_arrow_schema()
metadata = None
if embedding_functions is not None:
# If we passed in embedding functions explicitly
@@ -1583,33 +1634,11 @@ class LanceTable(Table):
registry = EmbeddingFunctionRegistry.get_instance()
metadata = registry.get_table_metadata(embedding_functions)
if data is not None:
data, schema = _sanitize_data(
data,
schema,
metadata=metadata,
on_bad_vectors=on_bad_vectors,
fill_value=fill_value,
)
data, schema = sanitize_create_table(
data, schema, metadata, on_bad_vectors, fill_value
)
if schema is None:
if data is None:
raise ValueError("Either data or schema must be provided")
elif hasattr(data, "schema"):
schema = data.schema
elif isinstance(data, Iterable):
if metadata:
raise TypeError(
(
"Persistent embedding functions not yet "
"supported for generator data input"
)
)
if metadata:
schema = schema.with_metadata(metadata)
empty = pa.Table.from_pylist([], schema=schema)
empty = pa.Table.from_batches([], schema=schema)
try:
lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
except OSError as err:
@@ -1708,6 +1737,7 @@ class LanceTable(Table):
full_text_query=query.full_text_query,
with_row_id=query.with_row_id,
batch_size=batch_size,
offset=query.offset,
).to_reader()
def _do_merge(

View File

@@ -233,6 +233,43 @@ def test_create_mode(tmp_path):
assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]
def test_create_table_from_iterator(tmp_path):
    """A generator of record batches can be used as the data of a new table."""
    db = lancedb.connect(tmp_path)
    column_names = ["vector", "item", "price"]

    def gen_data():
        # Ten single-row batches that all share the same three columns.
        for _ in range(10):
            vector_col = pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2))
            item_col = pa.array(["foo"])
            price_col = pa.array([10.0])
            yield pa.RecordBatch.from_arrays(
                [vector_col, item_col, price_col],
                column_names,
            )

    table = db.create_table("test", data=gen_data())
    row_count = table.count_rows()
    assert row_count == 10
@pytest.mark.asyncio
async def test_create_table_from_iterator_async(tmp_path):
    """Async variant: a generator of record batches can seed a new table."""
    db = await lancedb.connect_async(tmp_path)
    column_names = ["vector", "item", "price"]

    def gen_data():
        # Ten single-row batches that all share the same three columns.
        for _ in range(10):
            vector_col = pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2))
            item_col = pa.array(["foo"])
            price_col = pa.array([10.0])
            yield pa.RecordBatch.from_arrays(
                [vector_col, item_col, price_col],
                column_names,
            )

    table = await db.create_table("test", data=gen_data())
    row_count = await table.count_rows()
    assert row_count == 10
def test_create_exist_ok(tmp_path):
db = lancedb.connect(tmp_path)
data = pd.DataFrame(

View File

@@ -51,6 +51,7 @@ class MockTable:
"refine_factor": query.refine_factor,
},
batch_size=batch_size,
offset=query.offset,
).to_reader()
@@ -106,6 +107,13 @@ def test_cast(table):
assert r0.float_field == 1.0
def test_offset(table):
    """An offset of 1 leaves one of the two rows the plain query returns."""
    baseline = LanceVectorQueryBuilder(table, [0, 0], "vector")
    baseline_rows = len(baseline.to_pandas())
    assert baseline_rows == 2

    shifted = LanceVectorQueryBuilder(table, [0, 0], "vector").offset(1)
    shifted_rows = len(shifted.to_pandas())
    assert shifted_rows == 1
def test_query_builder(table):
rs = (
LanceVectorQueryBuilder(table, [0, 0], "vector")
@@ -269,7 +277,10 @@ async def test_query_async(table_async: AsyncTable):
table_async.query().select({"foo": "id", "bar": "id + 1"}),
expected_columns=["foo", "bar"],
)
await check_query(table_async.query().limit(1), expected_num_rows=1)
await check_query(table_async.query().offset(1), expected_num_rows=1)
await check_query(
table_async.query().nearest_to(pa.array([1, 2])), expected_num_rows=2
)

View File

@@ -2,13 +2,13 @@
# SPDX-FileCopyrightText: Copyright The Lance Authors
import functools
import os
from copy import copy
from datetime import date, datetime, timedelta
from pathlib import Path
from time import sleep
from typing import List
from unittest.mock import PropertyMock, patch
import os
import lance
import lancedb
@@ -907,6 +907,16 @@ def test_hybrid_search(db, tmp_path):
"Our father who art in heaven", query_type="hybrid"
).to_pydantic(MyTable)
# Test that double and single quote characters are handled with phrase_query()
(
table.search(
'"Aren\'t you a little short for a stormtrooper?" -- Leia',
query_type="hybrid",
)
.phrase_query(True)
.to_pydantic(MyTable)
)
assert result1 == result3
# with post filters

View File

@@ -64,6 +64,10 @@ impl Query {
self.inner = self.inner.clone().limit(limit as usize);
}
pub fn offset(&mut self, offset: u32) {
    // The inner builder is consumed by `offset`, so work on a clone and
    // store the updated builder back into `self`.
    let skip = offset as usize;
    let updated = self.inner.clone().offset(skip);
    self.inner = updated;
}
pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<VectorQuery> {
let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
let array = make_array(data);
@@ -138,6 +142,10 @@ impl VectorQuery {
self.inner = self.inner.clone().limit(limit as usize);
}
pub fn offset(&mut self, offset: u32) {
    // The inner builder is consumed by `offset`, so work on a clone and
    // store the updated builder back into `self`.
    let skip = offset as usize;
    let updated = self.inner.clone().offset(skip);
    self.inner = updated;
}
pub fn column(&mut self, column: String) {
    // Apply the column name to a clone of the inner builder and keep the
    // result as the new inner builder.
    let updated = self.inner.clone().column(&column);
    self.inner = updated;
}

View File

@@ -391,7 +391,7 @@ impl JsTable {
materialize_deletions_threshold.value(&mut cx) as f32;
}
if let Some(num_threads) = js_options.get_opt::<JsNumber, _, _>(&mut cx, "numThreads")? {
options.num_threads = num_threads.value(&mut cx) as usize;
options.num_threads = Some(num_threads.value(&mut cx) as usize);
}
rt.spawn(async move {

View File

@@ -338,6 +338,12 @@ pub trait QueryBase {
/// it will default to 10.
fn limit(self, limit: usize) -> Self;
/// Set the offset of the query.
///
/// By default, it fetches starting with the first row.
/// This method can be used to skip the first `offset` rows.
///
/// Takes the builder by value and returns `Self`, so calls can be chained.
fn offset(self, offset: usize) -> Self;
/// Only return rows which match the filter.
///
/// The filter should be supplied as an SQL query string. For example:
@@ -408,6 +414,11 @@ impl<T: HasQuery> QueryBase for T {
self
}
fn offset(mut self, offset: usize) -> Self {
    // Record the offset on the underlying query, then hand the builder back
    // by value so that calls can be chained.
    let base = self.mut_query();
    base.offset = Some(offset);
    self
}
fn only_if(mut self, filter: impl AsRef<str>) -> Self {
self.mut_query().filter = Some(filter.as_ref().to_string());
self
@@ -520,6 +531,9 @@ pub struct Query {
/// limit the number of rows to return.
pub(crate) limit: Option<usize>,
/// Offset of the query.
pub(crate) offset: Option<usize>,
/// Apply filter to the returned rows.
pub(crate) filter: Option<String>,
@@ -541,6 +555,7 @@ impl Query {
Self {
parent,
limit: None,
offset: None,
filter: None,
full_text_search: None,
select: Select::All,
@@ -858,6 +873,7 @@ mod tests {
let query = table
.query()
.limit(100)
.offset(1)
.nearest_to(&[9.8, 8.7])
.unwrap()
.nprobes(1000)
@@ -870,6 +886,7 @@ mod tests {
new_vector
);
assert_eq!(query.base.limit.unwrap(), 100);
assert_eq!(query.base.offset.unwrap(), 1);
assert_eq!(query.nprobes, 1000);
assert!(query.use_index);
assert_eq!(query.distance_type, Some(DistanceType::Cosine));
@@ -916,10 +933,26 @@ mod tests {
let result = query.execute().await;
let mut stream = result.expect("should have result");
// should only have one batch
while let Some(batch) = stream.next().await {
// pre filter should return 10 rows
assert!(batch.expect("should be Ok").num_rows() == 10);
}
let query = table
.query()
.limit(10)
.offset(1)
.only_if(String::from("id % 2 == 0"))
.nearest_to(&[0.1; 4])
.unwrap();
let result = query.execute().await;
let mut stream = result.expect("should have result");
// should only have one batch
while let Some(batch) = stream.next().await {
// limit is 10 and offset is 1, so 9 rows are expected
assert!(batch.expect("should be Ok").num_rows() == 9);
}
}
#[tokio::test]

View File

@@ -1852,9 +1852,16 @@ impl TableInternal for NativeTable {
query_vector,
query.base.limit.unwrap_or(DEFAULT_TOP_K),
)?;
scanner.limit(
query.base.limit.map(|limit| limit as i64),
query.base.offset.map(|offset| offset as i64),
)?;
} else {
// If there is no vector query, it's ok to not have a limit
scanner.limit(query.base.limit.map(|limit| limit as i64), None)?;
scanner.limit(
query.base.limit.map(|limit| limit as i64),
query.base.offset.map(|offset| offset as i64),
)?;
}
scanner.nprobs(query.nprobes);
@@ -2781,7 +2788,7 @@ mod tests {
.get_index_type(index_uuid)
.await
.unwrap(),
Some("IVF".to_string())
Some("IVF_PQ".to_string())
);
assert_eq!(
table