mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-10 13:52:58 +00:00
Merge branch 'main' of https://github.com/lancedb/lancedb into yang/relative-lance-dep
This commit is contained in:
109
.github/workflows/java-publish.yml
vendored
Normal file
109
.github/workflows/java-publish.yml
vendored
Normal file
@@ -0,0 +1,109 @@
|
||||
name: Build and publish Java packages
|
||||
on:
|
||||
release:
|
||||
types: [released]
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/java-publish.yml
|
||||
|
||||
jobs:
|
||||
macos-arm64:
|
||||
name: Build on MacOS Arm64
|
||||
runs-on: macos-14
|
||||
timeout-minutes: 45
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./java/core/lancedb-jni
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
brew install protobuf
|
||||
- name: Build release
|
||||
run: |
|
||||
cargo build --release
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: liblancedb_jni_darwin_aarch64.zip
|
||||
path: target/release/liblancedb_jni.dylib
|
||||
retention-days: 1
|
||||
if-no-files-found: error
|
||||
linux-arm64:
|
||||
name: Build on Linux Arm64
|
||||
runs-on: warp-ubuntu-2204-arm64-8x
|
||||
timeout-minutes: 45
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./java/core/lancedb-jni
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
toolchain: "1.79.0"
|
||||
cache-workspaces: "./java/core/lancedb-jni"
|
||||
# Disable full debug symbol generation to speed up CI build and keep memory down
|
||||
# "1" means line tables only, which is useful for panic tracebacks.
|
||||
rustflags: "-C debuginfo=1"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt -y -qq update
|
||||
sudo apt install -y protobuf-compiler libssl-dev pkg-config
|
||||
- name: Build release
|
||||
run: |
|
||||
cargo build --release
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: liblancedb_jni_linux_aarch64.zip
|
||||
path: target/release/liblancedb_jni.so
|
||||
retention-days: 1
|
||||
if-no-files-found: error
|
||||
linux-x86:
|
||||
runs-on: warp-ubuntu-2204-x64-8x
|
||||
timeout-minutes: 30
|
||||
needs: [macos-arm64, linux-arm64]
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./java
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Set up Java 8
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
distribution: temurin
|
||||
java-version: 8
|
||||
cache: "maven"
|
||||
server-id: ossrh
|
||||
server-username: SONATYPE_USER
|
||||
server-password: SONATYPE_TOKEN
|
||||
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
|
||||
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt -y -qq update
|
||||
sudo apt install -y protobuf-compiler libssl-dev pkg-config
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@v4
|
||||
- name: Copy native libs
|
||||
run: |
|
||||
mkdir -p ./core/target/classes/nativelib/darwin-aarch64 ./core/target/classes/nativelib/linux-aarch64
|
||||
cp ../liblancedb_jni_darwin_aarch64.zip/liblancedb_jni.dylib ./core/target/classes/nativelib/darwin-aarch64/liblancedb_jni.dylib
|
||||
cp ../liblancedb_jni_linux_aarch64.zip/liblancedb_jni.so ./core/target/classes/nativelib/linux-aarch64/liblancedb_jni.so
|
||||
# Configure the git identity for this runner: user.email takes the address, user.name the display name.
|
||||
- name: Set github
|
||||
run: |
|
||||
git config --global user.email "dev+gha@lancedb.com"
|
||||
git config --global user.name "LanceDB Github Runner"
|
||||
- name: Publish with Java 8
|
||||
run: |
|
||||
echo "use-agent" >> ~/.gnupg/gpg.conf
|
||||
echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf
|
||||
export GPG_TTY=$(tty)
|
||||
mvn --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh
|
||||
env:
|
||||
SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
|
||||
SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
|
||||
@@ -26,6 +26,7 @@ theme:
|
||||
- content.code.copy
|
||||
- content.tabs.link
|
||||
- content.action.edit
|
||||
- content.tooltips
|
||||
- toc.follow
|
||||
- navigation.top
|
||||
- navigation.tabs
|
||||
@@ -35,6 +36,7 @@ theme:
|
||||
- navigation.instant
|
||||
icon:
|
||||
repo: fontawesome/brands/github
|
||||
annotation: material/arrow-right-circle
|
||||
custom_dir: overrides
|
||||
|
||||
plugins:
|
||||
@@ -76,7 +78,12 @@ markdown_extensions:
|
||||
- pymdownx.tabbed:
|
||||
alternate_style: true
|
||||
- md_in_html
|
||||
- abbr
|
||||
- attr_list
|
||||
- pymdownx.snippets
|
||||
- pymdownx.emoji:
|
||||
emoji_index: !!python/name:material.extensions.emoji.twemoji
|
||||
emoji_generator: !!python/name:material.extensions.emoji.to_svg
|
||||
|
||||
nav:
|
||||
- Home:
|
||||
|
||||
@@ -15,11 +15,13 @@ HNSW also combines this with the ideas behind a classic 1-dimensional search dat
|
||||
|
||||
## k-Nearest Neighbor Graphs and k-approximate Nearest neighbor Graphs
|
||||
The k-nearest neighbor graph actually predates its use for ANN search. Its construction is quite simple:
|
||||
|
||||
* Each vector in the dataset is given an associated vertex.
|
||||
* Each vertex has outgoing edges to its k nearest neighbors. That is, the k closest other vertices by Euclidean distance between the two corresponding vectors. This can be thought of as a "friend list" for the vertex.
|
||||
* For some applications (including nearest-neighbor search), the incoming edges are also added.
|
||||
|
||||
Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors:
|
||||
|
||||
* Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node).
|
||||
* Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to that vertex.
|
||||
* Repeat until a local optimum is found.
|
||||
@@ -36,15 +38,18 @@ One downside of k-NN and k-ANN graphs alone is that one must typically build the
|
||||
## HNSW: Hierarchical Navigable Small Worlds
|
||||
|
||||
HNSW builds on k-ANN in two main ways:
|
||||
|
||||
* Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing for the number of edges per vertex to be limited to a relatively small constant.
|
||||
* The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list.
|
||||
|
||||
This recursive structure can be thought of as separating into layers:
|
||||
|
||||
* At the bottom-most layer, a k-ANN graph on the whole dataset is present.
|
||||
* At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present.
|
||||
* At the Lth layer, a k-ANN graph is present. It is over a (constant) fraction (e.g. 10%) of the vectors/vertices present in the L-1th layer.
|
||||
|
||||
Then the greedy search routine operates as follows:
|
||||
|
||||
* At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer.
|
||||
* Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method.
|
||||
* Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. top 10).
|
||||
|
||||
@@ -17,7 +17,7 @@ from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
func = get_registry.get("imagebind").create()
|
||||
func = get_registry().get("imagebind").create()
|
||||
|
||||
class ImageBindModel(LanceModel):
|
||||
text: str
|
||||
|
||||
@@ -20,7 +20,7 @@ from lancedb.pydantic import LanceModel, Vector
|
||||
from lancedb.embeddings import get_registry
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
func = get_registry.get("open-clip").create()
|
||||
func = get_registry().get("open-clip").create()
|
||||
|
||||
class Images(LanceModel):
|
||||
label: str
|
||||
|
||||
@@ -4,13 +4,14 @@ Using cohere API requires cohere package, which can be installed using `pip inst
|
||||
You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API.
|
||||
|
||||
Supported models are:
|
||||
* embed-english-v3.0
|
||||
* embed-multilingual-v3.0
|
||||
* embed-english-light-v3.0
|
||||
* embed-multilingual-light-v3.0
|
||||
* embed-english-v2.0
|
||||
* embed-english-light-v2.0
|
||||
* embed-multilingual-v2.0
|
||||
|
||||
- embed-english-v3.0
|
||||
- embed-multilingual-v3.0
|
||||
- embed-english-light-v3.0
|
||||
- embed-multilingual-light-v3.0
|
||||
- embed-english-v2.0
|
||||
- embed-english-light-v2.0
|
||||
- embed-multilingual-v2.0
|
||||
|
||||
|
||||
Supported parameters (to be passed in `create` method) are:
|
||||
|
||||
@@ -1,30 +1,84 @@
|
||||
There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models.
|
||||
# 📚 Available Embedding Models
|
||||
|
||||
## Text embedding functions
|
||||
Contains the text embedding functions registered by default.
|
||||
There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models. 🚀
|
||||
|
||||
* Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff.
|
||||
* Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
|
||||
Before jumping into the list of available models, let's understand how to initialize and configure an embedding model for use in our code:
|
||||
|
||||
**Available Text Embeddings**:
|
||||
!!! example "Example usage"
|
||||
```python
|
||||
model = get_registry()
|
||||
.get("openai")
|
||||
.create(name="text-embedding-ada-002")
|
||||
```
|
||||
|
||||
- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md)
|
||||
- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md)
|
||||
- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md)
|
||||
- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md)
|
||||
- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md)
|
||||
- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md)
|
||||
- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md)
|
||||
- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md)
|
||||
- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md)
|
||||
- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md)
|
||||
Now let's understand the above syntax:
|
||||
```python
|
||||
model = get_registry().get("model_id").create(...params)
|
||||
```
|
||||
**This👆 line effectively creates a configured instance of an `embedding function` with `model` of choice that is ready for use.**
|
||||
|
||||
- `get_registry()` : This function call returns an instance of an `EmbeddingFunctionRegistry` object. This registry manages the registration and retrieval of embedding functions.
|
||||
|
||||
- `.get("model_id")` : This method is called on the registry object and retrieves the **embedding function** associated with the `"model_id"` (1) .
|
||||
{ .annotate }
|
||||
|
||||
1. Hover over the names in table below to find out the `model_id` of different embedding functions.
|
||||
|
||||
- `.create(...params)` : This method call is on the object returned by the `get` method. It instantiates an embedding model function using the **specified parameters**.
|
||||
|
||||
??? question "What parameters does the `.create(...params)` method accept?"
|
||||
**Check out the documentation of the specific embedding models (links in the table below👇) to learn which parameters each one takes**.
|
||||
|
||||
!!! tip "Moving on"
|
||||
Now that we know how to get the **desired embedding model** and use it in our code, let's explore the comprehensive **list** of embedding models **supported by LanceDB**, in the tables below.
|
||||
|
||||
## Text Embedding Functions 📝
|
||||
These functions are registered by default to handle text embeddings.
|
||||
|
||||
- 🔄 **Embedding functions** have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with **exponential backoff**.
|
||||
|
||||
- 🌕 Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
|
||||
|
||||
🌟 **Available Text Embeddings**
|
||||
|
||||
| **Embedding** :material-information-outline:{ title="Hover over the name to find out the model_id" } | **Description** | **Documentation** |
|
||||
|-----------|-------------|---------------|
|
||||
| [**Sentence Transformers**](available_embedding_models/text_embedding_functions/sentence_transformers.md "sentence-transformers") | 🧠 **SentenceTransformers** is a Python framework for state-of-the-art sentence, text, and image embeddings. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/sbert_2.png" alt="Sentence Transformers Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/sentence_transformers.md)|
|
||||
| [**Huggingface Models**](available_embedding_models/text_embedding_functions/huggingface_embedding.md "huggingface") |🤗 We offer support for all **Huggingface** models. The default model is `colbert-ir/colbertv2.0`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/hugging_face.png" alt="Huggingface Icon" width="130" height="35">](available_embedding_models/text_embedding_functions/huggingface_embedding.md) |
|
||||
| [**Ollama Embeddings**](available_embedding_models/text_embedding_functions/ollama_embedding.md "ollama") | 🔍 Generate embeddings via the **Ollama** python library. Ollama supports embedding models, making it possible to build RAG apps. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/Ollama.png" alt="Ollama Icon" width="110" height="35">](available_embedding_models/text_embedding_functions/ollama_embedding.md)|
|
||||
| [**OpenAI Embeddings**](available_embedding_models/text_embedding_functions/openai_embedding.md "openai")| 🔑 **OpenAI’s** text embeddings measure the relatedness of text strings. **LanceDB** supports state-of-the-art embeddings from OpenAI. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/openai.png" alt="OpenAI Icon" width="100" height="35">](available_embedding_models/text_embedding_functions/openai_embedding.md)|
|
||||
| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | 📚 **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/instructor_embedding.png" alt="Instructor Embedding Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/instructor_embedding.md) |
|
||||
| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | 🌌 Google’s Gemini API generates state-of-the-art embeddings for words, phrases, and sentences. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/gemini.png" alt="Gemini Icon" width="95" height="35">](available_embedding_models/text_embedding_functions/gemini_embedding.md) |
|
||||
| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | 💬 This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/cohere.png" alt="Cohere Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/cohere_embedding.md) |
|
||||
| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |
|
||||
| [**AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to set up the AWS credentials to use this embedding function. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/aws_bedrock.png" alt="AWS Bedrock Icon" width="120" height="35">](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |
|
||||
| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/watsonx.png" alt="Watsonx Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
|
||||
|
||||
|
||||
## Multi-modal embedding functions
|
||||
Multi-modal embedding functions allow you to query your table using both images and text.
|
||||
|
||||
**Available Multi-modal Embeddings** :
|
||||
[st-key]: "sentence-transformers"
|
||||
[hf-key]: "huggingface"
|
||||
[ollama-key]: "ollama"
|
||||
[openai-key]: "openai"
|
||||
[instructor-key]: "instructor"
|
||||
[gemini-key]: "gemini-text"
|
||||
[cohere-key]: "cohere"
|
||||
[jina-key]: "jina"
|
||||
[aws-key]: "bedrock-text"
|
||||
[watsonx-key]: "watsonx"
|
||||
|
||||
- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md)
|
||||
- [Imagebind Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)
|
||||
- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md)
|
||||
|
||||
## Multi-modal Embedding Functions🖼️
|
||||
|
||||
Multi-modal embedding functions allow you to query your table using both images and text. 💬🖼️
|
||||
|
||||
🌐 **Available Multi-modal Embeddings**
|
||||
|
||||
| Embedding :material-information-outline:{ title="Hover over the name to find out the model_id" } | Description | Documentation |
|
||||
|-----------|-------------|---------------|
|
||||
| [**OpenClip Embeddings**](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md "open-clip") | 🎨 We support CLIP model embeddings using the open source alternative, **open-clip** which supports various customizations. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/openclip_github.png" alt="openclip Icon" width="150" height="35">](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) |
|
||||
| [**Imagebind Embeddings**](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md "imagebind") | 🌌 We have support for **imagebind model embeddings**. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/imagebind_meta.png" alt="imagebind Icon" width="150" height="35">](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)|
|
||||
| [**Jina Multi-modal Embeddings**](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md "jina") | 🔗 **Jina embeddings** can also be used to embed both **text** and **image** data, only some of the models support image data and you can check the detailed documentation. 👉 | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="jina Icon" width="90" height="35">](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) |
|
||||
|
||||
!!! note
|
||||
If you'd like to request support for additional **embedding functions**, please feel free to open an issue on our LanceDB [GitHub issue page](https://github.com/lancedb/lancedb/issues).
|
||||
@@ -416,7 +416,6 @@ You can create an empty table for scenarios where you want to add data to the ta
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
|
||||
An empty table can be initialized via a PyArrow schema.
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.1-SNAPSHOT</version>
|
||||
<version>0.0.3</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
@@ -68,7 +68,7 @@
|
||||
</goals>
|
||||
<configuration>
|
||||
<path>lancedb-jni</path>
|
||||
<!--<release>true</release>-->
|
||||
<release>true</release>
|
||||
<!-- Copy native libraries to target/classes for runtime access -->
|
||||
<copyTo>${project.build.directory}/classes/nativelib</copyTo>
|
||||
<copyWithPlatformDir>true</copyWithPlatformDir>
|
||||
|
||||
142
java/pom.xml
142
java/pom.xml
@@ -6,15 +6,28 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.1-SNAPSHOT</version>
|
||||
<version>0.0.3</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>Lance Parent</name>
|
||||
<name>LanceDB Parent</name>
|
||||
<description>LanceDB vector database Java API</description>
|
||||
<url>http://lancedb.com/</url>
|
||||
|
||||
<developers>
|
||||
<developer>
|
||||
<name>Lance DB Dev Group</name>
|
||||
<email>dev@lancedb.com</email>
|
||||
</developer>
|
||||
</developers>
|
||||
<licenses>
|
||||
<license>
|
||||
<name>The Apache Software License, Version 2.0</name>
|
||||
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
|
||||
</license>
|
||||
</licenses>
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<maven.compiler.source>11</maven.compiler.source>
|
||||
<maven.compiler.target>11</maven.compiler.target>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
</properties>
|
||||
|
||||
@@ -22,6 +35,12 @@
|
||||
<module>core</module>
|
||||
</modules>
|
||||
|
||||
<scm>
|
||||
<connection>scm:git:https://github.com/lancedb/lancedb.git</connection>
|
||||
<developerConnection>scm:git:ssh://git@github.com/lancedb/lancedb.git</developerConnection>
|
||||
<url>https://github.com/lancedb/lancedb</url>
|
||||
</scm>
|
||||
|
||||
<dependencyManagement>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
@@ -62,8 +81,45 @@
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
<distributionManagement>
|
||||
<snapshotRepository>
|
||||
<id>ossrh</id>
|
||||
<url>https://s01.oss.sonatype.org/content/repositories/snapshots</url>
|
||||
</snapshotRepository>
|
||||
<repository>
|
||||
<id>ossrh</id>
|
||||
<url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
|
||||
</repository>
|
||||
</distributionManagement>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>2.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>2.9.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-checkstyle-plugin</artifactId>
|
||||
@@ -126,4 +182,82 @@
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>jdk8</id>
|
||||
<activation>
|
||||
<jdk>[1.8,1.8.999]</jdk>
|
||||
</activation>
|
||||
<properties>
|
||||
<maven.compiler.source>1.8</maven.compiler.source>
|
||||
<maven.compiler.target>1.8</maven.compiler.target>
|
||||
</properties>
|
||||
</profile>
|
||||
<profile>
|
||||
<id>jdk11+</id>
|
||||
<activation>
|
||||
<jdk>[11,)</jdk>
|
||||
</activation>
|
||||
<properties>
|
||||
<maven.compiler.source>11</maven.compiler.source>
|
||||
<maven.compiler.target>11</maven.compiler.target>
|
||||
</properties>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<version>3.2.5</version>
|
||||
<configuration>
|
||||
<argLine>--add-opens=java.base/java.nio=ALL-UNNAMED</argLine>
|
||||
<forkNode implementation="org.apache.maven.plugin.surefire.extensions.SurefireForkNodeFactory" />
|
||||
<useSystemClassLoader>false</useSystemClassLoader>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
<profile>
|
||||
<id>deploy-to-ossrh</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.sonatype.central</groupId>
|
||||
<artifactId>central-publishing-maven-plugin</artifactId>
|
||||
<version>0.4.0</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<publishingServerId>ossrh</publishingServerId>
|
||||
<tokenAuth>true</tokenAuth>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.sonatype.plugins</groupId>
|
||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||
<version>1.6.13</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<serverId>ossrh</serverId>
|
||||
<nexusUrl>https://s01.oss.sonatype.org/</nexusUrl>
|
||||
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-gpg-plugin</artifactId>
|
||||
<version>1.5</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>sign-artifacts</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>sign</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
||||
@@ -3,7 +3,7 @@ name = "lancedb"
|
||||
# version in Cargo.toml
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.17.0-beta.2",
|
||||
"pylance==0.17.0",
|
||||
"ratelimiter~=1.0",
|
||||
"requests>=2.31.0",
|
||||
"retry>=0.9.2",
|
||||
|
||||
@@ -73,6 +73,7 @@ class Query:
|
||||
def where(self, filter: str): ...
|
||||
def select(self, columns: Tuple[str, str]): ...
|
||||
def limit(self, limit: int): ...
|
||||
def offset(self, offset: int): ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
|
||||
def nearest_to_text(self, query: dict) -> Query: ...
|
||||
async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
|
||||
@@ -83,6 +84,7 @@ class VectorQuery:
|
||||
def select(self, columns: List[str]): ...
|
||||
def select_with_projection(self, columns: Tuple[str, str]): ...
|
||||
def limit(self, limit: int): ...
|
||||
def offset(self, offset: int): ...
|
||||
def column(self, column: str): ...
|
||||
def distance_type(self, distance_type: str): ...
|
||||
def postfilter(self): ...
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
import os
|
||||
from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
@@ -27,8 +26,13 @@ from pyarrow import fs
|
||||
from lancedb.common import data_to_reader, validate_schema
|
||||
|
||||
from ._lancedb import connect as lancedb_connect
|
||||
from .pydantic import LanceModel
|
||||
from .table import AsyncTable, LanceTable, Table, _sanitize_data, _table_path
|
||||
from .table import (
|
||||
AsyncTable,
|
||||
LanceTable,
|
||||
Table,
|
||||
_table_path,
|
||||
sanitize_create_table,
|
||||
)
|
||||
from .util import (
|
||||
fs_from_uri,
|
||||
get_uri_location,
|
||||
@@ -37,6 +41,7 @@ from .util import (
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .pydantic import LanceModel
|
||||
from datetime import timedelta
|
||||
|
||||
from ._lancedb import Connection as LanceDbConnection
|
||||
@@ -722,12 +727,6 @@ class AsyncConnection(object):
|
||||
... await db.create_table("table4", make_batches(), schema=schema)
|
||||
>>> asyncio.run(iterable_example())
|
||||
"""
|
||||
if inspect.isclass(schema) and issubclass(schema, LanceModel):
|
||||
# convert LanceModel to pyarrow schema
|
||||
# note that it's possible this contains
|
||||
# embedding function metadata already
|
||||
schema = schema.to_arrow_schema()
|
||||
|
||||
metadata = None
|
||||
|
||||
# Defining defaults here and not in function prototype. In the future
|
||||
@@ -738,31 +737,9 @@ class AsyncConnection(object):
|
||||
if fill_value is None:
|
||||
fill_value = 0.0
|
||||
|
||||
if data is not None:
|
||||
data, schema = _sanitize_data(
|
||||
data,
|
||||
schema,
|
||||
metadata=metadata,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
|
||||
if schema is None:
|
||||
if data is None:
|
||||
raise ValueError("Either data or schema must be provided")
|
||||
elif hasattr(data, "schema"):
|
||||
schema = data.schema
|
||||
elif isinstance(data, Iterable):
|
||||
if metadata:
|
||||
raise TypeError(
|
||||
(
|
||||
"Persistent embedding functions not yet "
|
||||
"supported for generator data input"
|
||||
)
|
||||
)
|
||||
|
||||
if metadata:
|
||||
schema = schema.with_metadata(metadata)
|
||||
data, schema = sanitize_create_table(
|
||||
data, schema, metadata, on_bad_vectors, fill_value
|
||||
)
|
||||
validate_schema(schema)
|
||||
|
||||
if exist_ok is None:
|
||||
|
||||
@@ -42,9 +42,9 @@ if TYPE_CHECKING:
|
||||
import PIL
|
||||
import polars as pl
|
||||
|
||||
from .common import VEC
|
||||
from ._lancedb import Query as LanceQuery
|
||||
from ._lancedb import VectorQuery as LanceVectorQuery
|
||||
from .common import VEC
|
||||
from .pydantic import LanceModel
|
||||
from .table import Table
|
||||
|
||||
@@ -85,6 +85,8 @@ class Query(pydantic.BaseModel):
|
||||
|
||||
- See discussion in [Querying an ANN Index][querying-an-ann-index] for
|
||||
tuning advice.
|
||||
offset: int
|
||||
The offset to start fetching results from
|
||||
"""
|
||||
|
||||
vector_column: Optional[str] = None
|
||||
@@ -119,6 +121,8 @@ class Query(pydantic.BaseModel):
|
||||
|
||||
with_row_id: bool = False
|
||||
|
||||
offset: int = 0
|
||||
|
||||
|
||||
class LanceQueryBuilder(ABC):
|
||||
"""An abstract query builder. Subclasses are defined for vector search,
|
||||
@@ -233,6 +237,7 @@ class LanceQueryBuilder(ABC):
|
||||
def __init__(self, table: "Table"):
|
||||
self._table = table
|
||||
self._limit = 10
|
||||
self._offset = 0
|
||||
self._columns = None
|
||||
self._where = None
|
||||
self._prefilter = False
|
||||
@@ -371,6 +376,25 @@ class LanceQueryBuilder(ABC):
|
||||
self._limit = limit
|
||||
return self
|
||||
|
||||
def offset(self, offset: int) -> LanceQueryBuilder:
    """Choose how many leading rows of the result set to skip.

    Parameters
    ----------
    offset: int
        The offset to start fetching results from. ``None`` and
        non-positive values are stored as ``0`` (no rows skipped).

    Returns
    -------
    LanceQueryBuilder
        The LanceQueryBuilder object.
    """
    # Anything that is missing or not strictly positive means "start at row 0".
    start_from_first_row = offset is None or offset <= 0
    self._offset = 0 if start_from_first_row else offset
    return self
|
||||
|
||||
def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
|
||||
"""Set the columns to return.
|
||||
|
||||
@@ -649,6 +673,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
refine_factor=self._refine_factor,
|
||||
vector_column=self._vector_column,
|
||||
with_row_id=self._with_row_id,
|
||||
offset=self._offset,
|
||||
)
|
||||
result_set = self._table._execute_query(query, batch_size)
|
||||
if self._reranker is not None:
|
||||
@@ -780,6 +805,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
"columns": self._fts_columns,
|
||||
},
|
||||
vector=[],
|
||||
offset=self._offset,
|
||||
)
|
||||
results = self._table._execute_query(query)
|
||||
results = results.read_all()
|
||||
@@ -826,7 +852,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
)
|
||||
if len(row_ids) == 0:
|
||||
empty_schema = pa.schema([pa.field("_score", pa.float32())])
|
||||
return pa.Table.from_pylist([], schema=empty_schema)
|
||||
return pa.Table.from_batches([], schema=empty_schema)
|
||||
scores = pa.array(scores)
|
||||
output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
|
||||
output_tbl = output_tbl.append_column("_score", scores)
|
||||
@@ -939,6 +965,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
self._reranker = RRFReranker()
|
||||
self._nprobes = None
|
||||
self._refine_factor = None
|
||||
self._phrase_query = False
|
||||
|
||||
def _validate_query(self, query, vector=None, text=None):
|
||||
if query is not None and (vector is not None or text is not None):
|
||||
@@ -960,6 +987,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
|
||||
return vector_query, text_query
|
||||
|
||||
def phrase_query(self, phrase_query: bool = True) -> LanceHybridQueryBuilder:
    """Toggle phrase matching for this hybrid query.

    Parameters
    ----------
    phrase_query: bool, default True
        If True, then the query will be wrapped in quotes and
        double quotes replaced by single quotes.

    Returns
    -------
    LanceHybridQueryBuilder
        The LanceHybridQueryBuilder object.
    """
    # Only the flag is recorded here; it is read back later by the builder.
    use_phrase = phrase_query
    self._phrase_query = use_phrase
    return self
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
vector_query, fts_query = self._validate_query(
|
||||
self._query, self._vector, self._text
|
||||
@@ -986,6 +1030,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
|
||||
if self._with_row_id:
|
||||
self._vector_query.with_row_id(True)
|
||||
self._fts_query.with_row_id(True)
|
||||
if self._phrase_query:
|
||||
self._fts_query.phrase_query(True)
|
||||
if self._nprobes:
|
||||
self._vector_query.nprobes(self._nprobes)
|
||||
if self._refine_factor:
|
||||
@@ -1220,6 +1266,18 @@ class AsyncQueryBase(object):
|
||||
self._inner.limit(limit)
|
||||
return self
|
||||
|
||||
def offset(self, offset: int) -> AsyncQuery:
    """
    Set the offset for the results.

    The value is forwarded unchanged to the wrapped inner query object.

    Parameters
    ----------
    offset: int
        The offset to start fetching results from.
    """
    inner_query = self._inner
    inner_query.offset(offset)
    return self
|
||||
|
||||
async def to_batches(
|
||||
self, *, max_batch_length: Optional[int] = None
|
||||
) -> AsyncRecordBatchReader:
|
||||
|
||||
@@ -11,12 +11,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Iterable, Union
|
||||
import pyarrow as pa
|
||||
|
||||
|
||||
def to_ipc_binary(table: pa.Table) -> bytes:
|
||||
def to_ipc_binary(table: Union[pa.Table, Iterable[pa.RecordBatch]]) -> bytes:
    """Serialize a PyArrow Table, or an iterable of record batches, to IPC bytes.

    Parameters
    ----------
    table: pa.Table or Iterable[pa.RecordBatch]
        The data to serialize. Anything that is not already a ``pa.Table``
        is collected into one with ``pa.Table.from_batches``.

    Returns
    -------
    bytes
        The contents of an Arrow IPC stream holding the table.
    """
    # Dispatch on the concrete Table type rather than on "is it iterable":
    # the iterable check only works as long as pa.Table itself never
    # registers as an Iterable.
    if not isinstance(table, pa.Table):
        table = pa.Table.from_batches(table)

    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue().to_pybytes()
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
@@ -26,7 +25,7 @@ from ..common import DATA
|
||||
from ..db import DBConnection
|
||||
from ..embeddings import EmbeddingFunctionConfig
|
||||
from ..pydantic import LanceModel
|
||||
from ..table import Table, _sanitize_data
|
||||
from ..table import Table, sanitize_create_table
|
||||
from ..util import validate_table_name
|
||||
from .arrow import to_ipc_binary
|
||||
from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
|
||||
@@ -228,8 +227,6 @@ class RemoteDBConnection(DBConnection):
|
||||
|
||||
"""
|
||||
validate_table_name(name)
|
||||
if data is None and schema is None:
|
||||
raise ValueError("Either data or schema must be provided.")
|
||||
if embedding_functions is not None:
|
||||
logging.warning(
|
||||
"embedding_functions is not yet supported on LanceDB Cloud."
|
||||
@@ -239,24 +236,9 @@ class RemoteDBConnection(DBConnection):
|
||||
if mode is not None:
|
||||
logging.warning("mode is not yet supported on LanceDB Cloud.")
|
||||
|
||||
if inspect.isclass(schema) and issubclass(schema, LanceModel):
|
||||
# convert LanceModel to pyarrow schema
|
||||
# note that it's possible this contains
|
||||
# embedding function metadata already
|
||||
schema = schema.to_arrow_schema()
|
||||
|
||||
if data is not None:
|
||||
data, schema = _sanitize_data(
|
||||
data,
|
||||
schema,
|
||||
metadata=None,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
else:
|
||||
if schema is None:
|
||||
raise ValueError("Either data or schema must be provided")
|
||||
data = pa.Table.from_pylist([], schema=schema)
|
||||
data, schema = sanitize_create_table(
|
||||
data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
||||
)
|
||||
|
||||
from .table import RemoteTable
|
||||
|
||||
|
||||
@@ -117,15 +117,50 @@ def _sanitize_data(
|
||||
data = _sanitize_schema(
|
||||
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
||||
)
|
||||
if schema is None:
|
||||
schema = data.schema
|
||||
elif isinstance(data, Iterable):
|
||||
data = _to_record_batch_generator(
|
||||
data, schema, metadata, on_bad_vectors, fill_value
|
||||
)
|
||||
if schema is None:
|
||||
data, schema = _generator_to_data_and_schema(data)
|
||||
if schema is None:
|
||||
raise ValueError("Cannot infer schema from generator data")
|
||||
else:
|
||||
raise TypeError(f"Unsupported data type: {type(data)}")
|
||||
return data, schema
|
||||
|
||||
|
||||
def sanitize_create_table(
    data, schema, metadata=None, on_bad_vectors="error", fill_value=0.0
):
    """Normalize the ``data`` / ``schema`` pair given to a create-table call.

    A ``LanceModel`` subclass passed as ``schema`` is converted with
    ``to_arrow_schema()``. When ``data`` is given it is run through
    ``_sanitize_data``; when no schema is known afterwards it is taken from
    ``data.schema`` if that attribute exists. If ``metadata`` is truthy the
    schema is replaced by ``schema.with_metadata(metadata)``.

    Returns
    -------
    tuple
        The ``(data, schema)`` pair after sanitization.

    Raises
    ------
    ValueError
        If both ``data`` and ``schema`` are None.
    """
    schema_is_model = inspect.isclass(schema) and issubclass(schema, LanceModel)
    if schema_is_model:
        # convert LanceModel to pyarrow schema
        # note that it's possible this contains
        # embedding function metadata already
        schema = schema.to_arrow_schema()

    if data is None:
        if schema is None:
            raise ValueError("Either data or schema must be provided")
    else:
        data, schema = _sanitize_data(
            data,
            schema,
            metadata=metadata,
            on_bad_vectors=on_bad_vectors,
            fill_value=fill_value,
        )
        # Fall back to the schema carried by the sanitized data, if any.
        if schema is None and hasattr(data, "schema"):
            schema = data.schema

    if metadata:
        schema = schema.with_metadata(metadata)

    return data, schema
|
||||
|
||||
|
||||
def _schema_from_hf(data, schema):
|
||||
"""
|
||||
Extract pyarrow schema from HuggingFace DatasetDict
|
||||
@@ -187,8 +222,30 @@ def _append_vector_col(data: pa.Table, metadata: dict, schema: Optional[pa.Schem
|
||||
return data
|
||||
|
||||
|
||||
def _generator_to_data_and_schema(
|
||||
data: Iterable,
|
||||
) -> Tuple[Iterable[pa.RecordBatch], pa.Schema]:
|
||||
def _with_first_generator(first, data):
|
||||
yield first
|
||||
yield from data
|
||||
|
||||
first = next(data, None)
|
||||
schema = None
|
||||
if isinstance(first, pa.RecordBatch):
|
||||
schema = first.schema
|
||||
data = _with_first_generator(first, data)
|
||||
elif isinstance(first, pa.Table):
|
||||
schema = first.schema
|
||||
data = _with_first_generator(first.to_batches(), data)
|
||||
return data, schema
|
||||
|
||||
|
||||
def _to_record_batch_generator(
|
||||
data: Iterable, schema, metadata, on_bad_vectors, fill_value
|
||||
data: Iterable,
|
||||
schema,
|
||||
metadata,
|
||||
on_bad_vectors,
|
||||
fill_value,
|
||||
):
|
||||
for batch in data:
|
||||
# always convert to table because we need to sanitize the data
|
||||
@@ -1569,12 +1626,6 @@ class LanceTable(Table):
|
||||
The embedding functions to use when creating the table.
|
||||
"""
|
||||
tbl = LanceTable(db, name)
|
||||
if inspect.isclass(schema) and issubclass(schema, LanceModel):
|
||||
# convert LanceModel to pyarrow schema
|
||||
# note that it's possible this contains
|
||||
# embedding function metadata already
|
||||
schema = schema.to_arrow_schema()
|
||||
|
||||
metadata = None
|
||||
if embedding_functions is not None:
|
||||
# If we passed in embedding functions explicitly
|
||||
@@ -1583,33 +1634,11 @@ class LanceTable(Table):
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
metadata = registry.get_table_metadata(embedding_functions)
|
||||
|
||||
if data is not None:
|
||||
data, schema = _sanitize_data(
|
||||
data,
|
||||
schema,
|
||||
metadata=metadata,
|
||||
on_bad_vectors=on_bad_vectors,
|
||||
fill_value=fill_value,
|
||||
)
|
||||
data, schema = sanitize_create_table(
|
||||
data, schema, metadata, on_bad_vectors, fill_value
|
||||
)
|
||||
|
||||
if schema is None:
|
||||
if data is None:
|
||||
raise ValueError("Either data or schema must be provided")
|
||||
elif hasattr(data, "schema"):
|
||||
schema = data.schema
|
||||
elif isinstance(data, Iterable):
|
||||
if metadata:
|
||||
raise TypeError(
|
||||
(
|
||||
"Persistent embedding functions not yet "
|
||||
"supported for generator data input"
|
||||
)
|
||||
)
|
||||
|
||||
if metadata:
|
||||
schema = schema.with_metadata(metadata)
|
||||
|
||||
empty = pa.Table.from_pylist([], schema=schema)
|
||||
empty = pa.Table.from_batches([], schema=schema)
|
||||
try:
|
||||
lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
|
||||
except OSError as err:
|
||||
@@ -1708,6 +1737,7 @@ class LanceTable(Table):
|
||||
full_text_query=query.full_text_query,
|
||||
with_row_id=query.with_row_id,
|
||||
batch_size=batch_size,
|
||||
offset=query.offset,
|
||||
).to_reader()
|
||||
|
||||
def _do_merge(
|
||||
|
||||
@@ -233,6 +233,43 @@ def test_create_mode(tmp_path):
|
||||
assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]
|
||||
|
||||
|
||||
def test_create_table_from_iterator(tmp_path):
    """A table created from a generator of record batches holds every row."""
    column_names = ["vector", "item", "price"]

    def make_batch():
        columns = [
            pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)),
            pa.array(["foo"]),
            pa.array([10.0]),
        ]
        return pa.RecordBatch.from_arrays(columns, column_names)

    def gen_data():
        # Ten single-row batches.
        remaining = 10
        while remaining > 0:
            yield make_batch()
            remaining -= 1

    db = lancedb.connect(tmp_path)
    table = db.create_table("test", data=gen_data())
    assert table.count_rows() == 10
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_create_table_from_iterator_async(tmp_path):
    """Async variant: a generator of record batches is fully ingested."""
    column_names = ["vector", "item", "price"]

    def make_batch():
        columns = [
            pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)),
            pa.array(["foo"]),
            pa.array([10.0]),
        ]
        return pa.RecordBatch.from_arrays(columns, column_names)

    def gen_data():
        # Ten single-row batches.
        remaining = 10
        while remaining > 0:
            yield make_batch()
            remaining -= 1

    db = await lancedb.connect_async(tmp_path)
    table = await db.create_table("test", data=gen_data())
    assert await table.count_rows() == 10
|
||||
|
||||
|
||||
def test_create_exist_ok(tmp_path):
|
||||
db = lancedb.connect(tmp_path)
|
||||
data = pd.DataFrame(
|
||||
|
||||
@@ -51,6 +51,7 @@ class MockTable:
|
||||
"refine_factor": query.refine_factor,
|
||||
},
|
||||
batch_size=batch_size,
|
||||
offset=query.offset,
|
||||
).to_reader()
|
||||
|
||||
|
||||
@@ -106,6 +107,13 @@ def test_cast(table):
|
||||
assert r0.float_field == 1.0
|
||||
|
||||
|
||||
def test_offset(table):
    """An offset of 1 drops one of the two rows the plain query returns."""
    baseline = LanceVectorQueryBuilder(table, [0, 0], "vector")
    assert len(baseline.to_pandas()) == 2

    skipped_one = LanceVectorQueryBuilder(table, [0, 0], "vector").offset(1)
    assert len(skipped_one.to_pandas()) == 1
|
||||
|
||||
|
||||
def test_query_builder(table):
|
||||
rs = (
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
@@ -269,7 +277,10 @@ async def test_query_async(table_async: AsyncTable):
|
||||
table_async.query().select({"foo": "id", "bar": "id + 1"}),
|
||||
expected_columns=["foo", "bar"],
|
||||
)
|
||||
|
||||
await check_query(table_async.query().limit(1), expected_num_rows=1)
|
||||
await check_query(table_async.query().offset(1), expected_num_rows=1)
|
||||
|
||||
await check_query(
|
||||
table_async.query().nearest_to(pa.array([1, 2])), expected_num_rows=2
|
||||
)
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
# SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
import functools
|
||||
import os
|
||||
from copy import copy
|
||||
from datetime import date, datetime, timedelta
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
from typing import List
|
||||
from unittest.mock import PropertyMock, patch
|
||||
import os
|
||||
|
||||
import lance
|
||||
import lancedb
|
||||
@@ -907,6 +907,16 @@ def test_hybrid_search(db, tmp_path):
|
||||
"Our father who art in heaven", query_type="hybrid"
|
||||
).to_pydantic(MyTable)
|
||||
|
||||
# Test that double and single quote characters are handled with phrase_query()
|
||||
(
|
||||
table.search(
|
||||
'"Aren\'t you a little short for a stormtrooper?" -- Leia',
|
||||
query_type="hybrid",
|
||||
)
|
||||
.phrase_query(True)
|
||||
.to_pydantic(MyTable)
|
||||
)
|
||||
|
||||
assert result1 == result3
|
||||
|
||||
# with post filters
|
||||
|
||||
@@ -64,6 +64,10 @@ impl Query {
|
||||
self.inner = self.inner.clone().limit(limit as usize);
|
||||
}
|
||||
|
||||
pub fn offset(&mut self, offset: u32) {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<VectorQuery> {
|
||||
let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
|
||||
let array = make_array(data);
|
||||
@@ -138,6 +142,10 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().limit(limit as usize);
|
||||
}
|
||||
|
||||
pub fn offset(&mut self, offset: u32) {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn column(&mut self, column: String) {
|
||||
self.inner = self.inner.clone().column(&column);
|
||||
}
|
||||
|
||||
@@ -391,7 +391,7 @@ impl JsTable {
|
||||
materialize_deletions_threshold.value(&mut cx) as f32;
|
||||
}
|
||||
if let Some(num_threads) = js_options.get_opt::<JsNumber, _, _>(&mut cx, "numThreads")? {
|
||||
options.num_threads = num_threads.value(&mut cx) as usize;
|
||||
options.num_threads = Some(num_threads.value(&mut cx) as usize);
|
||||
}
|
||||
|
||||
rt.spawn(async move {
|
||||
|
||||
@@ -338,6 +338,12 @@ pub trait QueryBase {
|
||||
/// it will default to 10.
|
||||
fn limit(self, limit: usize) -> Self;
|
||||
|
||||
/// Set the offset of the query.
///
/// By default, it fetches starting with the first row.
/// This method can be used to skip the first `offset` rows.
fn offset(self, offset: usize) -> Self;
|
||||
|
||||
/// Only return rows which match the filter.
|
||||
///
|
||||
/// The filter should be supplied as an SQL query string. For example:
|
||||
@@ -408,6 +414,11 @@ impl<T: HasQuery> QueryBase for T {
|
||||
self
|
||||
}
|
||||
|
||||
fn offset(mut self, offset: usize) -> Self {
    // Record the requested offset on the underlying query, then hand the builder back.
    let base_query = self.mut_query();
    base_query.offset = offset.into();
    self
}
|
||||
|
||||
fn only_if(mut self, filter: impl AsRef<str>) -> Self {
|
||||
self.mut_query().filter = Some(filter.as_ref().to_string());
|
||||
self
|
||||
@@ -520,6 +531,9 @@ pub struct Query {
|
||||
/// limit the number of rows to return.
|
||||
pub(crate) limit: Option<usize>,
|
||||
|
||||
/// Offset of the query.
|
||||
pub(crate) offset: Option<usize>,
|
||||
|
||||
/// Apply filter to the returned rows.
|
||||
pub(crate) filter: Option<String>,
|
||||
|
||||
@@ -541,6 +555,7 @@ impl Query {
|
||||
Self {
|
||||
parent,
|
||||
limit: None,
|
||||
offset: None,
|
||||
filter: None,
|
||||
full_text_search: None,
|
||||
select: Select::All,
|
||||
@@ -858,6 +873,7 @@ mod tests {
|
||||
let query = table
|
||||
.query()
|
||||
.limit(100)
|
||||
.offset(1)
|
||||
.nearest_to(&[9.8, 8.7])
|
||||
.unwrap()
|
||||
.nprobes(1000)
|
||||
@@ -870,6 +886,7 @@ mod tests {
|
||||
new_vector
|
||||
);
|
||||
assert_eq!(query.base.limit.unwrap(), 100);
|
||||
assert_eq!(query.base.offset.unwrap(), 1);
|
||||
assert_eq!(query.nprobes, 1000);
|
||||
assert!(query.use_index);
|
||||
assert_eq!(query.distance_type, Some(DistanceType::Cosine));
|
||||
@@ -916,10 +933,26 @@ mod tests {
|
||||
let result = query.execute().await;
|
||||
let mut stream = result.expect("should have result");
|
||||
// should only have one batch
|
||||
|
||||
while let Some(batch) = stream.next().await {
|
||||
// pre filter should return 10 rows
|
||||
assert!(batch.expect("should be Ok").num_rows() == 10);
|
||||
}
|
||||
|
||||
let query = table
|
||||
.query()
|
||||
.limit(10)
|
||||
.offset(1)
|
||||
.only_if(String::from("id % 2 == 0"))
|
||||
.nearest_to(&[0.1; 4])
|
||||
.unwrap();
|
||||
let result = query.execute().await;
|
||||
let mut stream = result.expect("should have result");
|
||||
// should only have one batch
|
||||
while let Some(batch) = stream.next().await {
|
||||
// with offset(1) applied to the 10-row limit, only 9 rows should remain
|
||||
assert!(batch.expect("should be Ok").num_rows() == 9);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -1852,9 +1852,16 @@ impl TableInternal for NativeTable {
|
||||
query_vector,
|
||||
query.base.limit.unwrap_or(DEFAULT_TOP_K),
|
||||
)?;
|
||||
scanner.limit(
|
||||
query.base.limit.map(|limit| limit as i64),
|
||||
query.base.offset.map(|offset| offset as i64),
|
||||
)?;
|
||||
} else {
|
||||
// If there is no vector query, it's ok to not have a limit
|
||||
scanner.limit(query.base.limit.map(|limit| limit as i64), None)?;
|
||||
scanner.limit(
|
||||
query.base.limit.map(|limit| limit as i64),
|
||||
query.base.offset.map(|offset| offset as i64),
|
||||
)?;
|
||||
}
|
||||
|
||||
scanner.nprobs(query.nprobes);
|
||||
@@ -2781,7 +2788,7 @@ mod tests {
|
||||
.get_index_type(index_uuid)
|
||||
.await
|
||||
.unwrap(),
|
||||
Some("IVF".to_string())
|
||||
Some("IVF_PQ".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
table
|
||||
|
||||
Reference in New Issue
Block a user