From e784c6311dc03d4091748e7b8524625538a7c93f Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Tue, 25 Apr 2023 21:40:28 -0700 Subject: [PATCH 01/18] tree github build script from remote --- docs/src/examples/s3_lambda.md | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 docs/src/examples/s3_lambda.md diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md new file mode 100644 index 00000000..46cb6c53 --- /dev/null +++ b/docs/src/examples/s3_lambda.md @@ -0,0 +1,72 @@ +# S3, Lambda and Lance + +## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily. + +This is a great option if you're wanting to scale with your use case and save effort and costs of maintenance. + +Let's walk through how to get a simple Lambda function that queries the SIFT dataset on S3. + +Before we start, you'll need to ensure you create a secure account access to AWS. We recommend using user policies, as this way AWS can share credentials securely without you having to pass around environment variables into Lambda. + +We'll also use a container to ship our Lambda code. This is a good option for Lambda as you don't have the space limits that you would otherwise by building a package yourself. + +First, let's create a new `Dockerfile` using the AWS python container base: + +```docker +FROM public.ecr.aws/lambda/python:3.10 + +RUN pip3 install --upgrade pip +RUN pip3 install --no-cache-dir -U numpy --target "${LAMBDA_TASK_ROOT}" +RUN pip3 install --no-cache-dir -U pylance --target "${LAMBDA_TASK_ROOT}" + +COPY app.py ${LAMBDA_TASK_ROOT} + +CMD [ "app.handler" ] +``` + +Now let's make a simple Lambda function that queries the SIFT dataset, and allows the user to enter a vector and change the nearest neighbour parameter in `app.py`. + +```python +import time +import json + +import numpy as np +import lance +from lance.vector import vec_to_table + +s3_dataset = lance.dataset("s3://eto-public/datasets/sift/vec_data.lance") + +def handler(event, context): + status_code = 200 + num_k = 10 + + if event['query_vector'] is None: + status_code = 404 + return { + "statusCode": status_code, + "headers": { + "Content-Type": "application/json" + }, + "body": json.dumps({ + "Error ": "No vector to query was issued" + }) + } + + # Shape of SIFT is (128,1M), d=float32 + query_vector = np.array(event['query_vector'], dtype=np.float32) + + if event['num_k'] is not None: + num_k = event['num_k'] + + if event['debug'] is not None: + rs = s3_dataset.to_table(nearest={"column": "vector", "k": num_k, "q": query_vector}) + else: + rs = s3_dataset.to_table(nearest={"column": "vector", "k": num_k, "q": query_vector}) + + return { + "statusCode": status_code, + "headers": { + "Content-Type": "application/json" + }, + "body": rs.to_pandas().to_json() + }``` \ No newline at end of file From 41eadf6fd96a3de54e925a702aa7e09b911ab91e Mon Sep 17 00:00:00 2001 From: Jai Date: Wed, 26 Apr 2023 10:18:31 -0700 Subject: [PATCH 02/18] Update s3_lambda.md --- docs/src/examples/s3_lambda.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md index 46cb6c53..59218686 100644 --- a/docs/src/examples/s3_lambda.md +++ b/docs/src/examples/s3_lambda.md @@ -2,6 +2,8 @@ ## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily. +## s3-lambda + This is a great option if you're wanting to scale with your use case and save effort and costs of maintenance. Let's walk through how to get a simple Lambda function that queries the SIFT dataset on S3. @@ -69,4 +71,4 @@ def handler(event, context): "Content-Type": "application/json" }, "body": rs.to_pandas().to_json() - }``` \ No newline at end of file + }``` From 0346d5319ed2f075c8c73b5c5dbcd9a48ae56ed8 Mon Sep 17 00:00:00 2001 From: Jai Date: Wed, 26 Apr 2023 10:18:47 -0700 Subject: [PATCH 03/18] Update s3_lambda.md --- docs/src/examples/s3_lambda.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md index 59218686..58e93b0d 100644 --- a/docs/src/examples/s3_lambda.md +++ b/docs/src/examples/s3_lambda.md @@ -2,7 +2,7 @@ ## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily. -## s3-lambda +s3-lambda This is a great option if you're wanting to scale with your use case and save effort and costs of maintenance. From 3d3ba913ed6b7217caaa08b8a2a7a0093aecd8f2 Mon Sep 17 00:00:00 2001 From: Jai Date: Wed, 26 Apr 2023 10:19:27 -0700 Subject: [PATCH 04/18] Update s3_lambda.md --- docs/src/examples/s3_lambda.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md index 58e93b0d..d25242b3 100644 --- a/docs/src/examples/s3_lambda.md +++ b/docs/src/examples/s3_lambda.md @@ -2,7 +2,7 @@ ## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily. -s3-lambda +s3-lambda This is a great option if you're wanting to scale with your use case and save effort and costs of maintenance. @@ -71,4 +71,4 @@ def handler(event, context): "Content-Type": "application/json" }, "body": rs.to_pandas().to_json() - }``` + } From ca8d8e82b7836c4accc8773ff29734fd0d9ffde3 Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Wed, 26 Apr 2023 14:44:20 -0700 Subject: [PATCH 05/18] add simple langchain example --- docs/src/examples/langchain.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 docs/src/examples/langchain.md diff --git a/docs/src/examples/langchain.md b/docs/src/examples/langchain.md new file mode 100644 index 00000000..636a53ad --- /dev/null +++ b/docs/src/examples/langchain.md @@ -0,0 +1,15 @@ +# Lance + LangChain on Pandas 2.0 + +## simple Pandas 2.0 documentation Q&A answering bot using LangChain + +To demonstrate using Lance, we’re going to build a simple Q&A answering bot using LangChain — an open-source framework that allows you to build composable LLM-based applications easily. We’ll use chat-langchain, a simple Q&A answering bot app as an example. Note: in this fork of chat-langchain, we’re also using a forked version of LangChain integration where we’ve built a Lance integration. + +The first step is to generate embeddings. You could build a bot using your own data, like a wiki page or internal documentation. For this example, we’re going to use the Pandas API documentation. LangChain offers document loaders to read and pre-process many document types. Since the Pandas API is in HTML, reading the docs is straightforward: + +```python +for p in Path("./pandas.documentation").rglob("*.html"): + if p.is_dir(): + continue + loader = UnstructuredHTMLLoader(p) + raw_document = loader.load() + docs = docs + raw_document \ No newline at end of file From aa23d911f5d657495eb1a6e965b22c9ce94ffbc5 Mon Sep 17 00:00:00 2001 From: Jai Date: Wed, 26 Apr 2023 14:50:09 -0700 Subject: [PATCH 06/18] Update langchain.md --- docs/src/examples/langchain.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/src/examples/langchain.md b/docs/src/examples/langchain.md index 636a53ad..cf79e44e 100644 --- a/docs/src/examples/langchain.md +++ b/docs/src/examples/langchain.md @@ -2,6 +2,8 @@ ## simple Pandas 2.0 documentation Q&A answering bot using LangChain + + To demonstrate using Lance, we’re going to build a simple Q&A answering bot using LangChain — an open-source framework that allows you to build composable LLM-based applications easily. We’ll use chat-langchain, a simple Q&A answering bot app as an example. Note: in this fork of chat-langchain, we’re also using a forked version of LangChain integration where we’ve built a Lance integration. The first step is to generate embeddings. You could build a bot using your own data, like a wiki page or internal documentation. For this example, we’re going to use the Pandas API documentation. LangChain offers document loaders to read and pre-process many document types. Since the Pandas API is in HTML, reading the docs is straightforward: @@ -12,4 +14,4 @@ for p in Path("./pandas.documentation").rglob("*.html"): continue loader = UnstructuredHTMLLoader(p) raw_document = loader.load() - docs = docs + raw_document \ No newline at end of file + docs = docs + raw_document From c930b94917c9b6afb216539c7fec372e29769926 Mon Sep 17 00:00:00 2001 From: Jai Date: Thu, 27 Apr 2023 07:12:52 -0700 Subject: [PATCH 07/18] Update s3_lambda.md --- docs/src/examples/s3_lambda.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md index d25242b3..084c5abb 100644 --- a/docs/src/examples/s3_lambda.md +++ b/docs/src/examples/s3_lambda.md @@ -1,4 +1,4 @@ -# S3, Lambda and Lance +# Serverless LanceDB ## Store your data on S3 and use Lambda to compute embeddings and retrieve queries in production easily. From 87fb4d06455a2e8e16ae2ca81c09fe617ffc16f1 Mon Sep 17 00:00:00 2001 From: Jai Date: Thu, 27 Apr 2023 07:13:18 -0700 Subject: [PATCH 08/18] Update langchain.md --- docs/src/examples/langchain.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/examples/langchain.md b/docs/src/examples/langchain.md index cf79e44e..27a50834 100644 --- a/docs/src/examples/langchain.md +++ b/docs/src/examples/langchain.md @@ -1,4 +1,4 @@ -# Lance + LangChain on Pandas 2.0 +# Code Documentation Q&A Bot ## simple Pandas 2.0 documentation Q&A answering bot using LangChain From 7cd36196b47ab34632f28e5010a52db4d6a467e1 Mon Sep 17 00:00:00 2001 From: Jai Date: Thu, 27 Apr 2023 11:08:29 -0700 Subject: [PATCH 09/18] Update langchain.md --- docs/src/examples/langchain.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/examples/langchain.md b/docs/src/examples/langchain.md index 27a50834..d93ae1ad 100644 --- a/docs/src/examples/langchain.md +++ b/docs/src/examples/langchain.md @@ -1,6 +1,6 @@ -# Code Documentation Q&A Bot +# Code Documentation Q&A Bot 2.0 -## simple Pandas 2.0 documentation Q&A answering bot using LangChain +## Simple Pandas 2.0 documentation Q&A answering bot using LangChain From 33ac42a51c59110417c8cf908d6f52d73bf3433e Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:55:25 -0700 Subject: [PATCH 10/18] bump version for v0.1.1 --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 2884c8ee..d0db3b6d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "lancedb" -version = "0.1" +version = "0.1.1" dependencies = ["pylance>=0.4.4", "ratelimiter", "retry", "tqdm"] description = "lancedb" authors = [ From 906551b0010a925405a7684c952b0f9a35d49355 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 27 Apr 2023 10:31:50 -0700 Subject: [PATCH 11/18] initialize the rust core --- rust/Cargo.toml | 9 +++++++++ rust/src/lib.rs | 14 ++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 rust/Cargo.toml create mode 100644 rust/src/lib.rs diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 00000000..a16e339f --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "vectordb" +version = "0.0.1" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +lance = "0.4.3" diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 00000000..7d12d9af --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} From 976344257c92fe4226c4085317bf10c33b6de990 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 27 Apr 2023 10:36:01 -0700 Subject: [PATCH 12/18] add cargo metadata --- rust/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index a16e339f..87f8d0d6 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -2,6 +2,9 @@ name = "vectordb" version = "0.0.1" edition = "2021" +description = "Serverless, low-latency vector database for AI applications" +license = "Apache-2.0" +repository = "https://github.com/lancedb/lancedb" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html From 4336ed050d689556785c7349083092783461bad3 Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Wed, 3 May 2023 07:30:46 -0700 Subject: [PATCH 13/18] add new feature to readme.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index ca5f5821..572e18bb 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,10 @@ The key features of LanceDB include: * Production-scale vector search with no servers to manage. +* Optimized for multi-modal data (text, images, videos, point clouds and more). + +* Native Python and Javascript/Typescript support (coming soon). + * Combine attribute-based information with vectors and store them as a single source-of-truth. * Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure. From 66f7d5cec98154317a5fbca0cc69ad3c0cb8f10f Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Wed, 3 May 2023 07:50:44 -0700 Subject: [PATCH 14/18] also update docs index --- docs/src/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index 706fb53d..314132cb 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,6 +6,10 @@ The key features of LanceDB include: * Production-scale vector search with no servers to manage. +* Optimized for multi-modal data (text, images, videos, point clouds and more). + +* Native Python and Javascript/Typescript support (coming soon). + * Combine attribute-based information with vectors and store them as a single source-of-truth. * Zero-copy, automatic versioning, manage versions of your data without needing extra infrastructure. From c3d90b2c7889b6cd506289deda6235d826be6440 Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Wed, 3 May 2023 19:55:10 -0700 Subject: [PATCH 15/18] update tagline --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 572e18bb..79e9b40b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ LanceDB Logo -**Serverless, low-latency vector database for AI applications** +**Developer-friendly, serverless vector database for AI applications** DocumentationBlog • From 6556e42e6ded36dc01e1ca1a9dfdd94222b863c4 Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Thu, 4 May 2023 08:16:33 -0700 Subject: [PATCH 16/18] update lambda example to lancedb --- docs/src/examples/langchain.md | 121 +++++++++++++++++++++++++++++++++ docs/src/examples/s3_lambda.md | 65 ++++++++++++++---- 2 files changed, 171 insertions(+), 15 deletions(-) diff --git a/docs/src/examples/langchain.md b/docs/src/examples/langchain.md index d93ae1ad..93f2b782 100644 --- a/docs/src/examples/langchain.md +++ b/docs/src/examples/langchain.md @@ -15,3 +15,124 @@ for p in Path("./pandas.documentation").rglob("*.html"): loader = UnstructuredHTMLLoader(p) raw_document = loader.load() docs = docs + raw_document +``` + +Once we have pre-processed our input documents, the next step is to generate the embeddings. For this, we’ll use LangChain’s OpenAI API wrapper. Note that you’ll need to sign up for the OpenAI API (this is a paid service). Here we’ll tokenize the documents and store them in a Lance dataset. Using Lance will persist your embeddings locally, you can read more about Lance’s data management features in its [documentation](https://eto-ai.github.io/lance/). + +```python +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, + chunk_overlap=200 +) +documents = text_splitter.split_documents(docs) +embeddings = OpenAIEmbeddings() +LanceDataset.from_documents(documents, embeddings, uri="pandas.lance") +``` + +Now we’ve got our vector store setup, we can boot up the chat app, which uses LangChain’s chain API to submit an input query to the vector store. Under the hood, this will generate embeddings for your query, perform similarity search using the vector store and generate the resulting text. + +And presto! Your very own Pandas API helper bot, a handy little assistant to help you get up to speed with Pandas — here are some examples: + +First let’s make sure we’re on the right document version for pandas: + +Great, now we can ask some more specific questions: + +So far so good! + +# Integrating Lance into LangChain + +LangChain has a vectorstore abstraction with multiple implementations. This is where we put Lance. In our own langchain fork, we added a lance_dataset.py as a new kind of vectorstore that is just a LanceDataset (pip install pylance). Once you get the embeddings, you can call lance’s vec_to_table() method to create a pyarrow Table from it: + +```python +import lance +from lance.vector import vec_table +embeddings = embedding.embed_documents(texts) +tbl = vec_to_to_table(embeddings) +``` + +Writing the data is just: + +```python +uri = "pandas_documentation.lance" +dataset = lance.write_dataset(tbl, uri) +``` + +If the dataset is small, Lance’s SIMD code for vector distances makes brute forcing faster than numpy. And if the dataset is large, you can create an ANN index in another 1 line of python code. + +```python +dataset.create_index("vector", index_type="IVF_PQ", + num_partitions=256, # ivf partitions + num_sub_vectors=num_sub_vectors) # PQ subvectors +``` + +To make an ANN query to find 10 closest neighbors to the query_vector and also fetch the document and metadata, use the to_table function like this: + +```python +tbl = self.dataset.to_table(columns=["document", "metadata"], + nearest={"column": "vector", + "q": query_vector, + "k": 10}) +``` + +You now have a pyarrow Table that you can use for downstream re-ranking and filtering. + +# Lance datasets are queryable + +Lance datasets are Arrow compatible so you can directly query your Lance datasets using Pandas, DuckDB, Polars to make it super easy to do additional filtering, reranking, enrichment, and or debugging. + +We store metadata in JSON alongside the vectors, but, if you wish, you could define the data model yourself. + +For example if we want to check the number of vectors we’re storing, against what version of documentation in DuckDB, you can load the lance dataset in DuckDB and run SQL against it: + +```sql +SELECT + count(vector), + json_extract(metadata, '$.version') as VERSION +FROM + pandas_docs +GROUP BY + VERSION +``` + +# Lance versions your data automatically + +Oftentimes we have to regenerate our embeddings on a regular basis. This makes debugging a pain to have to track down the right version of your data and the right version of your index to diagnose any issues. With Lance you can create a new version of your dataset by specifying mode=”overwrite” when writing. + +Let’s say we start with some toy data: + +```python +>>> import pandas as pd +>>> import lance +>>> import numpy as np +>>> df = pd.DataFrame({"a": [5]}) +>>> dataset = lance.write_dataset(df, "data.lance") +>>> dataset.to_table().to_pandas() + a +0 5 +``` + +We can create and persist a new version: + +```python +>>> df = pd.DataFrame({"a": [50, 100]}) +>>> dataset = lance.write_dataset(df, "data.lance", mode="overwrite") +>>> dataset.to_table().to_pandas() + a +0 50 +1 100 +``` + +And you can time travel to a previous version by specifying the version number or timestamp when you create the dataset instance. + +```python +>>> dataset.versions() +[{'version': 2, 'timestamp': datetime.datetime(2023, 2, 24, 11, 58, 20, 739008), 'metadata': {}}, + {'version': 1, 'timestamp': datetime.datetime(2023, 2, 24, 11, 56, 59, 690977), 'metadata': {}}] +>>> lance.dataset("data.lance", version=1).to_table().to_pandas() + a +0 5 +``` + +# Where Lance is headed + +Lance’s random access performance makes it ideal to build search engines and high-performance data stores for deep learning. We’re actively working to make Lance support 1B+ scale vector datasets, partitioning and reindexing, and new index types. Lance is written in Rust and comes with a wrapper for python and an extension for duckdb. \ No newline at end of file diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md index 084c5abb..43d40648 100644 --- a/docs/src/examples/s3_lambda.md +++ b/docs/src/examples/s3_lambda.md @@ -12,31 +12,57 @@ Before we start, you'll need to ensure you create a secure account access to AWS We'll also use a container to ship our Lambda code. This is a good option for Lambda as you don't have the space limits that you would otherwise by building a package yourself. -First, let's create a new `Dockerfile` using the AWS python container base: +# Initial setup: creating a LanceDB Table and storing it remotely on S3 + +We'll use the SIFT vector dataset as an example. To make it easier, we've already made a Lance-format SIFT dataset publically available, which we can access and use to populate our LanceDB Table. + +To do this, download the Lance files locally first from: + +``` +s3://eto-public/datasets/sift/vec_data.lance +``` + +Then, we can write a quick Python script to populate our LanceDB Table: + +```python +import pylance +sift_dataset = pylance.dataset("/path/to/local/vec_data.lance") +df = sift_dataset.to_table().to_pandas() + +import lancedb +db = lancedb.connect(".") +table = db.create_table("vector_example", df) +``` + +Once we've created our Table, we are free to move this data over to S3 so we can remotely host it. + +# Building our Lambda app: a simple event handler for vector search + +Now that we've got a remotely hosted LanceDB Table, we'll want to be able to query it from Lambda. To do so, let's create a new `Dockerfile` using the AWS python container base: ```docker FROM public.ecr.aws/lambda/python:3.10 RUN pip3 install --upgrade pip RUN pip3 install --no-cache-dir -U numpy --target "${LAMBDA_TASK_ROOT}" -RUN pip3 install --no-cache-dir -U pylance --target "${LAMBDA_TASK_ROOT}" +RUN pip3 install --no-cache-dir -U lancedb --target "${LAMBDA_TASK_ROOT}" COPY app.py ${LAMBDA_TASK_ROOT} CMD [ "app.handler" ] ``` -Now let's make a simple Lambda function that queries the SIFT dataset, and allows the user to enter a vector and change the nearest neighbour parameter in `app.py`. +Now let's make a simple Lambda function that queries the SIFT dataset in `app.py`. ```python import time import json import numpy as np -import lance -from lance.vector import vec_to_table +import lancedb -s3_dataset = lance.dataset("s3://eto-public/datasets/sift/vec_data.lance") +db = lancedb.connect("s3://eto-public/tables") +table = db.open_table("vector_example") def handler(event, context): status_code = 200 @@ -56,19 +82,28 @@ def handler(event, context): # Shape of SIFT is (128,1M), d=float32 query_vector = np.array(event['query_vector'], dtype=np.float32) - - if event['num_k'] is not None: - num_k = event['num_k'] - - if event['debug'] is not None: - rs = s3_dataset.to_table(nearest={"column": "vector", "k": num_k, "q": query_vector}) - else: - rs = s3_dataset.to_table(nearest={"column": "vector", "k": num_k, "q": query_vector}) + + rs = table.search(query_vector).limit(2).to_df() return { "statusCode": status_code, "headers": { "Content-Type": "application/json" }, - "body": rs.to_pandas().to_json() + "body": rs.to_json() } +``` + +# Deploying the container to EKS + +The next step is to build and push the container to EKS, where it can then be used to create a new Lambda function. + +It's best to follow the official AWS documentation for how to do this, which you can view here: + +``` +https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-upload +``` + +# Final step: setting up your Lambda function + +Once the container is pushed, you can create a Lambda function by selecting the container. From 6ff3c60cd13ec7f7bcc448b0a0d6fbee44b538c1 Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Thu, 4 May 2023 10:14:31 -0700 Subject: [PATCH 17/18] clean up example --- docs/src/examples/s3_lambda.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/src/examples/s3_lambda.md b/docs/src/examples/s3_lambda.md index 43d40648..b63201e4 100644 --- a/docs/src/examples/s3_lambda.md +++ b/docs/src/examples/s3_lambda.md @@ -55,9 +55,7 @@ CMD [ "app.handler" ] Now let's make a simple Lambda function that queries the SIFT dataset in `app.py`. ```python -import time import json - import numpy as np import lancedb @@ -66,7 +64,6 @@ table = db.open_table("vector_example") def handler(event, context): status_code = 200 - num_k = 10 if event['query_vector'] is None: status_code = 404 From 11f423ccf5d8da297a5f5cc6c822ba5869c8f607 Mon Sep 17 00:00:00 2001 From: Jai Chopra Date: Thu, 4 May 2023 17:21:53 -0700 Subject: [PATCH 18/18] clean up --- docs/src/examples/langchain.md | 138 --------------------------------- python/pyproject.toml | 2 +- rust/Cargo.toml | 12 --- rust/src/lib.rs | 14 ---- 4 files changed, 1 insertion(+), 165 deletions(-) delete mode 100644 docs/src/examples/langchain.md delete mode 100644 rust/Cargo.toml delete mode 100644 rust/src/lib.rs diff --git a/docs/src/examples/langchain.md b/docs/src/examples/langchain.md deleted file mode 100644 index 93f2b782..00000000 --- a/docs/src/examples/langchain.md +++ /dev/null @@ -1,138 +0,0 @@ -# Code Documentation Q&A Bot 2.0 - -## Simple Pandas 2.0 documentation Q&A answering bot using LangChain - - - -To demonstrate using Lance, we’re going to build a simple Q&A answering bot using LangChain — an open-source framework that allows you to build composable LLM-based applications easily. We’ll use chat-langchain, a simple Q&A answering bot app as an example. Note: in this fork of chat-langchain, we’re also using a forked version of LangChain integration where we’ve built a Lance integration. - -The first step is to generate embeddings. You could build a bot using your own data, like a wiki page or internal documentation. For this example, we’re going to use the Pandas API documentation. LangChain offers document loaders to read and pre-process many document types. Since the Pandas API is in HTML, reading the docs is straightforward: - -```python -for p in Path("./pandas.documentation").rglob("*.html"): - if p.is_dir(): - continue - loader = UnstructuredHTMLLoader(p) - raw_document = loader.load() - docs = docs + raw_document -``` - -Once we have pre-processed our input documents, the next step is to generate the embeddings. For this, we’ll use LangChain’s OpenAI API wrapper. Note that you’ll need to sign up for the OpenAI API (this is a paid service). Here we’ll tokenize the documents and store them in a Lance dataset. Using Lance will persist your embeddings locally, you can read more about Lance’s data management features in its [documentation](https://eto-ai.github.io/lance/). - -```python -text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=200 -) -documents = text_splitter.split_documents(docs) -embeddings = OpenAIEmbeddings() -LanceDataset.from_documents(documents, embeddings, uri="pandas.lance") -``` - -Now we’ve got our vector store setup, we can boot up the chat app, which uses LangChain’s chain API to submit an input query to the vector store. Under the hood, this will generate embeddings for your query, perform similarity search using the vector store and generate the resulting text. - -And presto! Your very own Pandas API helper bot, a handy little assistant to help you get up to speed with Pandas — here are some examples: - -First let’s make sure we’re on the right document version for pandas: - -Great, now we can ask some more specific questions: - -So far so good! - -# Integrating Lance into LangChain - -LangChain has a vectorstore abstraction with multiple implementations. This is where we put Lance. In our own langchain fork, we added a lance_dataset.py as a new kind of vectorstore that is just a LanceDataset (pip install pylance). Once you get the embeddings, you can call lance’s vec_to_table() method to create a pyarrow Table from it: - -```python -import lance -from lance.vector import vec_table -embeddings = embedding.embed_documents(texts) -tbl = vec_to_to_table(embeddings) -``` - -Writing the data is just: - -```python -uri = "pandas_documentation.lance" -dataset = lance.write_dataset(tbl, uri) -``` - -If the dataset is small, Lance’s SIMD code for vector distances makes brute forcing faster than numpy. And if the dataset is large, you can create an ANN index in another 1 line of python code. - -```python -dataset.create_index("vector", index_type="IVF_PQ", - num_partitions=256, # ivf partitions - num_sub_vectors=num_sub_vectors) # PQ subvectors -``` - -To make an ANN query to find 10 closest neighbors to the query_vector and also fetch the document and metadata, use the to_table function like this: - -```python -tbl = self.dataset.to_table(columns=["document", "metadata"], - nearest={"column": "vector", - "q": query_vector, - "k": 10}) -``` - -You now have a pyarrow Table that you can use for downstream re-ranking and filtering. - -# Lance datasets are queryable - -Lance datasets are Arrow compatible so you can directly query your Lance datasets using Pandas, DuckDB, Polars to make it super easy to do additional filtering, reranking, enrichment, and or debugging. - -We store metadata in JSON alongside the vectors, but, if you wish, you could define the data model yourself. - -For example if we want to check the number of vectors we’re storing, against what version of documentation in DuckDB, you can load the lance dataset in DuckDB and run SQL against it: - -```sql -SELECT - count(vector), - json_extract(metadata, '$.version') as VERSION -FROM - pandas_docs -GROUP BY - VERSION -``` - -# Lance versions your data automatically - -Oftentimes we have to regenerate our embeddings on a regular basis. This makes debugging a pain to have to track down the right version of your data and the right version of your index to diagnose any issues. With Lance you can create a new version of your dataset by specifying mode=”overwrite” when writing. - -Let’s say we start with some toy data: - -```python ->>> import pandas as pd ->>> import lance ->>> import numpy as np ->>> df = pd.DataFrame({"a": [5]}) ->>> dataset = lance.write_dataset(df, "data.lance") ->>> dataset.to_table().to_pandas() - a -0 5 -``` - -We can create and persist a new version: - -```python ->>> df = pd.DataFrame({"a": [50, 100]}) ->>> dataset = lance.write_dataset(df, "data.lance", mode="overwrite") ->>> dataset.to_table().to_pandas() - a -0 50 -1 100 -``` - -And you can time travel to a previous version by specifying the version number or timestamp when you create the dataset instance. - -```python ->>> dataset.versions() -[{'version': 2, 'timestamp': datetime.datetime(2023, 2, 24, 11, 58, 20, 739008), 'metadata': {}}, - {'version': 1, 'timestamp': datetime.datetime(2023, 2, 24, 11, 56, 59, 690977), 'metadata': {}}] ->>> lance.dataset("data.lance", version=1).to_table().to_pandas() - a -0 5 -``` - -# Where Lance is headed - -Lance’s random access performance makes it ideal to build search engines and high-performance data stores for deep learning. We’re actively working to make Lance support 1B+ scale vector datasets, partitioning and reindexing, and new index types. Lance is written in Rust and comes with a wrapper for python and an extension for duckdb. \ No newline at end of file diff --git a/python/pyproject.toml b/python/pyproject.toml index d0db3b6d..2884c8ee 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "lancedb" -version = "0.1.1" +version = "0.1" dependencies = ["pylance>=0.4.4", "ratelimiter", "retry", "tqdm"] description = "lancedb" authors = [ diff --git a/rust/Cargo.toml b/rust/Cargo.toml deleted file mode 100644 index 87f8d0d6..00000000 --- a/rust/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "vectordb" -version = "0.0.1" -edition = "2021" -description = "Serverless, low-latency vector database for AI applications" -license = "Apache-2.0" -repository = "https://github.com/lancedb/lancedb" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -lance = "0.4.3" diff --git a/rust/src/lib.rs b/rust/src/lib.rs deleted file mode 100644 index 7d12d9af..00000000 --- a/rust/src/lib.rs +++ /dev/null @@ -1,14 +0,0 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -}