From 41cca31f4845c40a621ff282910197a3fd8df943 Mon Sep 17 00:00:00 2001
From: Jai
Date: Sat, 3 Jun 2023 06:08:31 -0700
Subject: [PATCH] Modal example using LangChain (#143)

---
 docs/src/examples/modal_langchain.md          | 166 ++++++++++++++++++
 .../examples/{modal.py => modal_langchain.py} |  53 ++++--
 2 files changed, 201 insertions(+), 18 deletions(-)
 create mode 100644 docs/src/examples/modal_langchain.md
 rename docs/src/examples/{modal.py => modal_langchain.py} (66%)

diff --git a/docs/src/examples/modal_langchain.md b/docs/src/examples/modal_langchain.md
new file mode 100644
index 00000000..82c7396b
--- /dev/null
+++ b/docs/src/examples/modal_langchain.md
@@ -0,0 +1,166 @@
# Serverless QA Bot with Modal and LangChain

## Use LanceDB's LangChain integration with Modal to run a serverless app

We're going to build a QA bot for your documentation using LanceDB's LangChain integration, and deploy it with Modal.

Modal is an end-to-end compute platform for model inference, batch jobs, task queues, web apps, and more. It's a great way to deploy LanceDB-backed models and apps.

To get started, make sure you have created an account and logged into [Modal](https://modal.com/). If you want to follow along, the full source code is available on GitHub [here](https://github.com/lancedb/lancedb/blob/main/docs/src/examples/modal_langchain.py).

### Setting up Modal

We'll start by specifying our dependencies and creating a new Modal `Stub`:

```python
from modal import Image, Secret, Stub, web_endpoint

lancedb_image = Image.debian_slim().pip_install(
    "lancedb",
    "langchain",
    "openai",
    "pandas",
    "tiktoken",
    "unstructured",
    "tabulate"
)

stub = Stub(
    name="example-langchain-lancedb",
    image=lancedb_image,
    secrets=[Secret.from_name("my-openai-secret")],
)
```

We're using Modal's Secrets injection to keep our OpenAI API key out of the code. To set your own, open the Modal UI and create a secret named `my-openai-secret` containing your key.

### Setting up caches for LanceDB and LangChain

Next, we set up some globals for the paths where we cache our LanceDB database and our pickled LangChain document source:

```python
from pathlib import Path

docsearch = None
docs_path = Path("docs.pkl")
db_path = Path("lancedb")
```

### Downloading our dataset

We're going to use a pre-generated dataset containing HTML files of the Pandas 2.0 documentation.
You could swap this out for your own dataset.

```python
import requests
import zipfile

def download_docs():
    # Fetch the zipped Pandas docs and unpack them into ./pandas_docs.
    pandas_docs = requests.get("https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip")
    with open(Path("pandas.documentation.zip"), "wb") as f:
        f.write(pandas_docs.content)

    file = zipfile.ZipFile(Path("pandas.documentation.zip"))
    file.extractall(path=Path("pandas_docs"))
```

### Pre-processing the dataset and generating metadata

Once the dataset is downloaded, we want to parse and pre-process the HTML files with LangChain, then vectorize them and store them in LanceDB.
Let's first create a function that uses LangChain's `UnstructuredHTMLLoader` to parse each file.
We can then attach our own metadata and store it alongside the document text; we'll be able to use it later for metadata filtering.
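The snippet below relies on a small `get_document_title` helper to pull a title out of each file's source path. The full helper lives in `modal_langchain.py`, but its middle lines fall between hunks in the diff further down, so the regex here is a best-guess reconstruction for illustration rather than the exact pattern from the patch:

```python
import re

def get_document_title(document):
    # The loader records the file path under metadata["source"], e.g.
    # "pandas_docs/pandas.documentation/whatsnew/v2.0.0.html".
    source = str(document.metadata["source"])
    # Assumed pattern: keep whatever sits between "pandas.documentation"
    # and ".html" in the path.
    match = re.findall(r"pandas.documentation(.*).html", source)
    if match:
        return match[0]
    return ""
```

With that helper in place, `store_docs` parses each page, attaches the metadata, and pickles the result so later runs can skip the parsing step: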
```python
def store_docs():
    docs = []

    # Parse and tag the docs only once; afterwards, load the pickled result.
    if not docs_path.exists():
        for p in Path("pandas_docs/pandas.documentation").rglob("*.html"):
            if p.is_dir():
                continue
            loader = UnstructuredHTMLLoader(p)
            raw_document = loader.load()

            m = {}
            m["title"] = get_document_title(raw_document[0])
            m["version"] = "2.0rc0"
            raw_document[0].metadata = raw_document[0].metadata | m
            raw_document[0].metadata["source"] = str(raw_document[0].metadata["source"])
            docs = docs + raw_document

        with docs_path.open("wb") as fh:
            pickle.dump(docs, fh)
    else:
        with docs_path.open("rb") as fh:
            docs = pickle.load(fh)

    return docs
```

### Simple LangChain chain for a QA bot

Now we can create a simple LangChain chain for our QA bot. We'll use `RecursiveCharacterTextSplitter` to split the documents into chunks and `OpenAIEmbeddings` to vectorize them.

Lastly, we'll create a LanceDB table, store the vectorized documents in it, build a `RetrievalQA` chain on top of the retriever, and return its answer to the query.

```python
def qanda_langchain(query):
    download_docs()
    docs = store_docs()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(docs)
    embeddings = OpenAIEmbeddings()

    db = lancedb.connect(db_path)
    # Seed the table with one sample row to define its schema;
    # LanceDB.from_documents then fills it with the real chunks.
    table = db.create_table("pandas_docs", data=[
        {"vector": embeddings.embed_query("Hello World"), "text": "Hello World", "id": "1"}
    ], mode="overwrite")
    docsearch = LanceDB.from_documents(documents, embeddings, connection=table)
    qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever())
    return qa.run(query)
```

### Creating our Modal entry points

Now we can create the Modal entry points for our CLI and our web endpoint:

```python
@stub.function()
@web_endpoint(method="GET")
def web(query: str):
    answer = qanda_langchain(query)
    return {
        "answer": answer,
    }

@stub.function()
def cli(query: str):
    answer = qanda_langchain(query)
    print(answer)
```

### Testing it out!

Testing the CLI:

```bash
modal run modal_langchain.py --query "What are the major differences in pandas 2.0?"
```

Testing the web endpoint:

```bash
modal serve modal_langchain.py
```

Modal will print a web endpoint URL in your terminal; copy it for the next step.
Once the app is being served, we can hit that endpoint with `curl`.

Note that the first run will take a few minutes, since it has to download the dataset and vectorize it.
A production deployment would pre-cache the dataset and vectorized documents instead of building them on the first request (a cache-aware sketch follows the diff at the end of this patch).

```bash
curl --get --data-urlencode "query=What are the major differences in pandas 2.0?" \
    https://your-modal-endpoint-app.modal.run

{"answer":" The major differences in pandas 2.0 include the ability to use any numpy numeric dtype in a Index, installing optional dependencies with pip extras, and enhancements, bug fixes, and performance improvements."}
```
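If you'd rather hit the endpoint from Python than from the shell, here's a minimal sketch using `requests`; the URL is the same placeholder as above, to be replaced with the endpoint Modal prints:

```python
import requests

# Placeholder: substitute the URL Modal prints when serving the app.
ENDPOINT = "https://your-modal-endpoint-app.modal.run"

resp = requests.get(
    ENDPOINT,
    params={"query": "What are the major differences in pandas 2.0?"},
)
resp.raise_for_status()
print(resp.json()["answer"])
```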
diff --git a/docs/src/examples/modal.py b/docs/src/examples/modal_langchain.py
similarity index 66%
rename from docs/src/examples/modal.py
rename to docs/src/examples/modal_langchain.py
index 07195639..60a11629 100644
--- a/docs/src/examples/modal.py
+++ b/docs/src/examples/modal_langchain.py
@@ -1,8 +1,10 @@
 import sys
-from modal import Secret, Stub, Image
+from modal import Secret, Stub, Image, web_endpoint
 import lancedb
 import re
 import pickle
+import requests
+import zipfile
 from pathlib import Path
 
 from langchain.document_loaders import UnstructuredHTMLLoader
@@ -12,7 +14,15 @@ from langchain.vectorstores import LanceDB
 from langchain.llms import OpenAI
 from langchain.chains import RetrievalQA
 
-lancedb_image = Image.debian_slim().pip_install("lancedb", "langchain", "openai", "tiktoken")
+lancedb_image = Image.debian_slim().pip_install(
+    "lancedb",
+    "langchain",
+    "openai",
+    "pandas",
+    "tiktoken",
+    "unstructured",
+    "tabulate"
+)
 
 stub = Stub(
     name="example-langchain-lancedb",
@@ -23,7 +33,6 @@ stub = Stub(
 docsearch = None
 docs_path = Path("docs.pkl")
 db_path = Path("lancedb")
-doc_cache = []
 
 def get_document_title(document):
     m = str(document.metadata["source"])
@@ -32,16 +41,24 @@
     return(title[0])
     return ''
 
+def download_docs():
+    pandas_docs = requests.get("https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip")
+    with open(Path("pandas.documentation.zip"), "wb") as f:
+        f.write(pandas_docs.content)
+
+    file = zipfile.ZipFile(Path("pandas.documentation.zip"))
+    file.extractall(path=Path("pandas_docs"))
+
 def store_docs():
     docs = []
 
     if not docs_path.exists():
-        for p in Path("./pandas.documentation").rglob("*.html"):
+        for p in Path("pandas_docs/pandas.documentation").rglob("*.html"):
             if p.is_dir():
                 continue
             loader = UnstructuredHTMLLoader(p)
             raw_document = loader.load()
-            
+
             m = {}
             m["title"] = get_document_title(raw_document[0])
             m["version"] = "2.0rc0"
@@ -55,36 +72,36 @@
     with docs_path.open("rb") as fh:
         docs = pickle.load(fh)
 
-    doc_cache = docs
+    return docs
 
 def qanda_langchain(query):
-    store_docs()
+    download_docs()
+    docs = store_docs()
 
     text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
     )
-    documents = text_splitter.split_documents(doc_cache)
+    documents = text_splitter.split_documents(docs)
     embeddings = OpenAIEmbeddings()
 
     db = lancedb.connect(db_path)
     table = db.create_table("pandas_docs", data=[
-        {"vector": embeddings.embed_query("Hello World")}
+        {"vector": embeddings.embed_query("Hello World"), "text": "Hello World", "id": "1"}
     ], mode="overwrite")
     docsearch = LanceDB.from_documents(documents, embeddings, connection=table)
     qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever())
     return qa.run(query)
 
 @stub.function()
-def cli(query: str, show_sources: bool = False):
+@web_endpoint(method="GET")
+def web(query: str):
     answer = qanda_langchain(query)
-    # Terminal codes for pretty-printing.
- bold, end = "\033[1m", "\033[0m" + return { + "answer": answer, + } - print(f"🦜 {bold}ANSWER:{end}") +@stub.function() +def cli(query: str): + answer = qanda_langchain(query) print(answer) - if show_sources: - print(f"🔗 {bold}SOURCES:{end}") - for text in sources: - print(text) - print("----")
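---

A closing note on the caveat called out in the doc: the example overwrites and re-embeds the `pandas_docs` table on every query, and a production deployment would pre-cache it instead. Below is a minimal sketch of cache-aware table setup, reusing the names from the patch and assuming LanceDB's `table_names()`/`open_table()` API; it is an illustration, not part of the patch:

```python
import lancedb

def get_or_create_table(db_path, embeddings):
    # Reuse the vectorized table across warm starts instead of
    # rebuilding it on every request.
    db = lancedb.connect(db_path)
    if "pandas_docs" in db.table_names():
        # Warm start: the table already exists, so skip re-embedding.
        return db.open_table("pandas_docs"), True
    # Cold start: seed the schema; LanceDB.from_documents fills in the rest.
    table = db.create_table("pandas_docs", data=[
        {"vector": embeddings.embed_query("Hello World"), "text": "Hello World", "id": "1"}
    ])
    return table, False
```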