diff --git a/docs/src/notebooks/lancedb_reranking.ipynb b/docs/src/notebooks/lancedb_reranking.ipynb new file mode 100644 index 00000000..7536a8fa --- /dev/null +++ b/docs/src/notebooks/lancedb_reranking.ipynb @@ -0,0 +1,1481 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "b3Y3DOVqtIbc" + }, + "source": [ + "# Example walkthrough\n", + "\n", + "## Optimizing RAG retrieval performance using hybrid search & reranking" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6gUUIxGP0n1Z", + "outputId": "96e24cff-abfa-46dd-ada5-28b6c15b4f47" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.9/20.9 MB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.1/227.1 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.0/40.0 MB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.8/173.8 kB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m25.5/25.5 MB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m73.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m75.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 13.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" ] } ], "source": [ "!pip install lancedb sentence-transformers tantivy pyarrow==13.0.0 cohere -q" ] }, { "cell_type": "markdown", "metadata": { "id": "DQSVI4GSjU0b" }, "source": [ "## What is a retriever\n", "Vector databases are used as retrievers in recommender or chatbot-based systems to retrieve relevant data based on user queries. For example, a retriever is a critical component of Retrieval-Augmented Generation (RAG) architectures. In this section, we will discuss how to improve the performance of retrievers.\n", "\n", "\n", "\n", "[source](https://llmstack.ai/assets/images/rag-f517f1f834bdbb94a87765e0edd40ff2.png)\n", "\n", "## How do you go about improving retrieval performance\n", "Some of the common techniques are:\n", "\n", "- Using different search types - vector/semantic, FTS (BM25)\n", "- Hybrid search\n", "- Reranking\n", "- Fine-tuning the embedding models\n", "- Using different embedding models\n", "\n", "This list is not exhaustive. There are other, subtler ways to improve retrieval performance, such as experimenting with chunking strategies or using different distance/similarity metrics. For brevity, we'll only cover the higher-level, more impactful techniques here.\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "3ZCm3-Bog9g7" }, "source": [ "# LanceDB\n", "- Multimodal database for AI\n", "- Powered by an innovative, open-source in-house file format\n", "- Zero setup\n", "- Scales up on disk storage\n", "- Native support for vector, full-text (BM25), and hybrid search\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "b1fzhbQc4O1u" }, "source": [ "## The dataset\n", "The dataset we'll use is a synthetic QA dataset generated from the Llama 2 paper. The paper was divided into chunks, with each chunk serving as a unique context, and an LLM was prompted to generate questions relevant to each context for testing a retriever.\n", "The exact code and other utility functions for this can be found in [this](https://github.com/lancedb/ragged) repo.\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f_qnH-Dfhi9Z", "outputId": "9d41e17e-c994-473d-cf17-93568e025252" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2024-06-26 13:31:34-- https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 680439 (664K) [text/plain]\n", + "Saving to: ‘data_qa.csv’\n", + "\n", + "data_qa.csv 100%[===================>] 664.49K --.-KB/s in 0.009s \n", + "\n", + "2024-06-26 13:31:34 (71.7 MB/s) - ‘data_qa.csv’ saved [680439/680439]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "ZNNAUc6f7ILI" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv(\"data_qa.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 580 + }, + "id": "4Bp9Fdhz7QsM", + "outputId": "4cb6b384-7991-4911-e5fd-7fbd42487916" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"data\",\n \"rows\": 220,\n \"fields\": [\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 63,\n \"min\": 0,\n \"max\": 219,\n \"num_unique_values\": 220,\n \"samples\": [\n 132,\n 148,\n 93\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 220,\n \"samples\": [\n \"What type of examination did scholars perform on ChatGPT, and when was the resulting scholarly paper published?\",\n \"How do the performance capabilities of the different models compare in evaluating tasks associated with logical reasoning and reading comprehension, specifically noted in tests like LSAT and SAT?\",\n \"What steps are recommended for users to ensure the responsible use of AI models like Llama 2 in projects or commercial applications?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"context\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 110,\n \"samples\": [\n \"Dialogue Turn Baseline + GAtt\\n2 100% 100%\\n4 10% 100%\\n6 0% 100%\\n20 0% 100%\\nTable30: GAttresults. Llama 2-Chat withGAttisabletorefertoattributes100%ofthetime,forupto20\\nturns from our human evaluation. We limited the evaluated attributes to public figures and hobbies.\\nTheattentionnowspansbeyond20turns. Wetestedthemodelabilitytorememberthesystemarguments\\ntroughahumanevaluation. Thearguments(e.g. hobbies,persona)aredefinedduringthefirstmessage,and\\nthen from turn 2 to 20. We explicitly asked the model to refer to them (e.g. \\u201cWhat is your favorite hobby?\\u201d,\\n\\u201cWhatisyourname?\\u201d),tomeasurethemulti-turnmemoryabilityof Llama 2-Chat . Wereporttheresults\\ninTable30. EquippedwithGAtt, Llama 2-Chat maintains100%accuracy,alwaysreferringtothedefined\\nattribute,andso,upto20turns(wedidnotextendthehumanevaluationmore,andalltheexampleshad\\nlessthan4048tokensintotalovertheturns). Asacomparison, Llama 2-Chat withoutGAttcannotanymore\\nrefer to the attributes after only few turns: from 100% at turn t+1, to 10% at turn t+3 and then 0%.\\nGAttZero-shotGeneralisation. Wetriedatinferencetimetosetconstrainnotpresentinthetrainingof\\nGAtt. For instance, \\u201canswer in one sentence only\\u201d, for which the model remained consistent, as illustrated in\\nFigure 28.\\nWe applied first GAtt to Llama 1 , which was pretrained with a context length of 2048 tokens and then\\nfine-tuned with 4096 max length. 
We tested if GAtt works beyond 2048 tokens, and the model arguably\\nmanaged to understand attributes beyond this window. This promising result indicates that GAtt could be\\nadapted as an efficient technique for long context attention.\\nA.3.6 How Far Can Model-Based Evaluation Go?\\nTo measure the robustness of our reward model, we collected a test set of prompts for both helpfulness and\\nsafety,andaskedannotatorstojudgequalityoftheanswersbasedona7pointLikert-scale(thehigherthe\\nbetter)usingtriplereviews. AsillustratedinFigure29(inAppendix),weobservethatourrewardmodels\\noverallarewellcalibratedwithhumanpreference. Notethatthisenablesustousetherewardasapoint-wise\\nmetric, despite being trained with a Pairwise Ranking Loss.\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0Reward Model ScoreNo Margin\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0\\nMargin Small\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0\\nMargin Large\\nFigure 27: Reward model score distribution shift caused by incorporating preference rating based margin\\ninrankingloss. Withthemarginterm, weobserveabinary splitpatterninrewarddistribution, especially\\nwith a larger margin.\\n54\",\n \"Model Size CodeCommonsense\\nReasoningWorld\\nKnowledgeReading\\nComprehensionMath MMLU BBH AGI Eval\\nMPT7B 20.5 57.4 41.0 57.5 4.9 26.8 31.0 23.5\\n30B 28.9 64.9 50.0 64.7 9.1 46.9 38.0 33.8\\nFalcon7B 5.6 56.1 42.8 36.0 4.6 26.2 28.0 21.2\\n40B 15.2 69.2 56.7 65.7 12.6 55.4 37.1 37.0\\nLlama 17B 14.1 60.8 46.2 58.5 6.95 35.1 30.3 23.9\\n13B 18.9 66.1 52.6 62.3 10.9 46.9 37.0 33.9\\n33B 26.0 70.0 58.4 67.6 21.4 57.8 39.8 41.7\\n65B 30.7 70.7 60.5 68.6 30.8 63.4 43.5 47.6\\nLlama 27B 16.8 63.9 48.9 61.3 14.6 45.3 32.6 29.3\\n13B 24.5 66.9 55.4 65.8 28.7 54.8 39.4 39.1\\n34B 27.8 69.9 58.7 68.0 24.2 62.6 44.1 43.4\\n70B37.5 71.9 63.6 69.4 35.2 68.9 51.2 54.2\\nTable3: Overallperformanceongroupedacademicbenchmarkscomparedtoopen-sourcebasemodels.\\n\\u2022Popular Aggregated Benchmarks . We report the overall results for MMLU (5 shot) (Hendrycks\\net al., 2020), Big Bench Hard (BBH) (3 shot) (Suzgun et al., 2022), and AGI Eval (3\\u20135 shot) (Zhong\\net al., 2023). For AGI Eval, we only evaluate on the English tasks and report the average.\\nAs shown in Table 3, Llama 2 models outperform Llama 1 models. In particular, Llama 2 70B improves the\\nresultsonMMLUandBBHby \\u22485and\\u22488points,respectively,comparedto Llama 1 65B.Llama 2 7Band30B\\nmodelsoutperformMPTmodelsofthecorrespondingsizeonallcategoriesbesidescodebenchmarks. Forthe\\nFalcon models, Llama 2 7B and 34B outperform Falcon 7B and 40B models on all categories of benchmarks.\\nAdditionally, Llama 2 70B model outperforms all open-source models.\\nIn addition to open-source models, we also compare Llama 2 70B results to closed-source models. As shown\\nin Table 4, Llama 2 70B is close to GPT-3.5 (OpenAI, 2023) on MMLU and GSM8K, but there is a significant\\ngaponcodingbenchmarks. Llama 2 70BresultsareonparorbetterthanPaLM(540B)(Chowdheryetal.,\\n2022)onalmostallbenchmarks. Thereisstillalargegapinperformancebetween Llama 2 70BandGPT-4\\nand PaLM-2-L.\\nWe also analysed the potential data contamination and share the details in Section A.6.\",\n \"Figure 1: Helpfulness human evaluation results for Llama\\n2-Chatcomparedtootheropen-sourceandclosed-source\\nmodels. Human raters compared model generations on ~4k\\npromptsconsistingofbothsingleandmulti-turnprompts.\\nThe95%confidenceintervalsforthisevaluationarebetween\\n1%and2%. MoredetailsinSection3.4.2. 
Whilereviewing\\nthese results, it is important to note that human evaluations\\ncanbenoisyduetolimitationsofthepromptset,subjectivity\\nof the review guidelines, subjectivity of individual raters,\\nand the inherent difficulty of comparing generations.\\nFigure 2: Win-rate % for helpfulness and\\nsafety between commercial-licensed base-\\nlines and Llama 2-Chat , according to GPT-\\n4. Tocomplementthehumanevaluation,we\\nused a more capable model, not subject to\\nourownguidance. Greenareaindicatesour\\nmodelisbetteraccordingtoGPT-4. Toremove\\nties, we used win/ (win+loss). The orders in\\nwhichthemodelresponsesarepresentedto\\nGPT-4arerandomlyswappedtoalleviatebias.\\n1 Introduction\\nLarge Language Models (LLMs) have shown great promise as highly capable AI assistants that excel in\\ncomplex reasoning tasks requiring expert knowledge across a wide range of fields, including in specialized\\ndomains such as programming and creative writing. They enable interaction with humans through intuitive\\nchat interfaces, which has led to rapid and widespread adoption among the general public.\\nThecapabilitiesofLLMsareremarkableconsideringtheseeminglystraightforwardnatureofthetraining\\nmethodology. Auto-regressivetransformersarepretrainedonanextensivecorpusofself-superviseddata,\\nfollowed by alignment with human preferences via techniques such as Reinforcement Learning with Human\\nFeedback(RLHF).Althoughthetrainingmethodologyissimple,highcomputationalrequirementshave\\nlimited the development of LLMs to a few players. There have been public releases of pretrained LLMs\\n(such as BLOOM (Scao et al., 2022), LLaMa-1 (Touvron et al., 2023), and Falcon (Penedo et al., 2023)) that\\nmatch the performance of closed pretrained competitors like GPT-3 (Brown et al., 2020) and Chinchilla\\n(Hoffmann et al., 2022), but none of these models are suitable substitutes for closed \\u201cproduct\\u201d LLMs, such\\nasChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyfine-tunedtoalignwithhuman\\npreferences, which greatly enhances their usability and safety. This step can require significant costs in\\ncomputeandhumanannotation,andisoftennottransparentoreasilyreproducible,limitingprogresswithin\\nthe community to advance AI alignment research.\\nIn this work, we develop and release Llama 2, a family of pretrained and fine-tuned LLMs, Llama 2 and\\nLlama 2-Chat , at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested,\\nLlama 2-Chat models generally perform better than existing open-source models. They also appear to\\nbe on par with some of the closed-source models, at least on the human evaluations we performed (see\\nFigures1and3). Wehavetakenmeasurestoincreasethesafetyofthesemodels,usingsafety-specificdata\\nannotation and tuning, as well as conducting red-teaming and employing iterative evaluations. Additionally,\\nthispapercontributesathoroughdescriptionofourfine-tuningmethodologyandapproachtoimproving\\nLLM safety. 
We hope that this openness will enable the community to reproduce fine-tuned LLMs and\\ncontinue to improve the safety of those models, paving the way for more responsible development of LLMs.\\nWealsosharenovelobservationswemadeduringthedevelopmentof Llama 2 andLlama 2-Chat ,suchas\\nthe emergence of tool usage and temporal organization of knowledge.\\n3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 220,\n \"samples\": [\n \"Scholars performed a diagnostic analysis to investigate the AI ethics associated with ChatGPT. Their findings were compiled into a research paper that became accessible as a preprint on arXiv in January 2023.\",\n \"The MPT 30B model demonstrates considerable proficiency in logical reasoning and reading comprehension tasks, scoring highly on LSAT-LR, LSAT-RC, and SAT-en tests compared to its peers, such as Falcon 40B and Llama 17B. This is indicative of its advanced analytical and comprehension abilities. Conversely, while Falcon 40B shows strengths in LSAT-LR with a score second only to MPT 30B, it trails in SAT-en performance. This variability underscores the diverse capabilities of models based on their structural design and training paradigms.\",\n \"Users intending to deploy models like Llama 2 are advised to strictly adhere to guidelines laid out in the Responsible Use Guide. This includes employing enhanced safety measures at both the input and output stages of model interaction, as well as carefully tuning the model according to specific use-case requirements to prevent any potential misuse. Additionally, users must comply with the terms set in the Acceptable Use Policy, ensuring their applications do not contravene applicable laws, regulations, and ethical standards. Leveraging provided code examples can further assist developers in replicating the necessary safety protocols and maintaining ethical integrity in their applications.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "data" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0querycontextanswer
00How does the performance of Llama 2-Chat model...Llama 2 : Open Foundation and Fine-Tuned Chat ...Llama 2-Chat models have shown to exceed the p...
11What benefits does the enhancement and safety ...Llama 2 : Open Foundation and Fine-Tuned Chat ...The safety and enhancement measures implemente...
22How does one ensure the reliability and robust...Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2...In the initial steps of model development, the...
33What methodologies are employed to align machi...Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2...Machine learning models can be aligned with de...
44What are some of the primary insights gained f.... . . . . . . . 23\\n4.3 Red Teaming . . . . . ...The key insights gained from evaluating platfo...
...............
215215How are the terms 'clean', 'not clean', 'dirty...Giventhe\\nembarrassinglyparallelnatureofthetas...In the discussed dataset analysis, samples are...
216216How does the size of the model influence the a...Dataset Model Subset Type Avg. Contam. % n ¯X ...The size of the model significantly influences...
217217What impact does the model contamination have ...Dataset Model Subset Type Avg. Contam. % n ¯X ...Model contamination affects various contaminat...
218218What are the different sizes and types availab...A.7 Model Card\\nTable 52 presents a model card...Llama 2 is available in three distinct paramet...
219219Could you discuss the sustainability measures ...A.7 Model Card\\nTable 52 presents a model card...Throughout the training of Llama 2, which invo...
\n", + "

220 rows × 4 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " Unnamed: 0 query \\\n", + "0 0 How does the performance of Llama 2-Chat model... \n", + "1 1 What benefits does the enhancement and safety ... \n", + "2 2 How does one ensure the reliability and robust... \n", + "3 3 What methodologies are employed to align machi... \n", + "4 4 What are some of the primary insights gained f... \n", + ".. ... ... \n", + "215 215 How are the terms 'clean', 'not clean', 'dirty... \n", + "216 216 How does the size of the model influence the a... \n", + "217 217 What impact does the model contamination have ... \n", + "218 218 What are the different sizes and types availab... \n", + "219 219 Could you discuss the sustainability measures ... \n", + "\n", + " context \\\n", + "0 Llama 2 : Open Foundation and Fine-Tuned Chat ... \n", + "1 Llama 2 : Open Foundation and Fine-Tuned Chat ... \n", + "2 Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2... \n", + "3 Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2... \n", + "4 . . . . . . . . 23\\n4.3 Red Teaming . . . . . ... \n", + ".. ... \n", + "215 Giventhe\\nembarrassinglyparallelnatureofthetas... \n", + "216 Dataset Model Subset Type Avg. Contam. % n ¯X ... \n", + "217 Dataset Model Subset Type Avg. Contam. % n ¯X ... \n", + "218 A.7 Model Card\\nTable 52 presents a model card... \n", + "219 A.7 Model Card\\nTable 52 presents a model card... \n", + "\n", + " answer \n", + "0 Llama 2-Chat models have shown to exceed the p... \n", + "1 The safety and enhancement measures implemente... \n", + "2 In the initial steps of model development, the... \n", + "3 Machine learning models can be aligned with de... \n", + "4 The key insights gained from evaluating platfo... \n", + ".. ... \n", + "215 In the discussed dataset analysis, samples are... \n", + "216 The size of the model significantly influences... \n", + "217 Model contamination affects various contaminat... \n", + "218 Llama 2 is available in three distinct paramet... \n", + "219 Throughout the training of Llama 2, which invo... 
\n", + "\n", + "[220 rows x 4 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DJ1eG8XvPblc" + }, + "source": [ + "## Ingestion\n", + "Let us now ingest the contexts in LanceDB\n", + "\n", + "- Create a schema (Pydantic or Pyarrow)\n", + "- Select an embedding model from LanceDB Embedding API (Allows automatic vectorization of data)\n", + "- Ingest the contexts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 336, + "referenced_widgets": [ + "91f35d9568ec46459fab4efe6a95e734", + "c95044413a9b439fbfcd0632a70944a8", + "6c7346ac20744690956df20d3d03680c", + "1b4311d56bb04fedaa7b64059401d965", + "c98336949ffc4ceda45c2c3dcad2a6a1", + "6c2c03fc41af44339e845fe0a825bea2", + "e7b07a0c8fe14726a611212a8a239baf", + "528bac44c27c4d63ba507abf5525d1b8", + "7f32d9d86346453292783c856be99825", + "9f17cef9530f40d5932107809403efa0", + "494400f3134c4fb4907d607beedeaf81", + "1853dce77eb94f879ffdfd78de2a0efa", + "df5d4ecafccf45a8a1584664a6f723aa", + "a165638ba18a48deb8db106a1b5b2290", + "813dcc840ffc463593fcc426b5496295", + "33d9a142bbf44fa4a90855e27520bb16", + "bb4dba99a16949f1ae327eccc219b6de", + "5f4cda0c0861409a951291cd6c3366f1", + "f267da4ad02349e79c9ba7a0f325b540", + "644944bb940046adb953b22f11fcd7b4", + "4e3fa16f33204bbfb156114dabb28c04", + "054df2d05c7e46638cb31006d67fd367", + "81b9ad10745a4a06a11cc26377154214", + "f2486513d4b54ddca27ee691e5346a46", + "57dbc355b039461684573b7c7e07af3a", + "f5400cefb2154be9a4a659acce6edd5e", + "febf788f5ad941c5aca7be98645936ad", + "de37cca307d1471f9342302b7f3429a9", + "c3982bc50b6e4e84b0ad10e5ba523e5d", + "2bb82aec434c4d53a36090c9ec6323ae", + "5b3f477e3b1d4e2f8a9e254401c6c954", + "36f2266eae21439ab778a816044a879a", + "fab2133c1dcb4a329ee331686233c4fd", + "76636be73f4d42e39719dfa089c4716b", + "c74262510b754e3b846751a7216c7dbf", + "92c398c9e36542f8b61aa8795facb421", + "cd67a7a4ce9a4774b692039c517a882d", + "280a79e1521a4cc0924f065f2cb5bd87", + "e1733e1b9d49420da2fb5397613cd9f9", + "090c494d7e954fa2be0e9fb91f30d9b8", + "df7b8f33120c4e10990e17418f8941cf", + "6565dc6bca3348de941027ea2573d5cf", + "970c0d55bbde4f92ba30900281abe1b3", + "eebec9aee091453c8bfbe48539a240ea", + "ced1b556ccd94eccbf3edd1aefc7df40", + "deef03a9155c45f78712712998cf660d", + "48e2a8f80d8c418792734c88e797e982", + "fcaa21fb513846ecabf64b88aafe2cd0", + "f95a2f8628584be4b9d35b7cfe125939", + "ab132a8c917e4151ae2891616b79ee07", + "3cc4457279334a2da1fccfbb99b27e9a", + "f22c56f6666a462e833bb92fff69f00b", + "283857c87a674aa2b42c7dcf858679ec", + "9bf91263f8d64ce18aa0275edd09644d", + "3280f3b54f1c40b6ad66fac4d4403031", + "c5358677106d4c2aa30991b11bdb8ea4", + "c7f4de8f49884cd585a95af2aaaa470d", + "7a5c63dfc9d74a218c8269a72439e57a", + "d49b8015f4a2463885ae6266c7af2f65", + "35f968dcb12f49f582bf353605126a81", + "4ad87837299b4f12ae33e54b7a9be457", + "b5c20304a60a40d5b134c896bc903f6f", + "99955dde5d33494684403ac0e8ff119c", + "1dd100c461d949ff9b2fb448a9f321f8", + "7630b6b8ba764c98a3f1a986c6cb91e4", + "d618cacb4426451ab3c32be08f7ad07e" + ] + }, + "id": "B_g5pIkBQ66h", + "outputId": "c18d729f-de1e-4970-eb1a-a590ef2a9658" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the 
 { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 336, "referenced_widgets": [ "91f35d9568ec46459fab4efe6a95e734", "c95044413a9b439fbfcd0632a70944a8", "6c7346ac20744690956df20d3d03680c", "1b4311d56bb04fedaa7b64059401d965", "c98336949ffc4ceda45c2c3dcad2a6a1", "6c2c03fc41af44339e845fe0a825bea2", "e7b07a0c8fe14726a611212a8a239baf", "528bac44c27c4d63ba507abf5525d1b8", "7f32d9d86346453292783c856be99825", "9f17cef9530f40d5932107809403efa0", "494400f3134c4fb4907d607beedeaf81", "1853dce77eb94f879ffdfd78de2a0efa", "df5d4ecafccf45a8a1584664a6f723aa", "a165638ba18a48deb8db106a1b5b2290", "813dcc840ffc463593fcc426b5496295", "33d9a142bbf44fa4a90855e27520bb16", "bb4dba99a16949f1ae327eccc219b6de", "5f4cda0c0861409a951291cd6c3366f1", "f267da4ad02349e79c9ba7a0f325b540", "644944bb940046adb953b22f11fcd7b4", "4e3fa16f33204bbfb156114dabb28c04", "054df2d05c7e46638cb31006d67fd367", "81b9ad10745a4a06a11cc26377154214", "f2486513d4b54ddca27ee691e5346a46", "57dbc355b039461684573b7c7e07af3a", "f5400cefb2154be9a4a659acce6edd5e", "febf788f5ad941c5aca7be98645936ad", "de37cca307d1471f9342302b7f3429a9", "c3982bc50b6e4e84b0ad10e5ba523e5d", "2bb82aec434c4d53a36090c9ec6323ae", "5b3f477e3b1d4e2f8a9e254401c6c954", "36f2266eae21439ab778a816044a879a", "fab2133c1dcb4a329ee331686233c4fd", "76636be73f4d42e39719dfa089c4716b", "c74262510b754e3b846751a7216c7dbf", "92c398c9e36542f8b61aa8795facb421", "cd67a7a4ce9a4774b692039c517a882d", "280a79e1521a4cc0924f065f2cb5bd87", "e1733e1b9d49420da2fb5397613cd9f9", "090c494d7e954fa2be0e9fb91f30d9b8", "df7b8f33120c4e10990e17418f8941cf", "6565dc6bca3348de941027ea2573d5cf", "970c0d55bbde4f92ba30900281abe1b3", "eebec9aee091453c8bfbe48539a240ea", "ced1b556ccd94eccbf3edd1aefc7df40", "deef03a9155c45f78712712998cf660d", "48e2a8f80d8c418792734c88e797e982", "fcaa21fb513846ecabf64b88aafe2cd0", "f95a2f8628584be4b9d35b7cfe125939", "ab132a8c917e4151ae2891616b79ee07", "3cc4457279334a2da1fccfbb99b27e9a", "f22c56f6666a462e833bb92fff69f00b", "283857c87a674aa2b42c7dcf858679ec", "9bf91263f8d64ce18aa0275edd09644d", "3280f3b54f1c40b6ad66fac4d4403031", "c5358677106d4c2aa30991b11bdb8ea4", "c7f4de8f49884cd585a95af2aaaa470d", "7a5c63dfc9d74a218c8269a72439e57a", "d49b8015f4a2463885ae6266c7af2f65", "35f968dcb12f49f582bf353605126a81", "4ad87837299b4f12ae33e54b7a9be457", "b5c20304a60a40d5b134c896bc903f6f", "99955dde5d33494684403ac0e8ff119c", "1dd100c461d949ff9b2fb448a9f321f8", "7630b6b8ba764c98a3f1a986c6cb91e4", "d618cacb4426451ab3c32be08f7ad07e" ] }, "id": "B_g5pIkBQ66h", "outputId": "c18d729f-de1e-4970-eb1a-a590ef2a9658" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "91f35d9568ec46459fab4efe6a95e734", "version_major": 2, "version_minor": 0 }, "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/366 [00:00 float:\n", + "    eval_results = []\n", + "    for idx in tqdm.tqdm(range(len(ds))):\n", + "        query = ds[\"query\"][idx]\n", + "        reference_context = ds[\"context\"][idx]\n", + "        if not reference_context:\n", + "            print(f\"reference_context is None for query: {idx}. \\\n", + "                  Skipping this query. Please check your dataset.\")\n", + "            continue\n", + "        try:\n", + "            rs = search_table(table, reranker, query_type, query, top_k)\n", + "        except Exception as e:\n", + "            print(f'Error with query: {idx} {e}')\n", + "            eval_results.append({\n", + "                'is_hit': False,\n", + "                'retrieved': [],\n", + "                'expected': reference_context,\n", + "                'query': query,\n", + "            })\n", + "            continue\n", + "        retrieved_texts = rs['text'].tolist()[:top_k]\n", + "        expected_text = reference_context[0] if isinstance(reference_context, list) else reference_context\n", + "        is_hit = False\n", + "\n", + "        # HACK: handle newline characters added by the llamaindex doc reader\n", + "        if expected_text in retrieved_texts or expected_text+'\\n' in retrieved_texts:\n", + "            is_hit = True\n", + "        eval_result = {\n", + "            'is_hit': is_hit,\n", + "            'retrieved': retrieved_texts,\n", + "            'expected': expected_text,\n", + "            'query': query,\n", + "        }\n", + "        eval_results.append(eval_result)\n", + "\n", + "    result = pd.DataFrame(eval_results)\n", + "    rate = result['is_hit'].mean()\n", + "    return rate" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iZzAVl2kJ5mV", "outputId": "88ee5b31-cc3c-402d-d414-d10e3c7cc324" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 220/220 [00:09<00:00, 22.46it/s]\n", "100%|██████████| 220/220 [00:00<00:00, 330.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " Vector Search Hit Rate: 0.6409090909090909\n", "FTS Search Hit Rate: 0.5954545454545455\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "tbl.create_fts_index(\"text\", replace=True)\n", "hit_rate_vector = hit_rate(data, tbl, \"vector\")\n", "hit_rate_fts = hit_rate(data, tbl, \"fts\")\n", "print(f\"\\n Vector Search Hit Rate: {hit_rate_vector}\")\n", "print(f\"FTS Search Hit Rate: {hit_rate_fts}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "Efmb9Gi2s9lD" }, "source": [ "## Hybrid Search\n", " \n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ydLNeAr4acYj", "outputId": "34263ad1-1c21-43a5-f592-16e7f7ca8c45" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 220/220 [00:10<00:00, 20.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", " Hybrid Search with LinearCombinationReranker Hit Rate: 0.6454545454545455\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from lancedb.rerankers import LinearCombinationReranker # LanceDB hybrid search uses LinearCombinationReranker by default\n", "\n", "reranker = LinearCombinationReranker(weight=0.7)\n", "hit_rate_hybrid = hit_rate(data, tbl, \"hybrid\", reranker=reranker)\n", "\n", "print(f\"\\n Hybrid Search with LinearCombinationReranker Hit Rate: {hit_rate_hybrid}\")" ] },
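 { "cell_type": "markdown", "metadata": {}, "source": [ "For reference, a single hybrid query with an explicit reranker looks roughly like the sketch below. This is a hedged illustration of what the `search_table` helper presumably does; the example query string is made up, and the `_relevance_score` column name may vary across LanceDB versions.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hedged sketch: run one hybrid query and rerank the combined results.\n", "# Assumes the FTS index created above and the `reranker` defined above.\n", "query = \"What safety measures were used when fine-tuning Llama 2?\"\n", "rs = (\n", "    tbl.search(query, query_type=\"hybrid\")\n", "    .rerank(reranker=reranker)\n", "    .limit(5)\n", "    .to_pandas()\n", ")\n", "rs[[\"text\", \"_relevance_score\"]]" ] },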
 { "cell_type": "markdown", "metadata": { "id": "Wswq157ptjTZ" }, "source": [ "## Trying out different rerankers\n", "\n", "### 1. Cross Encoder Reranker\n", "\n", " \n", "\n", "Bi-Encoders produce a sentence embedding for each input sentence: we pass sentences A and B to the model independently, which results in the sentence embeddings u and v, and these embeddings can then be compared using cosine similarity.\n", "\n", "In contrast, with a Cross-Encoder we pass both sentences simultaneously to the Transformer network. It then produces an output value between 0 and 1 indicating the similarity of the input sentence pair.\n", "\n", "A Cross-Encoder does not produce sentence embeddings, and we cannot pass individual sentences to it; it always scores a pair. A sketch of cross-encoder scoring follows below." ] },
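 { "cell_type": "markdown", "metadata": {}, "source": [ "As a concrete illustration, scoring query-passage pairs with a cross-encoder looks like the sketch below. This is a hedged example: the model name and the sample pairs are assumptions for illustration, not necessarily the exact cross-encoder used in the evaluation that follows.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sentence_transformers import CrossEncoder\n", "\n", "# Hypothetical model choice for illustration\n", "ce = CrossEncoder(\"cross-encoder/ms-marco-MiniLM-L-6-v2\")\n", "\n", "query = \"What is hybrid search?\"\n", "passages = [\n", "    \"Hybrid search combines vector search with keyword (BM25) retrieval.\",\n", "    \"Llama 2 is a family of pretrained and fine-tuned LLMs.\",\n", "]\n", "# Each (query, passage) pair is scored jointly; a higher score means more relevant\n", "scores = ce.predict([(query, p) for p in passages])\n", "print(scores)" ] },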
"8aa85ff7d417493ea7bf43e347fca68d" + ] + }, + "id": "dd0jh4gNtm41", + "outputId": "731f7c32-838d-4e18-c080-83bece99f506" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r 0%| | 0/220 [00:00\n", + "\n", + " \n", + " 2. **Social media personalised timeline**\n", + " \n", + "\n", + " 3. **Recommend blogs, videos, etc. via push notifications**\n", + "\n", + " \"YouTube now gives notifications for \"recommended\", non-subscribed channels\" - https://www.reddit.com/r/assholedesign/comments/807zpe/youtube_now_gives_notifications_for_recommended/\n", + " \n", + " \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}