diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index e8b730a42..b703e3893 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -100,6 +100,7 @@ nav: - Quickstart: reranking/index.md - Cohere Reranker: reranking/cohere.md - Linear Combination Reranker: reranking/linear_combination.md + - Reciprocal Rank Fusion Reranker: reranking/rrf.md - Cross Encoder Reranker: reranking/cross_encoder.md - ColBERT Reranker: reranking/colbert.md - Jina Reranker: reranking/jina.md @@ -185,6 +186,7 @@ nav: - Quickstart: reranking/index.md - Cohere Reranker: reranking/cohere.md - Linear Combination Reranker: reranking/linear_combination.md + - Reciprocal Rank Fusion Reranker: reranking/rrf.md - Cross Encoder Reranker: reranking/cross_encoder.md - ColBERT Reranker: reranking/colbert.md - Jina Reranker: reranking/jina.md diff --git a/docs/src/notebooks/lancedb_reranking.ipynb b/docs/src/notebooks/lancedb_reranking.ipynb index 14adf8af3..0c9346ced 100644 --- a/docs/src/notebooks/lancedb_reranking.ipynb +++ b/docs/src/notebooks/lancedb_reranking.ipynb @@ -13,42 +13,33 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6gUUIxGP0n1Z", - "outputId": "96e24cff-abfa-46dd-ada5-28b6c15b4f47" + "outputId": "0319735d-5986-470b-ad7a-3e6a9a4032f6" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.9/20.9 MB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.1/227.1 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.0/40.0 MB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.8/173.8 kB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m25.5/25.5 MB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m73.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.4/177.4 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m16.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.4/12.4 MB\u001b[0m \u001b[31m51.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.7/82.7 kB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m75.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 13.0.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" ] } ], "source": [ - "!pip install lancedb sentence-transformers tantivy pyarrow==13.0.0 cohere -q" + "!pip install lancedb sentence-transformers cohere tantivy pyarrow==13.0.0 -q" ] }, { @@ -113,23 +104,23 @@ "base_uri": "https://localhost:8080/" }, "id": "f_qnH-Dfhi9Z", - "outputId": "9d41e17e-c994-473d-cf17-93568e025252" + "outputId": "1e22e1b1-a821-4ccb-ff30-1b2d6f8b824e" }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "--2024-06-26 13:31:34-- https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv\n", + "--2024-07-24 14:22:47-- https://raw.githubusercontent.com/AyushExel/assets/main/data_qa.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 680439 (664K) [text/plain]\n", "Saving to: ‘data_qa.csv’\n", "\n", - "data_qa.csv 100%[===================>] 664.49K --.-KB/s in 0.009s \n", + "data_qa.csv 100%[===================>] 664.49K --.-KB/s in 0.03s \n", "\n", - "2024-06-26 13:31:34 (71.7 MB/s) - ‘data_qa.csv’ saved [680439/680439]\n", + "2024-07-24 14:22:48 (19.9 MB/s) - ‘data_qa.csv’ saved [680439/680439]\n", "\n" ] } @@ -160,19 +151,57 @@ "height": 580 }, "id": "4Bp9Fdhz7QsM", - "outputId": "4cb6b384-7991-4911-e5fd-7fbd42487916" + "outputId": "fdcbc090-d526-4dcb-98a2-c0d8090f295d" }, "outputs": [ { + "output_type": "execute_result", "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"data\",\n \"rows\": 220,\n \"fields\": [\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 63,\n \"min\": 0,\n \"max\": 219,\n \"num_unique_values\": 220,\n \"samples\": [\n 132,\n 148,\n 93\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 220,\n \"samples\": [\n \"What type of examination did scholars perform on ChatGPT, and when was the resulting scholarly paper published?\",\n \"How do the performance capabilities of the different models compare in evaluating tasks associated with logical reasoning and reading comprehension, specifically noted in tests like LSAT and SAT?\",\n \"What steps are recommended for users to ensure the responsible use of AI models like Llama 2 in projects or commercial applications?\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"context\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 110,\n \"samples\": [\n \"Dialogue Turn Baseline + GAtt\\n2 100% 100%\\n4 10% 100%\\n6 0% 100%\\n20 0% 100%\\nTable30: GAttresults. Llama 2-Chat withGAttisabletorefertoattributes100%ofthetime,forupto20\\nturns from our human evaluation. We limited the evaluated attributes to public figures and hobbies.\\nTheattentionnowspansbeyond20turns. Wetestedthemodelabilitytorememberthesystemarguments\\ntroughahumanevaluation. Thearguments(e.g. hobbies,persona)aredefinedduringthefirstmessage,and\\nthen from turn 2 to 20. We explicitly asked the model to refer to them (e.g. \\u201cWhat is your favorite hobby?\\u201d,\\n\\u201cWhatisyourname?\\u201d),tomeasurethemulti-turnmemoryabilityof Llama 2-Chat . Wereporttheresults\\ninTable30. EquippedwithGAtt, Llama 2-Chat maintains100%accuracy,alwaysreferringtothedefined\\nattribute,andso,upto20turns(wedidnotextendthehumanevaluationmore,andalltheexampleshad\\nlessthan4048tokensintotalovertheturns). Asacomparison, Llama 2-Chat withoutGAttcannotanymore\\nrefer to the attributes after only few turns: from 100% at turn t+1, to 10% at turn t+3 and then 0%.\\nGAttZero-shotGeneralisation. Wetriedatinferencetimetosetconstrainnotpresentinthetrainingof\\nGAtt. For instance, \\u201canswer in one sentence only\\u201d, for which the model remained consistent, as illustrated in\\nFigure 28.\\nWe applied first GAtt to Llama 1 , which was pretrained with a context length of 2048 tokens and then\\nfine-tuned with 4096 max length. We tested if GAtt works beyond 2048 tokens, and the model arguably\\nmanaged to understand attributes beyond this window. This promising result indicates that GAtt could be\\nadapted as an efficient technique for long context attention.\\nA.3.6 How Far Can Model-Based Evaluation Go?\\nTo measure the robustness of our reward model, we collected a test set of prompts for both helpfulness and\\nsafety,andaskedannotatorstojudgequalityoftheanswersbasedona7pointLikert-scale(thehigherthe\\nbetter)usingtriplereviews. AsillustratedinFigure29(inAppendix),weobservethatourrewardmodels\\noverallarewellcalibratedwithhumanpreference. Notethatthisenablesustousetherewardasapoint-wise\\nmetric, despite being trained with a Pairwise Ranking Loss.\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0Reward Model ScoreNo Margin\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0\\nMargin Small\\n0.0% 2.0% 4.0% 6.0% 8.0%\\nDensity0.00.20.40.60.81.0\\nMargin Large\\nFigure 27: Reward model score distribution shift caused by incorporating preference rating based margin\\ninrankingloss. Withthemarginterm, weobserveabinary splitpatterninrewarddistribution, especially\\nwith a larger margin.\\n54\",\n \"Model Size CodeCommonsense\\nReasoningWorld\\nKnowledgeReading\\nComprehensionMath MMLU BBH AGI Eval\\nMPT7B 20.5 57.4 41.0 57.5 4.9 26.8 31.0 23.5\\n30B 28.9 64.9 50.0 64.7 9.1 46.9 38.0 33.8\\nFalcon7B 5.6 56.1 42.8 36.0 4.6 26.2 28.0 21.2\\n40B 15.2 69.2 56.7 65.7 12.6 55.4 37.1 37.0\\nLlama 17B 14.1 60.8 46.2 58.5 6.95 35.1 30.3 23.9\\n13B 18.9 66.1 52.6 62.3 10.9 46.9 37.0 33.9\\n33B 26.0 70.0 58.4 67.6 21.4 57.8 39.8 41.7\\n65B 30.7 70.7 60.5 68.6 30.8 63.4 43.5 47.6\\nLlama 27B 16.8 63.9 48.9 61.3 14.6 45.3 32.6 29.3\\n13B 24.5 66.9 55.4 65.8 28.7 54.8 39.4 39.1\\n34B 27.8 69.9 58.7 68.0 24.2 62.6 44.1 43.4\\n70B37.5 71.9 63.6 69.4 35.2 68.9 51.2 54.2\\nTable3: Overallperformanceongroupedacademicbenchmarkscomparedtoopen-sourcebasemodels.\\n\\u2022Popular Aggregated Benchmarks . We report the overall results for MMLU (5 shot) (Hendrycks\\net al., 2020), Big Bench Hard (BBH) (3 shot) (Suzgun et al., 2022), and AGI Eval (3\\u20135 shot) (Zhong\\net al., 2023). For AGI Eval, we only evaluate on the English tasks and report the average.\\nAs shown in Table 3, Llama 2 models outperform Llama 1 models. In particular, Llama 2 70B improves the\\nresultsonMMLUandBBHby \\u22485and\\u22488points,respectively,comparedto Llama 1 65B.Llama 2 7Band30B\\nmodelsoutperformMPTmodelsofthecorrespondingsizeonallcategoriesbesidescodebenchmarks. Forthe\\nFalcon models, Llama 2 7B and 34B outperform Falcon 7B and 40B models on all categories of benchmarks.\\nAdditionally, Llama 2 70B model outperforms all open-source models.\\nIn addition to open-source models, we also compare Llama 2 70B results to closed-source models. As shown\\nin Table 4, Llama 2 70B is close to GPT-3.5 (OpenAI, 2023) on MMLU and GSM8K, but there is a significant\\ngaponcodingbenchmarks. Llama 2 70BresultsareonparorbetterthanPaLM(540B)(Chowdheryetal.,\\n2022)onalmostallbenchmarks. Thereisstillalargegapinperformancebetween Llama 2 70BandGPT-4\\nand PaLM-2-L.\\nWe also analysed the potential data contamination and share the details in Section A.6.\",\n \"Figure 1: Helpfulness human evaluation results for Llama\\n2-Chatcomparedtootheropen-sourceandclosed-source\\nmodels. Human raters compared model generations on ~4k\\npromptsconsistingofbothsingleandmulti-turnprompts.\\nThe95%confidenceintervalsforthisevaluationarebetween\\n1%and2%. MoredetailsinSection3.4.2. Whilereviewing\\nthese results, it is important to note that human evaluations\\ncanbenoisyduetolimitationsofthepromptset,subjectivity\\nof the review guidelines, subjectivity of individual raters,\\nand the inherent difficulty of comparing generations.\\nFigure 2: Win-rate % for helpfulness and\\nsafety between commercial-licensed base-\\nlines and Llama 2-Chat , according to GPT-\\n4. Tocomplementthehumanevaluation,we\\nused a more capable model, not subject to\\nourownguidance. Greenareaindicatesour\\nmodelisbetteraccordingtoGPT-4. Toremove\\nties, we used win/ (win+loss). The orders in\\nwhichthemodelresponsesarepresentedto\\nGPT-4arerandomlyswappedtoalleviatebias.\\n1 Introduction\\nLarge Language Models (LLMs) have shown great promise as highly capable AI assistants that excel in\\ncomplex reasoning tasks requiring expert knowledge across a wide range of fields, including in specialized\\ndomains such as programming and creative writing. They enable interaction with humans through intuitive\\nchat interfaces, which has led to rapid and widespread adoption among the general public.\\nThecapabilitiesofLLMsareremarkableconsideringtheseeminglystraightforwardnatureofthetraining\\nmethodology. Auto-regressivetransformersarepretrainedonanextensivecorpusofself-superviseddata,\\nfollowed by alignment with human preferences via techniques such as Reinforcement Learning with Human\\nFeedback(RLHF).Althoughthetrainingmethodologyissimple,highcomputationalrequirementshave\\nlimited the development of LLMs to a few players. There have been public releases of pretrained LLMs\\n(such as BLOOM (Scao et al., 2022), LLaMa-1 (Touvron et al., 2023), and Falcon (Penedo et al., 2023)) that\\nmatch the performance of closed pretrained competitors like GPT-3 (Brown et al., 2020) and Chinchilla\\n(Hoffmann et al., 2022), but none of these models are suitable substitutes for closed \\u201cproduct\\u201d LLMs, such\\nasChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyfine-tunedtoalignwithhuman\\npreferences, which greatly enhances their usability and safety. This step can require significant costs in\\ncomputeandhumanannotation,andisoftennottransparentoreasilyreproducible,limitingprogresswithin\\nthe community to advance AI alignment research.\\nIn this work, we develop and release Llama 2, a family of pretrained and fine-tuned LLMs, Llama 2 and\\nLlama 2-Chat , at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested,\\nLlama 2-Chat models generally perform better than existing open-source models. They also appear to\\nbe on par with some of the closed-source models, at least on the human evaluations we performed (see\\nFigures1and3). Wehavetakenmeasurestoincreasethesafetyofthesemodels,usingsafety-specificdata\\nannotation and tuning, as well as conducting red-teaming and employing iterative evaluations. Additionally,\\nthispapercontributesathoroughdescriptionofourfine-tuningmethodologyandapproachtoimproving\\nLLM safety. We hope that this openness will enable the community to reproduce fine-tuned LLMs and\\ncontinue to improve the safety of those models, paving the way for more responsible development of LLMs.\\nWealsosharenovelobservationswemadeduringthedevelopmentof Llama 2 andLlama 2-Chat ,suchas\\nthe emergence of tool usage and temporal organization of knowledge.\\n3\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 220,\n \"samples\": [\n \"Scholars performed a diagnostic analysis to investigate the AI ethics associated with ChatGPT. Their findings were compiled into a research paper that became accessible as a preprint on arXiv in January 2023.\",\n \"The MPT 30B model demonstrates considerable proficiency in logical reasoning and reading comprehension tasks, scoring highly on LSAT-LR, LSAT-RC, and SAT-en tests compared to its peers, such as Falcon 40B and Llama 17B. This is indicative of its advanced analytical and comprehension abilities. Conversely, while Falcon 40B shows strengths in LSAT-LR with a score second only to MPT 30B, it trails in SAT-en performance. This variability underscores the diverse capabilities of models based on their structural design and training paradigms.\",\n \"Users intending to deploy models like Llama 2 are advised to strictly adhere to guidelines laid out in the Responsible Use Guide. This includes employing enhanced safety measures at both the input and output stages of model interaction, as well as carefully tuning the model according to specific use-case requirements to prevent any potential misuse. Additionally, users must comply with the terms set in the Acceptable Use Policy, ensuring their applications do not contravene applicable laws, regulations, and ethical standards. Leveraging provided code examples can further assist developers in replicating the necessary safety protocols and maintaining ethical integrity in their applications.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe", - "variable_name": "data" - }, + "text/plain": [ + " Unnamed: 0 query \\\n", + "0 0 How does the performance of Llama 2-Chat model... \n", + "1 1 What benefits does the enhancement and safety ... \n", + "2 2 How does one ensure the reliability and robust... \n", + "3 3 What methodologies are employed to align machi... \n", + "4 4 What are some of the primary insights gained f... \n", + ".. ... ... \n", + "215 215 How are the terms 'clean', 'not clean', 'dirty... \n", + "216 216 How does the size of the model influence the a... \n", + "217 217 What impact does the model contamination have ... \n", + "218 218 What are the different sizes and types availab... \n", + "219 219 Could you discuss the sustainability measures ... \n", + "\n", + " context \\\n", + "0 Llama 2 : Open Foundation and Fine-Tuned Chat ... \n", + "1 Llama 2 : Open Foundation and Fine-Tuned Chat ... \n", + "2 Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2... \n", + "3 Contents\\n1 Introduction 3\\n2 Pretraining 5\\n2... \n", + "4 . . . . . . . . 23\\n4.3 Red Teaming . . . . . ... \n", + ".. ... \n", + "215 Giventhe\\nembarrassinglyparallelnatureofthetas... \n", + "216 Dataset Model Subset Type Avg. Contam. % n ¯X ... \n", + "217 Dataset Model Subset Type Avg. Contam. % n ¯X ... \n", + "218 A.7 Model Card\\nTable 52 presents a model card... \n", + "219 A.7 Model Card\\nTable 52 presents a model card... \n", + "\n", + " answer \n", + "0 Llama 2-Chat models have shown to exceed the p... \n", + "1 The safety and enhancement measures implemente... \n", + "2 In the initial steps of model development, the... \n", + "3 Machine learning models can be aligned with de... \n", + "4 The key insights gained from evaluating platfo... \n", + ".. ... \n", + "215 In the discussed dataset analysis, samples are... \n", + "216 The size of the model significantly influences... \n", + "217 Model contamination affects various contaminat... \n", + "218 Llama 2 is available in three distinct paramet... \n", + "219 Throughout the training of Llama 2, which invo... \n", + "\n", + "[220 rows x 4 columns]" + ], "text/html": [ "\n", - "