[nodejs] Added completed youtube transcript example / docs (#156)

2026-07-07 13:00:40 +00:00 · 2023-06-06 16:26:21 -07:00
parent 3798f56a9b
commit 131b2dc57b
4 changed files with 241 additions and 61 deletions
--- a/node/examples/js-youtube-transcripts/index.js
+++ b/node/examples/js-youtube-transcripts/index.js
@@ -0,0 +1,122 @@
+// Copyright 2023 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+'use strict'
+
+const lancedb = require('vectordb')
+const fs = require('fs/promises')
+const readline = require('readline/promises')
+const { stdin: input, stdout: output } = require('process')
+const { Configuration, OpenAIApi } = require('openai')
+
+// Path to the JSON-lines transcript sample that createEmbeddingsTable reads.
+// Download file from XYZ (TODO: replace the "XYZ" placeholder with the real download location)
+const INPUT_FILE_NAME = 'data/youtube-transcriptions_sample.jsonl';
+
+(async () => {
+  // You need to provide an OpenAI API key, here we read it from the OPENAI_API_KEY environment variable
+  const apiKey = process.env.OPENAI_API_KEY
+  // The embedding function will create embeddings for the 'context' column
+  const embedFunction = new lancedb.OpenAIEmbeddingFunction('context', apiKey)
+
+  // Connects to LanceDB
+  const db = await lancedb.connect('data/youtube-lancedb')
+
+  // Open the vectors table or create one if it does not exist
+  let tbl
+  if ((await db.tableNames()).includes('vectors')) {
+    tbl = await db.openTable('vectors', embedFunction)
+  } else {
+    tbl = await createEmbeddingsTable(db, embedFunction)
+  }
+
+  // Use OpenAI Completion API to generate and answer based on the context that LanceDB provides
+  const configuration = new Configuration({ apiKey })
+  const openai = new OpenAIApi(configuration)
+  const rl = readline.createInterface({ input, output })
+  try {
+    while (true) {
+      const query = await rl.question('Prompt: ')
+      const results = await tbl
+        .search(query)
+        .select(['title', 'text', 'context'])
+        .limit(3)
+        .execute()
+
+      // console.table(results)
+
+      const response = await openai.createCompletion({
+        model: 'text-davinci-003',
+        prompt: createPrompt(query, results),
+        max_tokens: 400,
+        temperature: 0,
+        top_p: 1,
+        frequency_penalty: 0,
+        presence_penalty: 0
+      })
+      console.log(response.data.choices[0].text)
+    }
+  } catch (err) {
+    console.log('Error: ', err)
+  } finally {
+    rl.close()
+  }
+  process.exit(1)
+})()
+
+async function createEmbeddingsTable (db, embedFunction) {
+  console.log(`Creating embeddings from ${INPUT_FILE_NAME}`)
+  // read the input file into a JSON array, skipping empty lines
+  const lines = (await fs.readFile(INPUT_FILE_NAME, 'utf-8'))
+    .toString()
+    .split('\n')
+    .filter(line => line.length > 0)
+    .map(line => JSON.parse(line))
+
+  const data = contextualize(lines, 20, 'video_id')
+  return await db.createTable('vectors', data, embedFunction)
+}
+
+// Each transcript has a small text column, we include previous transcripts in order to
+// have more context information when creating embeddings
+function contextualize (rows, contextSize, groupColumn) {
+  const grouped = []
+  rows.forEach(row => {
+    if (!grouped[row[groupColumn]]) {
+      grouped[row[groupColumn]] = []
+    }
+    grouped[row[groupColumn]].push(row)
+  })
+
+  const data = []
+  Object.keys(grouped).forEach(key => {
+    for (let i = 0; i < grouped[key].length; i++) {
+      const start = i - contextSize > 0 ? i - contextSize : 0
+      grouped[key][i].context = grouped[key].slice(start, i + 1).map(r => r.text).join(' ')
+    }
+    data.push(...grouped[key])
+  })
+  return data
+}
+
+// Creates a prompt by aggregating all relevant contexts
+function createPrompt (query, context) {
+  let prompt =
+      'Answer the question based on the context below.\n\n' +
+      'Context:\n'
+
+  // need to make sure our prompt is not larger than max size
+  prompt = prompt + context.map(c => c.context).join('\n\n---\n\n').substring(0, 3750)
+  prompt = prompt + `\n\nQuestion: ${query}\nAnswer:`
+  return prompt
+}
--- a/node/examples/js-youtube-transcripts/package.json
+++ b/node/examples/js-youtube-transcripts/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "vectordb-example-js-openai",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "Lance Devs",
+  "license": "Apache-2.0",
+  "dependencies": {
+    "vectordb": "file:../..",
+    "openai": "^3.2.1"
+  }
+}