From d811b89de2e563e7fa27f111b0972fbeedbc37a3 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sat, 27 Jan 2024 22:52:37 -0800 Subject: [PATCH] doc: use code snippet for typescript examples (#880) The typescript code is in a fully function file, that will be run via the CI. --- .github/workflows/docs_test.yml | 42 +++++----- docs/README.md | 9 +++ docs/mkdocs.yml | 3 +- docs/package-lock.json | 132 +++++++++++++++++++++++++++++++ docs/package.json | 20 +++++ docs/src/ann_indexes.md | 4 +- docs/src/basic.md | 35 +++----- docs/src/basic.rs | 1 + docs/src/basic_legacy.ts | 53 +++++++++++++ docs/test/md_testing.js | 12 ++- docs/tsconfig.json | 17 ++++ rust/vectordb/examples/simple.rs | 2 + 12 files changed, 281 insertions(+), 49 deletions(-) create mode 100644 docs/package-lock.json create mode 100644 docs/package.json create mode 120000 docs/src/basic.rs create mode 100644 docs/src/basic_legacy.ts create mode 100644 docs/tsconfig.json diff --git a/.github/workflows/docs_test.yml b/.github/workflows/docs_test.yml index ff513034..22cb6007 100644 --- a/.github/workflows/docs_test.yml +++ b/.github/workflows/docs_test.yml @@ -18,24 +18,20 @@ on: env: # Disable full debug symbol generation to speed up CI build and keep memory down # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" + RUSTFLAGS: "-C debuginfo=1 -C target-cpu=native -C target-feature=+f16c,+avx2,+fma" RUST_BACKTRACE: "1" jobs: test-python: name: Test doc python code - runs-on: ${{ matrix.os }} - strategy: - matrix: - python-minor-version: [ "11" ] - os: ["ubuntu-22.04"] + runs-on: "ubuntu-latest" steps: - name: Checkout uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.${{ matrix.python-minor-version }} + python-version: 3.11 cache: "pip" cache-dependency-path: "docs/test/requirements.txt" - name: Build Python @@ -52,11 +48,7 @@ jobs: for d in *; do cd "$d"; echo "$d".py; python "$d".py; cd ..; done test-node: name: Test doc nodejs code - runs-on: ${{ matrix.os }} - strategy: - matrix: - node-version: [ "18" ] - os: ["ubuntu-22.04"] + runs-on: "ubuntu-latest" steps: - name: Checkout uses: actions/checkout@v4 @@ -64,22 +56,32 @@ jobs: fetch-depth: 0 lfs: true - name: Set up Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: - node-version: ${{ matrix.node-version }} + node-version: 20 - name: Install dependecies needed for ubuntu - if: ${{ matrix.os == 'ubuntu-22.04' }} run: | sudo apt install -y protobuf-compiler libssl-dev - - name: Install node dependencies - run: | - cd docs/test - npm install - name: Rust cache uses: swatinem/rust-cache@v2 + - name: Install node dependencies + run: | + cd node + npm ci + npm run build + cd ../docs + npm install + - name: Run doc test + run: | + cd docs + npm t + - name: Install dependencies for generated code + run: | + cd docs/test + npm install - name: Install LanceDB run: | - cd docs/test/node_modules/vectordb + cd docs/test/node_modules/vectordb npm ci npm run build-release npm run tsc diff --git a/docs/README.md b/docs/README.md index 07306fe4..8a33bd25 100644 --- a/docs/README.md +++ b/docs/README.md @@ -33,3 +33,12 @@ You can run a local server to test the docs prior to deployment by navigating to cd docs mkdocs serve ``` + +### Run doctest for typescript example + +```bash +cd lancedb/docs +npm i +npm run build +npm run all +``` diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 65aba135..fe6eecea 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -67,7 +67,8 @@ markdown_extensions: line_spans: __span pygments_lang_class: true - pymdownx.inlinehilite -- pymdownx.snippets +- pymdownx.snippets: + dedent_subsections: true - pymdownx.superfences - pymdownx.tabbed: alternate_style: true diff --git a/docs/package-lock.json b/docs/package-lock.json new file mode 100644 index 00000000..ead4a547 --- /dev/null +++ b/docs/package-lock.json @@ -0,0 +1,132 @@ +{ + "name": "lancedb-docs-test", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "lancedb-docs-test", + "version": "1.0.0", + "license": "Apache 2", + "dependencies": { + "apache-arrow": "file:../node/node_modules/apache-arrow", + "vectordb": "file:../node" + }, + "devDependencies": { + "@types/node": "^20.11.8", + "typescript": "^5.3.3" + } + }, + "../node": { + "name": "vectordb", + "version": "0.4.6", + "cpu": [ + "x64", + "arm64" + ], + "license": "Apache-2.0", + "os": [ + "darwin", + "linux", + "win32" + ], + "dependencies": { + "@apache-arrow/ts": "^14.0.2", + "@neon-rs/load": "^0.0.74", + "apache-arrow": "^14.0.2", + "axios": "^1.4.0" + }, + "devDependencies": { + "@neon-rs/cli": "^0.0.160", + "@types/chai": "^4.3.4", + "@types/chai-as-promised": "^7.1.5", + "@types/mocha": "^10.0.1", + "@types/node": "^18.16.2", + "@types/sinon": "^10.0.15", + "@types/temp": "^0.9.1", + "@types/uuid": "^9.0.3", + "@typescript-eslint/eslint-plugin": "^5.59.1", + "cargo-cp-artifact": "^0.1", + "chai": "^4.3.7", + "chai-as-promised": "^7.1.1", + "eslint": "^8.39.0", + "eslint-config-standard-with-typescript": "^34.0.1", + "eslint-plugin-import": "^2.26.0", + "eslint-plugin-n": "^15.7.0", + "eslint-plugin-promise": "^6.1.1", + "mocha": "^10.2.0", + "openai": "^4.24.1", + "sinon": "^15.1.0", + "temp": "^0.9.4", + "ts-node": "^10.9.1", + "ts-node-dev": "^2.0.0", + "typedoc": "^0.24.7", + "typedoc-plugin-markdown": "^3.15.3", + "typescript": "*", + "uuid": "^9.0.0" + }, + "optionalDependencies": { + "@lancedb/vectordb-darwin-arm64": "0.4.6", + "@lancedb/vectordb-darwin-x64": "0.4.6", + "@lancedb/vectordb-linux-arm64-gnu": "0.4.6", + "@lancedb/vectordb-linux-x64-gnu": "0.4.6", + "@lancedb/vectordb-win32-x64-msvc": "0.4.6" + } + }, + "../node/node_modules/apache-arrow": { + "version": "14.0.2", + "license": "Apache-2.0", + "dependencies": { + "@types/command-line-args": "5.2.0", + "@types/command-line-usage": "5.0.2", + "@types/node": "20.3.0", + "@types/pad-left": "2.1.1", + "command-line-args": "5.2.1", + "command-line-usage": "7.0.1", + "flatbuffers": "23.5.26", + "json-bignum": "^0.0.3", + "pad-left": "^2.1.0", + "tslib": "^2.5.3" + }, + "bin": { + "arrow2csv": "bin/arrow2csv.js" + } + }, + "node_modules/@types/node": { + "version": "20.11.8", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.8.tgz", + "integrity": "sha512-i7omyekpPTNdv4Jb/Rgqg0RU8YqLcNsI12quKSDkRXNfx7Wxdm6HhK1awT3xTgEkgxPn3bvnSpiEAc7a7Lpyow==", + "dev": true, + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/apache-arrow": { + "resolved": "../node/node_modules/apache-arrow", + "link": true + }, + "node_modules/typescript": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.3.3.tgz", + "integrity": "sha512-pXWcraxM0uxAS+tN0AG/BF2TyqmHO014Z070UsJ+pFvYuRSq8KH8DmWpnbXe0pEPDHXZV3FcAbJkijJ5oNEnWw==", + "dev": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "dev": true + }, + "node_modules/vectordb": { + "resolved": "../node", + "link": true + } + } +} diff --git a/docs/package.json b/docs/package.json new file mode 100644 index 00000000..792cdc89 --- /dev/null +++ b/docs/package.json @@ -0,0 +1,20 @@ +{ + "name": "lancedb-docs-test", + "version": "1.0.0", + "description": "auto-generated tests from doc", + "author": "dev@lancedb.com", + "license": "Apache 2", + "dependencies": { + "apache-arrow": "file:../node/node_modules/apache-arrow", + "vectordb": "file:../node" + }, + "scripts": { + "build": "tsc -b && cd ../node && npm run build", + "example": "npm run build && node", + "test": "npm run build && node $(ls dist/*.js)" + }, + "devDependencies": { + "@types/node": "^20.11.8", + "typescript": "^5.3.3" + } +} diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 93a2494f..230e5a3e 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -56,7 +56,7 @@ Lance supports `IVF_PQ` index type by default. data.push({vector: Array(1536).fill(i), id: `${i}`, content: "", longId: `${i}`},) } const table = await db.createTable('my_vectors', data) - await table.createIndex({ type: 'ivf_pq', column: 'vector', num_partitions: 256, num_sub_vectors: 96 }) + await table.createIndex({ type: 'ivf_pq', column: 'vector', num_partitions: 16, num_sub_vectors: 48 }) ``` - **metric** (default: "L2"): The distance metric to use. By default it uses euclidean distance "`L2`". @@ -221,4 +221,4 @@ On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows `num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and -more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency. \ No newline at end of file +more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency. diff --git a/docs/src/basic.md b/docs/src/basic.md index 8e108fb8..1d070a27 100644 --- a/docs/src/basic.md +++ b/docs/src/basic.md @@ -54,22 +54,18 @@ === "Javascript" - ```javascript - const lancedb = require("vectordb"); + ```typescript + --8<-- "src/basic_legacy.ts:import" - const uri = "data/sample-lancedb"; - const db = await lancedb.connect(uri); - ``` + --8<-- "src/basic_legacy.ts:open_db" + ``` === "Rust" ```rust - use vectordb::connect; - #[tokio::main] async fn main() -> Result<()> { - let uri = "data/sample-lancedb"; - let db = connect(uri).await?; + --8<-- "src/basic.rs:connect" } ``` @@ -105,11 +101,7 @@ If you need a reminder of the uri, you can call `db.uri()`. === "Javascript" ```javascript - const tbl = await db.createTable( - "myTable", - [{"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, - {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}] - ) + --8<-- "src/basic_legacy.ts:create_table" ``` If the table already exists, LanceDB will raise an error by default. @@ -160,10 +152,7 @@ In this case, you can create an empty table and specify the schema. === "Javascript" ```typescript - import { Schema, Field, FixedSizeList, DataType } from "apache-arrow"; - - schema = new Schema([new new Field("vec", new FixedSizeList(2, new Field("item", new Float32())))]) - tbl = await db.createTable({ name: "empty_table", schema: schema }); + --8<-- "src/basic_legacy.ts:create_empty_table" ``` === "Rust" @@ -269,7 +258,7 @@ Once you've embedded the query, you can find its nearest neighbors using the fol === "Javascript" ```javascript - const query = await tbl.search([100, 100]).limit(2).execute(); + --8<-- "src/basic_legacy.ts:search" ``` === "Rust" @@ -291,13 +280,13 @@ For tables with more than 50K vectors, creating an ANN index is recommended to s === "Python" - ```python + ```py tbl.create_index() ``` === "Javascript" - ```javascript + ```{.typescript .ignore} await tbl.createIndex({}) ``` @@ -324,7 +313,7 @@ This can delete any number of rows that match the filter. === "Javascript" ```javascript - await tbl.delete('item = "fizz"') + --8<-- "src/basic_legacy.ts:delete" ``` === "Rust" @@ -362,7 +351,7 @@ Use the `drop_table()` method on the database to remove a table. === "JavaScript" ```javascript - await db.dropTable('myTable') + --8<-- "src/basic_legacy.ts:drop_table" ``` This permanently removes the table and is not recoverable, unlike deleting rows. diff --git a/docs/src/basic.rs b/docs/src/basic.rs new file mode 120000 index 00000000..8899e302 --- /dev/null +++ b/docs/src/basic.rs @@ -0,0 +1 @@ +../../rust/vectordb/examples/simple.rs \ No newline at end of file diff --git a/docs/src/basic_legacy.ts b/docs/src/basic_legacy.ts new file mode 100644 index 00000000..442fc6ce --- /dev/null +++ b/docs/src/basic_legacy.ts @@ -0,0 +1,53 @@ +// --8<-- [start:import] +import * as lancedb from "vectordb"; +import { Schema, Field, Float32, FixedSizeList, Int32 } from "apache-arrow"; +// --8<-- [end:import] +import * as fs from "fs"; +import { Table as ArrowTable, Utf8 } from "apache-arrow"; + +const example = async () => { + fs.rmSync("data/sample-lancedb", { recursive: true, force: true }); + // --8<-- [start:open_db] + const uri = "data/sample-lancedb"; + const db = await lancedb.connect(uri); + // --8<-- [end:open_db] + + // --8<-- [start:create_table] + const tbl = await db.createTable( + "myTable", + [ + { vector: [3.1, 4.1], item: "foo", price: 10.0 }, + { vector: [5.9, 26.5], item: "bar", price: 20.0 }, + ], + { writeMode: lancedb.WriteMode.Overwrite } + ); + // --8<-- [end:create_table] + + // --8<-- [start:create_empty_table] + const schema = new Schema([ + new Field("id", new Int32()), + new Field("name", new Utf8()), + ]); + const empty_tbl = await db.createTable({ name: "empty_table", schema }); + // --8<-- [end:create_empty_table] + + // --8<-- [start:search] + const query = await tbl.search([100, 100]).limit(2).execute(); + // --8<-- [end:search] + console.log(query); + + // --8<-- [start:delete] + await tbl.delete('item = "fizz"'); + // --8<-- [end:delete] + + // --8<-- [start:drop_table] + await db.dropTable("myTable"); + // --8<-- [end:drop_table] +}; + +async function main() { + await example(); + console.log("Basic example: done"); +} + +main(); diff --git a/docs/test/md_testing.js b/docs/test/md_testing.js index 9e03a857..fae34fb6 100644 --- a/docs/test/md_testing.js +++ b/docs/test/md_testing.js @@ -9,7 +9,10 @@ const excludedGlobs = [ "../src/embedding.md", "../src/examples/*.md", "../src/guides/tables.md", + "../src/guides/storage.md", "../src/embeddings/*.md", + "../src/javascript/**/*.md", + "../src/basic.md", ]; const nodePrefix = "javascript"; @@ -47,8 +50,11 @@ for (const file of files.filter((file) => !excludedFiles.includes(file))) { if (lines.length > 0) { const fileName = path.basename(file, ".md"); const outPath = path.join(nodeFolder, fileName, `${fileName}${nodeFile}`); - console.log(outPath) + console.log(outPath); fs.mkdirSync(path.dirname(outPath), { recursive: true }); - fs.writeFileSync(outPath, asyncPrefix + "\n" + lines.join("\n") + asyncSuffix); + fs.writeFileSync( + outPath, + asyncPrefix + "\n" + lines.join("\n") + asyncSuffix + ); } -} \ No newline at end of file +} diff --git a/docs/tsconfig.json b/docs/tsconfig.json new file mode 100644 index 00000000..23a30f8b --- /dev/null +++ b/docs/tsconfig.json @@ -0,0 +1,17 @@ +{ + "include": [ + "src/*.ts", + ], + "compilerOptions": { + "target": "es2022", + "module": "nodenext", + "declaration": true, + "outDir": "./dist", + "strict": true, + "allowJs": true, + "resolveJsonModule": true, + }, + "exclude": [ + "./dist/*", + ] +} diff --git a/rust/vectordb/examples/simple.rs b/rust/vectordb/examples/simple.rs index 31ca4b59..9f37bb55 100644 --- a/rust/vectordb/examples/simple.rs +++ b/rust/vectordb/examples/simple.rs @@ -24,8 +24,10 @@ use vectordb::{connect, Result, Table, TableRef}; #[tokio::main] async fn main() -> Result<()> { + // --8<-- [start:connect] let uri = "data/sample-lancedb"; let db = connect(uri).await?; + // --8<-- [end:connect] let tbl = create_table(db).await?; create_index(tbl.as_ref()).await?; let batches = search(tbl.as_ref()).await?;