Compare commits

...

15 Commits

Author SHA1 Message Date
David Myriel
9e278fc5a6 fix small details 2025-05-05 23:03:17 +02:00
David Myriel
09fed1f286 add quickstart doc 2025-05-05 22:02:11 +02:00
Will Jones
cee2b5ea42 chore: upgrade pyarrow pin (#2192)
Closes #2191


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **Chores**
- Updated the required version of the pyarrow package to version 16 or
higher.
- Adjusted automated testing workflows to install pyarrow version 16 for
compatibility checks.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-05-05 11:23:13 -07:00
Alex Pilon
f315f9665a feat: implement bindings to return merge stats (#2367)
Based on this comment:
https://github.com/lancedb/lancedb/issues/2228#issuecomment-2730463075
and https://github.com/lancedb/lance/pull/2357

Here is my attempt at implementing bindings for returning merge stats
from a `merge_insert.execute` call for lancedb.

Note: I have almost no idea what I am doing in Rust but tried to follow
existing code patterns and pay attention to compiler hints.
- The change in the nodejs binding appeared to be necessary to get
compilation to work; presumably this could actually work properly by
returning some kind of NAPI JS object of the stats data?
- I am unsure of what to do with the remote/table.rs changes -
they were necessary for compilation to work; I assume this is related to LanceDB
cloud, but I am unsure of the best way to handle that at this point.

Proof of function:

```python
import pandas as pd
import lancedb


db = lancedb.connect("/tmp/test.db")

test_data = pd.DataFrame(
    {
        "title": ["Hello", "Test Document", "Example", "Data Sample", "Last One"],
        "id": [1, 2, 3, 4, 5],
        "content": [
            "World",
            "This is a test",
            "Another example",
            "More test data",
            "Final entry",
        ],
    }
)

table = db.create_table("documents", data=test_data, exist_ok=True, mode="overwrite")

update_data = pd.DataFrame(
    {
        "title": [
            "Hello, World",
            "Test Document, it's good",
            "Example",
            "Data Sample",
            "Last One",
            "New One",
        ],
        "id": [1, 2, 3, 4, 5, 6],
        "content": [
            "World",
            "This is a test",
            "Another example",
            "More test data",
            "Final entry",
            "New content",
        ],
    }
)

stats = (
    table.merge_insert(on="id")
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute(update_data)
)

print(stats)
```

returns

```
{'num_inserted_rows': 1, 'num_updated_rows': 5, 'num_deleted_rows': 0}
```

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Merge-insert operations now return detailed statistics, including
counts of inserted, updated, and deleted rows.
- **Bug Fixes**
- Tests updated to validate returned merge-insert statistics for
accuracy.
- **Documentation**
- Method documentation improved to reflect new return values and clarify
merge operation results.
- Added documentation for the new `MergeStats` interface detailing
operation statistics.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2025-05-01 10:00:20 -07:00
Andrew C. Oliver
5deb26bc8b fix: prevent embedded objects from returning null in all of their fields (#2355)
For example, given `metadata{filename=xyz}`, the `filename` field would be
present structurally, but its value was ALWAYS null.

I didn't include this as a file, but it may be useful for understanding
the problem for people searching on this issue, so I'm including it here
as documentation. Before this patch, any field nested more than one level
deep is accepted but returns null values for its subfields when queried.

```js
const lancedb = require('@lancedb/lancedb');

// Debug logger
function debug(message, data) {
  console.log(`[TEST] ${message}`, data !== undefined ? data : '');
}

// Log when our unwrapArrowObject is called
const kParent = Symbol.for("parent");
const kRowIndex = Symbol.for("rowIndex");

// Override console.log for our test
const originalConsoleLog = console.log;
console.log = function() {
  // Filter out noisy logs
  if (arguments[0] && typeof arguments[0] === 'string' && arguments[0].includes('[INFO] [LanceDB]')) {
    originalConsoleLog.apply(console, arguments);
  }
  originalConsoleLog.apply(console, arguments);
};

async function main() {
  debug('Starting test...');
  
  // Connect to the database
  debug('Connecting to database...');
  const db = await lancedb.connect('./.lancedb');
  
  // Try to open an existing table, or create a new one if it doesn't exist
  let table;
  try {
    table = await db.openTable('test_nested_fields');
    debug('Opened existing table');
  } catch (e) {
    debug('Creating new table...');
    
    // Create test data with nested metadata structure
    const data = [
      {
        id: 'test1',
        vector: [1, 2, 3],
        metadata: {
          filePath: "/path/to/file1.ts",
          startLine: 10,
          endLine: 20,
          text: "function test() { return true; }"
        }
      },
      {
        id: 'test2',
        vector: [4, 5, 6],
        metadata: {
          filePath: "/path/to/file2.ts",
          startLine: 30,
          endLine: 40,
          text: "function test2() { return false; }"
        }
      }
    ];
    
    debug('Data to be inserted:', JSON.stringify(data, null, 2));
    
    // Create the table
    table = await db.createTable('test_nested_fields', data);
    debug('Table created successfully');
  }
  
  // Query the table and get results
  debug('Querying table...');
  const results = await table.search([1, 2, 3]).limit(10).toArray();
  
  // Log the results
  debug('Number of results:', results.length);
  
  if (results.length > 0) {
    const firstResult = results[0];
    debug('First result properties:', Object.keys(firstResult));
    
    // Check if metadata is accessible and what properties it has
    if (firstResult.metadata) {
      debug('Metadata properties:', Object.keys(firstResult.metadata));
      debug('Metadata filePath:', firstResult.metadata.filePath);
      debug('Metadata startLine:', firstResult.metadata.startLine);
      
      // Destructure to see if that helps
      const { filePath, startLine, endLine, text } = firstResult.metadata;
      debug('Destructured values:', { filePath, startLine, endLine, text });
      
      // Check if it's a proxy object
      debug('Result is proxy?', Object.getPrototypeOf(firstResult) === Object.prototype ? false : true);
      debug('Metadata is proxy?', Object.getPrototypeOf(firstResult.metadata) === Object.prototype ? false : true);
    } else {
      debug('Metadata is not accessible!');
    }
  }
  
  // Close the database
  await db.close();
}

main().catch(e => {
  console.error('Error:', e);
}); 
```

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Bug Fixes**
- Improved handling of nested struct fields to ensure accurate
preservation of values during serialization and deserialization.
- Enhanced robustness when accessing nested object properties, reducing
errors with missing or null values.

- **Tests**
- Added tests to verify correct handling of nested struct fields through
serialization and deserialization.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2025-05-01 09:38:55 -07:00
Lance Release
3cc670ac38 Updating package-lock.json 2025-04-29 23:21:19 +00:00
Lance Release
4ade3e31e2 Updating package-lock.json 2025-04-29 22:19:46 +00:00
Lance Release
a222d2cd91 Updating package-lock.json 2025-04-29 22:19:30 +00:00
Lance Release
508e621f3d Bump version: 0.19.1-beta.0 → 0.19.1-beta.1 2025-04-29 22:19:14 +00:00
Lance Release
a1a0472f3f Bump version: 0.22.1-beta.0 → 0.22.1-beta.1 2025-04-29 22:18:53 +00:00
Wyatt Alt
3425a6d339 feat: upgrade lance to v0.27.0-beta.2 (#2364)
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Chores**
- Updated dependencies for related components to use the latest version
from a specific repository source. No changes to features or public
functionality.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-04-29 14:59:56 -07:00
Ryan Green
af54e0ce06 feat: add table stats API (#2363)
* Add a new "table stats" API to expose basic table and fragment
statistics with local and remote table implementations

### Questions
* This is using `calculate_data_stats` to determine total bytes in the
table. This seems like a potentially expensive operation - are there any
concerns about performance for large datasets?

### Notes
* bytes_on_disk seems to be stored at the column level but there does
not seem to be a way to easily calculate total bytes per fragment. This
may need to be added in lance before we can support fragment size
(bytes) statistics.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
- Added a method to retrieve comprehensive table statistics, including
total rows, index counts, storage size, and detailed fragment size
metrics such as minimum, maximum, mean, and percentiles.
- Enabled fetching of table statistics from remote sources through
asynchronous requests.
- Extended table interfaces across Python, Rust, and Node.js to support
synchronous and asynchronous retrieval of table statistics.
- **Tests**
- Introduced tests to verify the accuracy of the new table statistics
feature for both populated and empty tables.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-04-29 15:19:08 -02:30
Lance Release
089905fe8f Updating package-lock.json 2025-04-28 19:13:36 +00:00
Lance Release
554939e5d2 Updating package-lock.json 2025-04-28 17:20:58 +00:00
Lance Release
7a13814922 Updating package-lock.json 2025-04-28 17:20:42 +00:00
51 changed files with 1211 additions and 210 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.19.1-beta.0"
current_version = "0.19.1-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -228,6 +228,7 @@ jobs:
- name: Install lancedb
run: |
pip install "pydantic<2"
pip install pyarrow==16
pip install --extra-index-url https://pypi.fury.io/lancedb/ -e .[tests]
pip install tantivy
- name: Run tests

215
Cargo.lock generated
View File

@@ -463,7 +463,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -474,7 +474,7 @@ checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -602,9 +602,9 @@ dependencies = [
[[package]]
name = "aws-sdk-bedrockruntime"
version = "1.83.0"
version = "1.85.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b82a56e1a0c4b145031c3a99e68127eec0a4206ad34a5653ddf04afc18053376"
checksum = "6f6c003cd82739447a18d7616468b047341c125efff11fdafc77a5e777a861c9"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -628,9 +628,9 @@ dependencies = [
[[package]]
name = "aws-sdk-dynamodb"
version = "1.72.0"
version = "1.72.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "412cd587b03bacb2f7b94a5446cc77dee49a8fa848e636f9545df3aadbbfaf8b"
checksum = "b14d5b5d6849d1caa7b404ea57cbe25ed8ba25c3c7d47f45bcbd5b51e098ceac"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -1117,7 +1117,7 @@ dependencies = [
"regex",
"rustc-hash 1.1.0",
"shlex",
"syn 2.0.100",
"syn 2.0.101",
"which",
]
@@ -1229,9 +1229,9 @@ checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
[[package]]
name = "bytemuck"
version = "1.22.0"
version = "1.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540"
checksum = "9134a6ef01ce4b366b50689c94f82c14bc72bc5d0386829828a2e2752ef7958c"
dependencies = [
"bytemuck_derive",
]
@@ -1244,7 +1244,7 @@ checksum = "3fa76293b4f7bb636ab88fd78228235b5248b4d05cc589aed610f954af5d7c7a"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -1753,7 +1753,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501"
dependencies = [
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -1783,7 +1783,7 @@ dependencies = [
"proc-macro2",
"quote",
"strsim",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -1794,7 +1794,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead"
dependencies = [
"darling_core",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2152,7 +2152,7 @@ checksum = "4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7"
dependencies = [
"datafusion-expr",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2321,7 +2321,7 @@ checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2342,7 +2342,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2352,7 +2352,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
dependencies = [
"derive_builder_core",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2416,7 +2416,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2509,7 +2509,7 @@ dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2521,7 +2521,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -2737,9 +2737,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbbb5d86fdf9f56c54cf7ec48f4471c0e901af458ee9821677dc8ba0c38bc0be"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"rand 0.8.5",
]
@@ -2815,7 +2814,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -3517,7 +3516,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -3681,7 +3680,7 @@ checksum = "199b7932d97e325aff3a7030e141eafe7f2c6268e1d1b24859b753a627f45254"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -3728,9 +3727,8 @@ dependencies = [
[[package]]
name = "lance"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "285c3f98a7182e2f35eabc9be67927bb9167b236c6d9c45d894928cbe330067c"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow",
"arrow-arith",
@@ -3792,9 +3790,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f877f217d6f93b24b54a2390a988a32f99d6608fe8af7766d93bd515e77dd2a"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -3811,9 +3808,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52fa4661c532db5b53102e2b394c9735bf6e707c337dfa5b9d98baba5c0cba13"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -3849,9 +3845,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3efa610cc1168aaf96734f2f7911fb874609304716aab3318a86193da883f700"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow",
"arrow-array",
@@ -3880,9 +3875,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c61affd39495caa923f6a49a7cb0a9f36fea2d7231a039e557f908e0b3b59cf"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow",
"arrow-array",
@@ -3897,9 +3891,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "051d65ab02790552c8790fe22915fbdd1629f3e1fa2a6ef69946e77c9d2b6f8e"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrayref",
"arrow",
@@ -3938,9 +3931,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e4aff9ff0801c82d8fcb88cacec4880b6aaf53c6480291d50a4fcc12e6853c4"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -3974,9 +3966,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3823b5147002a3115456c4dd1e2b16c645c08f4653e6e9dc624b9381ba29c87f"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow",
"arrow-array",
@@ -4029,9 +4020,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a401202f6a1997db4ea5e9eb1a73a352736b320808a2e8497686c44fe6badf01"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow",
"arrow-arith",
@@ -4069,9 +4059,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82cc5333ed9d12f1745e849ad161746da0b12ae3d4c9897d1937411e6533f504"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow-array",
"arrow-ord",
@@ -4094,9 +4083,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41a43c76277808c452135f33a6b46ca8ec6ba38167534ff5240b46098ed81e73"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow",
"arrow-array",
@@ -4135,9 +4123,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f697b2c273e4629c782205e563282c08a74fe237ca8dd36cf10f862951887a70"
version = "0.27.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.27.0-beta.2#cf903b470be1aaff2998830bd0358226f27f4185"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4148,7 +4135,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.19.0-beta.11"
version = "0.19.1-beta.1"
dependencies = [
"arrow",
"arrow-array",
@@ -4235,7 +4222,7 @@ dependencies = [
[[package]]
name = "lancedb-node"
version = "0.19.0-beta.11"
version = "0.19.1-beta.1"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4260,7 +4247,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.19.0-beta.11"
version = "0.19.1-beta.1"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4279,7 +4266,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.22.0-beta.11"
version = "0.22.1-beta.1"
dependencies = [
"arrow",
"env_logger",
@@ -4681,7 +4668,7 @@ checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -4749,7 +4736,7 @@ dependencies = [
"napi-derive-backend",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -4764,7 +4751,7 @@ dependencies = [
"quote",
"regex",
"semver 1.0.26",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -4964,7 +4951,7 @@ dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -5231,7 +5218,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -5657,7 +5644,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6"
dependencies = [
"proc-macro2",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -5704,7 +5691,7 @@ dependencies = [
"prost",
"prost-types",
"regex",
"syn 2.0.100",
"syn 2.0.101",
"tempfile",
]
@@ -5718,7 +5705,7 @@ dependencies = [
"itertools 0.14.0",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -5732,9 +5719,9 @@ dependencies = [
[[package]]
name = "psm"
version = "0.1.25"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88"
checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f"
dependencies = [
"cc",
]
@@ -5791,7 +5778,7 @@ checksum = "b2df2884957d2476731f987673befac5d521dff10abb0a7cbe12015bc7702fe9"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -5823,7 +5810,7 @@ dependencies = [
"proc-macro2",
"pyo3-macros-backend",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -5836,14 +5823,14 @@ dependencies = [
"proc-macro2",
"pyo3-build-config",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
name = "quick-xml"
version = "0.37.4"
version = "0.37.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4ce8c88de324ff838700f36fb6ab86c96df0e3c4ab6ef3a9b2044465cce1369"
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
dependencies = [
"memchr",
"serde",
@@ -6094,7 +6081,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
dependencies = [
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -6312,7 +6299,7 @@ dependencies = [
"regex",
"relative-path",
"rustc_version",
"syn 2.0.100",
"syn 2.0.101",
"unicode-ident",
]
@@ -6646,7 +6633,7 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -6709,7 +6696,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -6842,7 +6829,7 @@ dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -6916,7 +6903,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -6927,9 +6914,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "stacker"
version = "0.1.20"
version = "0.1.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "601f9201feb9b09c00266478bf459952b9ef9a6b94edb2f21eba14ab681a60a9"
checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b"
dependencies = [
"cc",
"cfg-if",
@@ -6993,7 +6980,7 @@ dependencies = [
"proc-macro2",
"quote",
"rustversion",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -7015,9 +7002,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.100"
version = "2.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
dependencies = [
"proc-macro2",
"quote",
@@ -7052,7 +7039,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -7308,7 +7295,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -7319,7 +7306,7 @@ checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -7454,7 +7441,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -7503,15 +7490,15 @@ dependencies = [
[[package]]
name = "toml_datetime"
version = "0.6.8"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3"
[[package]]
name = "toml_edit"
version = "0.22.24"
version = "0.22.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474"
checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e"
dependencies = [
"indexmap 2.9.0",
"toml_datetime",
@@ -7564,7 +7551,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -7834,7 +7821,7 @@ dependencies = [
"log",
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
"wasm-bindgen-shared",
]
@@ -7869,7 +7856,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@@ -7918,9 +7905,9 @@ dependencies = [
[[package]]
name = "webpki-roots"
version = "0.26.8"
version = "0.26.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2210b291f7ea53617fbafcc4939f10914214ec15aace5ba62293a668f322c5c9"
checksum = "29aad86cec885cafd03e8305fd727c418e970a521322c91688414d5b8efba16b"
dependencies = [
"rustls-pki-types",
]
@@ -8031,7 +8018,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -8042,7 +8029,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -8053,7 +8040,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -8064,7 +8051,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -8479,7 +8466,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
"synstructure",
]
@@ -8509,7 +8496,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -8520,7 +8507,7 @@ checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]
@@ -8540,7 +8527,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
"synstructure",
]
@@ -8569,7 +8556,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
"syn 2.0.101",
]
[[package]]

View File

@@ -21,14 +21,14 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.26.0", "features" = ["dynamodb"] }
lance-io = "=0.26.0"
lance-index = "=0.26.0"
lance-linalg = "=0.26.0"
lance-table = "=0.26.0"
lance-testing = "=0.26.0"
lance-datafusion = "=0.26.0"
lance-encoding = "=0.26.0"
lance = { "version" = "=0.27.0", "features" = ["dynamodb"], tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-io = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-index = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-linalg = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-table = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-testing = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-datafusion = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
lance-encoding = { version = "=0.27.0", tag = "v0.27.0-beta.2", git="https://github.com/lancedb/lance.git" }
# Note that this one does not include pyarrow
arrow = { version = "54.1", optional = false }
arrow-array = "54.1"

View File

@@ -105,7 +105,8 @@ markdown_extensions:
nav:
- Home:
- LanceDB: index.md
- 🏃🏼‍♂️ Quick start: basic.md
- 👉 Quickstart: quickstart.md
- 🏃🏼‍♂️ Basic Usage: basic.md
- 📚 Concepts:
- Vector search: concepts/vector_search.md
- Indexing:
@@ -237,7 +238,9 @@ nav:
- 👾 JavaScript (lancedb): js/globals.md
- 🦀 Rust: https://docs.rs/lancedb/latest/lancedb/
- Quick start: basic.md
- Getting Started:
- Quickstart: quickstart.md
- Basic Usage: basic.md
- Concepts:
- Vector search: concepts/vector_search.md
- Indexing:

View File

@@ -1,4 +1,4 @@
# Quick start
# Basic Usage
!!! info "LanceDB can be run in a number of ways:"

View File

@@ -33,20 +33,20 @@ Construct a MergeInsertBuilder. __Internal use only.__
### execute()
```ts
execute(data): Promise<void>
execute(data): Promise<MergeStats>
```
Executes the merge insert operation
Nothing is returned but the `Table` is updated
#### Parameters
* **data**: [`Data`](../type-aliases/Data.md)
#### Returns
`Promise`&lt;`void`&gt;
`Promise`&lt;[`MergeStats`](../interfaces/MergeStats.md)&gt;
Statistics about the merge operation: counts of inserted, updated, and deleted rows
***

View File

@@ -615,6 +615,22 @@ of the given query
***
### stats()
```ts
abstract stats(): Promise<TableStatistics>
```
Returns table and fragment statistics
#### Returns
`Promise`&lt;[`TableStatistics`](../interfaces/TableStatistics.md)&gt;
The table and fragment statistics
***
### tags()
```ts

View File

@@ -42,6 +42,8 @@
- [ConnectionOptions](interfaces/ConnectionOptions.md)
- [CreateTableOptions](interfaces/CreateTableOptions.md)
- [ExecutableQuery](interfaces/ExecutableQuery.md)
- [FragmentStatistics](interfaces/FragmentStatistics.md)
- [FragmentSummaryStats](interfaces/FragmentSummaryStats.md)
- [FtsOptions](interfaces/FtsOptions.md)
- [FullTextQuery](interfaces/FullTextQuery.md)
- [FullTextSearchOptions](interfaces/FullTextSearchOptions.md)
@@ -52,6 +54,7 @@
- [IndexStatistics](interfaces/IndexStatistics.md)
- [IvfFlatOptions](interfaces/IvfFlatOptions.md)
- [IvfPqOptions](interfaces/IvfPqOptions.md)
- [MergeStats](interfaces/MergeStats.md)
- [OpenTableOptions](interfaces/OpenTableOptions.md)
- [OptimizeOptions](interfaces/OptimizeOptions.md)
- [OptimizeStats](interfaces/OptimizeStats.md)
@@ -59,6 +62,7 @@
- [RemovalStats](interfaces/RemovalStats.md)
- [RetryConfig](interfaces/RetryConfig.md)
- [TableNamesOptions](interfaces/TableNamesOptions.md)
- [TableStatistics](interfaces/TableStatistics.md)
- [TimeoutConfig](interfaces/TimeoutConfig.md)
- [UpdateOptions](interfaces/UpdateOptions.md)
- [Version](interfaces/Version.md)

View File

@@ -0,0 +1,37 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / FragmentStatistics
# Interface: FragmentStatistics
## Properties
### lengths
```ts
lengths: FragmentSummaryStats;
```
Statistics on the number of rows in the table fragments
***
### numFragments
```ts
numFragments: number;
```
The number of fragments in the table
***
### numSmallFragments
```ts
numSmallFragments: number;
```
The number of uncompacted fragments in the table

View File

@@ -0,0 +1,77 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / FragmentSummaryStats
# Interface: FragmentSummaryStats
## Properties
### max
```ts
max: number;
```
The number of rows in the fragment with the most rows
***
### mean
```ts
mean: number;
```
The mean number of rows in the fragments
***
### min
```ts
min: number;
```
The number of rows in the fragment with the fewest rows
***
### p25
```ts
p25: number;
```
The 25th percentile of number of rows in the fragments
***
### p50
```ts
p50: number;
```
The 50th percentile of number of rows in the fragments
***
### p75
```ts
p75: number;
```
The 75th percentile of number of rows in the fragments
***
### p99
```ts
p99: number;
```
The 99th percentile of number of rows in the fragments

View File

@@ -0,0 +1,31 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / MergeStats
# Interface: MergeStats
## Properties
### numDeletedRows
```ts
numDeletedRows: bigint;
```
***
### numInsertedRows
```ts
numInsertedRows: bigint;
```
***
### numUpdatedRows
```ts
numUpdatedRows: bigint;
```

View File

@@ -0,0 +1,47 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / TableStatistics
# Interface: TableStatistics
## Properties
### fragmentStats
```ts
fragmentStats: FragmentStatistics;
```
Statistics on table fragments
***
### numIndices
```ts
numIndices: number;
```
The number of indices in the table
***
### numRows
```ts
numRows: number;
```
The number of rows in the table
***
### totalBytes
```ts
totalBytes: number;
```
The total number of bytes in the table

101
docs/src/quickstart.md Normal file
View File

@@ -0,0 +1,101 @@
# Getting Started with LanceDB: A Minimal Vector Search Tutorial
Let's set up a LanceDB database, insert vector data, and perform a simple vector search. We'll use simple character classes like "knight" and "rogue" to illustrate semantic relevance.
## 1. Install Dependencies
Before starting, make sure you have the necessary packages:
```bash
pip install lancedb pandas numpy
```
## 2. Import Required Libraries
```python
import lancedb
import pandas as pd
import numpy as np
```
## 3. Connect to LanceDB
You can use a local directory to store your database:
```python
db = lancedb.connect("./lancedb")
```
## 4. Create Sample Data
Add sample text data and corresponding 4D vectors:
```python
data = pd.DataFrame([
{"id": "1", "vector": [1.0, 0.0, 0.0, 0.0], "text": "knight"},
{"id": "2", "vector": [0.9, 0.1, 0.0, 0.0], "text": "warrior"},
{"id": "3", "vector": [0.0, 1.0, 0.0, 0.0], "text": "rogue"},
{"id": "4", "vector": [0.0, 0.9, 0.1, 0.0], "text": "thief"},
{"id": "5", "vector": [0.5, 0.5, 0.0, 0.0], "text": "ranger"},
])
```
## 5. Create a Table in LanceDB
```python
table = db.create_table("rpg_classes", data=data, mode="overwrite")
```
Let's see how the table looks:
```python
print(table.to_pandas())
```
| id | vector | text |
|----|--------|------|
| 1 | [1.0, 0.0, 0.0, 0.0] | knight |
| 2 | [0.9, 0.1, 0.0, 0.0] | warrior |
| 3 | [0.0, 1.0, 0.0, 0.0] | rogue |
| 4 | [0.0, 0.9, 0.1, 0.0] | thief |
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger |
## 6. Perform a Vector Search
Search for the most similar character classes to our query vector:
```python
# Query with the vector for "rogue"
results = table.search([0.0, 1.0, 0.0, 0.0]).limit(3).to_pandas()
print(results)
```
This returns the 3 rows whose vectors are closest to the query vector, along with their distances, showing how LanceDB can be used for semantic search:
| id | vector | text | _distance |
|------|------------------------|----------|-----------|
| 3 | [0.0, 1.0, 0.0, 0.0] | rogue | 0.00 |
| 4 | [0.0, 0.9, 0.1, 0.0] | thief | 0.02 |
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger | 0.50 |
Now let's try searching for "knight":
```python
query_vector = [1.0, 0.0, 0.0, 0.0]
results = table.search(query_vector).limit(3).to_pandas()
print(results)
```
| id | vector | text | _distance |
|------|------------------------|----------|-----------|
| 1 | [1.0, 0.0, 0.0, 0.0] | knight | 0.00 |
| 2 | [0.9, 0.1, 0.0, 0.0] | warrior | 0.02 |
| 5 | [0.5, 0.5, 0.0, 0.0] | ranger | 0.50 |
## Next Steps
That's it - you just ran your first vector search!
For more beginner tips, check out the [Basic Usage](basic.md) guide.

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.19.1-beta.0</version>
<version>0.19.1-beta.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.19.1-beta.0</version>
<version>0.19.1-beta.1</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

44
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.19.0",
"version": "0.19.1-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.19.0",
"version": "0.19.1-beta.1",
"cpu": [
"x64",
"arm64"
@@ -52,11 +52,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.19.0",
"@lancedb/vectordb-darwin-x64": "0.19.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.19.0",
"@lancedb/vectordb-linux-x64-gnu": "0.19.0",
"@lancedb/vectordb-win32-x64-msvc": "0.19.0"
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.1",
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.1"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -327,9 +327,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.0.tgz",
"integrity": "sha512-cR04V8azbrEfJ3FX5WJjwvkmKySI+dS4laBWqtXaMyLDSX034E3P3Ve8jKfYdP4NaBSGlGZlySpGawEEBLH92A==",
"version": "0.19.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.19.1-beta.1.tgz",
"integrity": "sha512-Epvel0pF5TM6MtIWQ2KhqezqSSHTL3Wr7a2rGAwz6X/XY23i6DbMPpPs0HyeIDzDrhxNfE3cz3S+SiCA6xpR0g==",
"cpu": [
"arm64"
],
@@ -340,9 +340,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.0.tgz",
"integrity": "sha512-qDrui0LR4f2QqFovDx8VcbVY5So5gi0HgHWeh6kypl4R4SS+pYfW3jTPVDz1YpxxlB9GHACM5qBdul6KFpnoug==",
"version": "0.19.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.19.1-beta.1.tgz",
"integrity": "sha512-hOiUSlIoISbiXytp46hToi/r6sF5pImAsfbzCsIq8ExDV4TPa8fjbhcIT80vxxOwc2mpSSK4HsVJYod95RSbEQ==",
"cpu": [
"x64"
],
@@ -353,9 +353,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.0.tgz",
"integrity": "sha512-peoq/Mh9ml2h6xSngbfVt0yeuIO3ln4/dG9mfubXPJyNlM7tANzD+IY0Xs+B03m+fXbJ7LFZ8de4aIP9pWh4iQ==",
"version": "0.19.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.19.1-beta.1.tgz",
"integrity": "sha512-/1JhGVDEngwrlM8o2TNW8G6nJ9U/VgHKAORmj/cTA7O30helJIoo9jfvUAUy+vZ4VoEwRXQbMI+gaYTg0l3MTg==",
"cpu": [
"arm64"
],
@@ -366,9 +366,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.0.tgz",
"integrity": "sha512-MUsOXk+InI0ywuygcHvYG8+awrJUnsbrUstTPETN2+QAV7QOX+TlafupLUUrfp1/pUOPt/ZraHEaqFRw1Vdxqg==",
"version": "0.19.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.19.1-beta.1.tgz",
"integrity": "sha512-zNRGSSUt8nTJMmll4NdxhQjwxR8Rezq3T4dsRoiDts5ienMam5HFjYiZ3FkDZQo16rgq2BcbFuH1G8u1chywlg==",
"cpu": [
"x64"
],
@@ -379,9 +379,9 @@
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.0.tgz",
"integrity": "sha512-stk3uqMAbHxTodmzqMPKUl54GBfVKNDMR3EIo3d299QcXyOdSuEeHgeZa+iy0hHeIFL0TqHi4o8tStNzFLBAHg==",
"version": "0.19.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.19.1-beta.1.tgz",
"integrity": "sha512-yV550AJGlsIFdm1KoHQPJ1TZx121ZXCIdebBtBZj3wOObIhyB/i0kZAtGvwjkmr7EYyfzt1EHZzbjSGVdehIAA==",
"cpu": [
"x64"
],

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"description": " Serverless, low-latency vector database for AI applications",
"private": false,
"main": "dist/index.js",
@@ -89,10 +89,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.0",
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.0"
"@lancedb/vectordb-darwin-x64": "0.19.1-beta.1",
"@lancedb/vectordb-darwin-arm64": "0.19.1-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.19.1-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.19.1-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.19.1-beta.1"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.19.1-beta.0"
version = "0.19.1-beta.1"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -374,6 +374,71 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(table2.numRows).toBe(4);
expect(table2.schema).toEqual(schema);
});
it("should correctly retain values in nested struct fields", async function () {
// Define test data with nested struct
const testData = [
{
id: "doc1",
vector: [1, 2, 3],
metadata: {
filePath: "/path/to/file1.ts",
startLine: 10,
endLine: 20,
text: "function test() { return true; }",
},
},
{
id: "doc2",
vector: [4, 5, 6],
metadata: {
filePath: "/path/to/file2.ts",
startLine: 30,
endLine: 40,
text: "function test2() { return false; }",
},
},
];
// Create Arrow table from the data
const table = makeArrowTable(testData);
// Verify schema has the nested struct fields
const metadataField = table.schema.fields.find(
(f) => f.name === "metadata",
);
expect(metadataField).toBeDefined();
// biome-ignore lint/suspicious/noExplicitAny: accessing fields in different Arrow versions
const childNames = metadataField?.type.children.map((c: any) => c.name);
expect(childNames).toEqual([
"filePath",
"startLine",
"endLine",
"text",
]);
// Convert to buffer and back (simulating storage and retrieval)
const buf = await fromTableToBuffer(table);
const retrievedTable = tableFromIPC(buf);
// Verify the retrieved table has the same structure
const rows = [];
for (let i = 0; i < retrievedTable.numRows; i++) {
rows.push(retrievedTable.get(i));
}
// Check values in the first row
const firstRow = rows[0];
expect(firstRow.id).toBe("doc1");
expect(firstRow.vector.toJSON()).toEqual([1, 2, 3]);
// Verify metadata values are preserved (this is where the bug is)
expect(firstRow.metadata).toBeDefined();
expect(firstRow.metadata.filePath).toBe("/path/to/file1.ts");
expect(firstRow.metadata.startLine).toBe(10);
expect(firstRow.metadata.endLine).toBe(20);
expect(firstRow.metadata.text).toBe("function test() { return true; }");
});
});
class DummyEmbedding extends EmbeddingFunction<string> {

View File

@@ -71,6 +71,29 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
await expect(table.countRows()).resolves.toBe(3);
});
it("should show table stats", async () => {
await table.add([{ id: 1 }, { id: 2 }]);
await table.add([{ id: 1 }]);
await expect(table.stats()).resolves.toEqual({
fragmentStats: {
lengths: {
max: 2,
mean: 1,
min: 1,
p25: 1,
p50: 2,
p75: 2,
p99: 2,
},
numFragments: 2,
numSmallFragments: 2,
},
numIndices: 0,
numRows: 3,
totalBytes: 24,
});
});
it("should overwrite data if asked", async () => {
await table.add([{ id: 1 }, { id: 2 }]);
await table.add([{ id: 1 }], { mode: "overwrite" });
@@ -315,11 +338,16 @@ describe("merge insert", () => {
{ a: 3, b: "y" },
{ a: 4, b: "z" },
];
await table
const stats = await table
.mergeInsert("a")
.whenMatchedUpdateAll()
.whenNotMatchedInsertAll()
.execute(newData);
expect(stats.numInsertedRows).toBe(1n);
expect(stats.numUpdatedRows).toBe(2n);
expect(stats.numDeletedRows).toBe(0n);
const expected = [
{ a: 1, b: "a" },
{ a: 2, b: "x" },

View File

@@ -639,8 +639,9 @@ function transposeData(
): Vector {
if (field.type instanceof Struct) {
const childFields = field.type.children;
const fullPath = [...path, field.name];
const childVectors = childFields.map((child) => {
return transposeData(data, child, [...path, child.name]);
return transposeData(data, child, fullPath);
});
const structData = makeData({
type: field.type,
@@ -652,7 +653,14 @@ function transposeData(
const values = data.map((datum) => {
let current: unknown = datum;
for (const key of valuesPath) {
if (isObject(current) && Object.hasOwn(current, key)) {
if (current == null) {
return null;
}
if (
isObject(current) &&
(Object.hasOwn(current, key) || key in current)
) {
current = current[key];
} else {
return null;

View File

@@ -23,8 +23,12 @@ export {
OptimizeStats,
CompactionStats,
RemovalStats,
TableStatistics,
FragmentStatistics,
FragmentSummaryStats,
Tags,
TagContents,
MergeStats,
} from "./native.js";
export {

View File

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import { Data, Schema, fromDataToBuffer } from "./arrow";
import { NativeMergeInsertBuilder } from "./native";
import { MergeStats, NativeMergeInsertBuilder } from "./native";
/** A builder used to create and run a merge insert operation */
export class MergeInsertBuilder {
@@ -73,9 +73,9 @@ export class MergeInsertBuilder {
/**
* Executes the merge insert operation
*
* Nothing is returned but the `Table` is updated
* @returns Statistics about the merge operation: counts of inserted, updated, and deleted rows
*/
async execute(data: Data): Promise<void> {
async execute(data: Data): Promise<MergeStats> {
let schema: Schema;
if (this.#schema instanceof Promise) {
schema = await this.#schema;
@@ -84,6 +84,6 @@ export class MergeInsertBuilder {
schema = this.#schema;
}
const buffer = await fromDataToBuffer(data, undefined, schema);
await this.#native.execute(buffer);
return await this.#native.execute(buffer);
}
}

View File

@@ -20,6 +20,7 @@ import {
IndexConfig,
IndexStatistics,
OptimizeStats,
TableStatistics,
Tags,
Table as _NativeTable,
} from "./native";
@@ -482,6 +483,13 @@ export abstract class Table {
* Use {@link Table.listIndices} to find the names of the indices.
*/
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
/** Returns table and fragment statistics
*
* @returns {TableStatistics} The table and fragment statistics
*
*/
abstract stats(): Promise<TableStatistics>;
}
export class LocalTable extends Table {
@@ -775,6 +783,11 @@ export class LocalTable extends Table {
}
return stats;
}
/**
* Returns table and fragment statistics.
*
* Delegates to `this.inner.stats()` and resolves with its result unchanged.
*/
async stats(): Promise<TableStatistics> {
return await this.inner.stats();
}
mergeInsert(on: string | string[]): MergeInsertBuilder {
on = Array.isArray(on) ? on : [on];
return new MergeInsertBuilder(this.inner.mergeInsert(on), this.schema());

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.19.0",
"version": "0.19.1-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.19.0",
"version": "0.19.1-beta.1",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.19.1-beta.0",
"version": "0.19.1-beta.1",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -37,7 +37,7 @@ impl NativeMergeInsertBuilder {
}
#[napi(catch_unwind)]
pub async fn execute(&self, buf: Buffer) -> napi::Result<()> {
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeStats> {
let data = ipc_file_to_batches(buf.to_vec())
.and_then(IntoArrow::into_arrow)
.map_err(|e| {
@@ -46,12 +46,14 @@ impl NativeMergeInsertBuilder {
let this = self.clone();
this.inner.execute(data).await.map_err(|e| {
let stats = this.inner.execute(data).await.map_err(|e| {
napi::Error::from_reason(format!(
"Failed to execute merge insert: {}",
convert_error(&e)
))
})
})?;
Ok(stats.into())
}
}
@@ -60,3 +62,20 @@ impl From<MergeInsertBuilder> for NativeMergeInsertBuilder {
Self { inner }
}
}
/// Statistics about a merge insert operation: counts of inserted, updated,
/// and deleted rows. Returned by `NativeMergeInsertBuilder::execute`.
#[napi(object)]
pub struct MergeStats {
/// The number of rows inserted by the merge operation
pub num_inserted_rows: BigInt,
/// The number of rows updated by the merge operation
pub num_updated_rows: BigInt,
/// The number of rows deleted by the merge operation
pub num_deleted_rows: BigInt,
}
impl From<lancedb::table::MergeStats> for MergeStats {
fn from(stats: lancedb::table::MergeStats) -> Self {
Self {
num_inserted_rows: stats.num_inserted_rows.into(),
num_updated_rows: stats.num_updated_rows.into(),
num_deleted_rows: stats.num_deleted_rows.into(),
}
}
}

View File

@@ -157,6 +157,12 @@ impl Table {
.default_error()
}
#[napi(catch_unwind)]
pub async fn stats(&self) -> Result<TableStatistics> {
let stats = self.inner_ref()?.stats().await.default_error()?;
Ok(stats.into())
}
#[napi(catch_unwind)]
pub async fn update(
&self,
@@ -555,6 +561,80 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
}
}
/// Table and fragment statistics, as returned by `Table::stats`.
#[napi(object)]
pub struct TableStatistics {
/// The total number of bytes in the table
pub total_bytes: i64,
/// The number of rows in the table
pub num_rows: i64,
/// The number of indices in the table
pub num_indices: i64,
/// Statistics on table fragments
pub fragment_stats: FragmentStatistics,
}
/// Statistics on the fragments of a table; carried in
/// `TableStatistics::fragment_stats`.
#[napi(object)]
pub struct FragmentStatistics {
/// The number of fragments in the table
pub num_fragments: i64,
/// The number of uncompacted fragments in the table
pub num_small_fragments: i64,
/// Statistics on the number of rows in the table fragments
pub lengths: FragmentSummaryStats,
}
/// Summary statistics (min, max, mean, and percentiles) of the number of rows
/// per fragment; carried in `FragmentStatistics::lengths`.
#[napi(object)]
pub struct FragmentSummaryStats {
/// The number of rows in the fragment with the fewest rows
pub min: i64,
/// The number of rows in the fragment with the most rows
pub max: i64,
/// The mean number of rows in the fragments
pub mean: i64,
/// The 25th percentile of number of rows in the fragments
pub p25: i64,
/// The 50th percentile of number of rows in the fragments
pub p50: i64,
/// The 75th percentile of number of rows in the fragments
pub p75: i64,
/// The 99th percentile of number of rows in the fragments
pub p99: i64,
}
impl From<lancedb::table::TableStatistics> for TableStatistics {
    /// Builds the binding-side `TableStatistics` from the core `lancedb`
    /// statistics, casting every numeric field to `i64` with `as`.
    fn from(v: lancedb::table::TableStatistics) -> Self {
        let fragments = &v.fragment_stats;
        let row_counts = &fragments.lengths;

        // Assemble the nested objects innermost-first.
        let lengths = FragmentSummaryStats {
            min: row_counts.min as i64,
            max: row_counts.max as i64,
            mean: row_counts.mean as i64,
            p25: row_counts.p25 as i64,
            p50: row_counts.p50 as i64,
            p75: row_counts.p75 as i64,
            p99: row_counts.p99 as i64,
        };
        let fragment_stats = FragmentStatistics {
            num_fragments: fragments.num_fragments as i64,
            num_small_fragments: fragments.num_small_fragments as i64,
            lengths,
        };

        Self {
            total_bytes: v.total_bytes as i64,
            num_rows: v.num_rows as i64,
            num_indices: v.num_indices as i64,
            fragment_stats,
        }
    }
}
#[napi(object)]
pub struct Version {
pub version: i64,

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.22.1-beta.0"
current_version = "0.22.1-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.22.1-beta.0"
version = "0.22.1-beta.1"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -7,7 +7,7 @@ dependencies = [
"numpy",
"overrides>=0.7",
"packaging",
"pyarrow>=14",
"pyarrow>=16",
"pydantic>=1.10",
"tqdm>=4.27.0",
]

View File

@@ -578,6 +578,9 @@ class RemoteTable(Table):
):
return LOOP.run(self._table.wait_for_index(index_names, timeout))
def stats(self):
return LOOP.run(self._table.stats())
def uses_v2_manifest_paths(self) -> bool:
raise NotImplementedError(
"uses_v2_manifest_paths() is not supported on the LanceDB Cloud"

View File

@@ -739,6 +739,13 @@ class Table(ABC):
"""
raise NotImplementedError
@abstractmethod
def stats(self) -> TableStatistics:
"""
Retrieve table and fragment statistics.
"""
raise NotImplementedError
@abstractmethod
def create_scalar_index(
self,
@@ -955,10 +962,12 @@ class Table(ABC):
>>> table = db.create_table("my_table", data)
>>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
>>> # Perform a "upsert" operation
>>> table.merge_insert("a") \\
>>> stats = table.merge_insert("a") \\
... .when_matched_update_all() \\
... .when_not_matched_insert_all() \\
... .execute(new_data)
>>> stats
{'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
>>> # The order of new rows is non-deterministic since we use
>>> # a hash-join as part of this operation and so we sort here
>>> table.to_arrow().sort_by("a").to_pandas()
@@ -1876,6 +1885,9 @@ class LanceTable(Table):
) -> None:
return LOOP.run(self._table.wait_for_index(index_names, timeout))
def stats(self) -> TableStatistics:
return LOOP.run(self._table.stats())
def create_scalar_index(
self,
column: str,
@@ -2479,7 +2491,9 @@ class LanceTable(Table):
on_bad_vectors: OnBadVectorsType,
fill_value: float,
):
LOOP.run(self._table._do_merge(merge, new_data, on_bad_vectors, fill_value))
return LOOP.run(
self._table._do_merge(merge, new_data, on_bad_vectors, fill_value)
)
@deprecation.deprecated(
deprecated_in="0.21.0",
@@ -3170,6 +3184,12 @@ class AsyncTable:
"""
await self._inner.wait_for_index(index_names, timeout)
async def stats(self) -> TableStatistics:
"""
Retrieve table and fragment statistics.
"""
return await self._inner.stats()
async def add(
self,
data: DATA,
@@ -3261,10 +3281,12 @@ class AsyncTable:
>>> table = db.create_table("my_table", data)
>>> new_data = pa.table({"a": [2, 3, 4], "b": ["x", "y", "z"]})
>>> # Perform a "upsert" operation
>>> table.merge_insert("a") \\
>>> stats = table.merge_insert("a") \\
... .when_matched_update_all() \\
... .when_not_matched_insert_all() \\
... .execute(new_data)
>>> stats
{'num_inserted_rows': 1, 'num_updated_rows': 2, 'num_deleted_rows': 0}
>>> # The order of new rows is non-deterministic since we use
>>> # a hash-join as part of this operation and so we sort here
>>> table.to_arrow().sort_by("a").to_pandas()
@@ -3620,7 +3642,7 @@ class AsyncTable:
)
if isinstance(data, pa.Table):
data = pa.RecordBatchReader.from_batches(data.schema, data.to_batches())
await self._inner.execute_merge_insert(
return await self._inner.execute_merge_insert(
data,
dict(
on=merge._on,
@@ -4068,6 +4090,82 @@ class IndexStatistics:
return getattr(self, key)
@dataclass
class TableStatistics:
"""
Statistics about a table and fragments.
Attributes
----------
total_bytes: int
The total number of bytes in the table.
num_rows: int
The total number of rows in the table.
num_indices: int
The total number of indices in the table.
fragment_stats: FragmentStatistics
Statistics about fragments in the table.
"""
total_bytes: int
num_rows: int
num_indices: int
fragment_stats: FragmentStatistics
@dataclass
class FragmentStatistics:
"""
Statistics about fragments.
Attributes
----------
num_fragments: int
The total number of fragments in the table.
num_small_fragments: int
The total number of small fragments in the table.
Small fragments have low row counts and may need to be compacted.
lengths: FragmentSummaryStats
Statistics about the number of rows in the table fragments.
"""
num_fragments: int
num_small_fragments: int
lengths: FragmentSummaryStats
@dataclass
class FragmentSummaryStats:
"""
Statistics about fragments sizes
Attributes
----------
min: int
The number of rows in the fragment with the fewest rows.
max: int
The number of rows in the fragment with the most rows.
mean: int
The mean number of rows in the fragments.
p25: int
The 25th percentile of number of rows in the fragments.
p50: int
The 50th percentile of number of rows in the fragments.
p75: int
The 75th percentile of number of rows in the fragments.
p99: int
The 99th percentile of number of rows in the fragments.
"""
min: int
max: int
mean: int
p25: int
p50: int
p75: int
p99: int
class Tags:
"""
Table tag manager.

View File

@@ -18,15 +18,19 @@ def test_upsert(mem_db):
{"id": 1, "name": "Bobby"},
{"id": 2, "name": "Charlie"},
]
(
stats = (
table.merge_insert("id")
.when_matched_update_all()
.when_not_matched_insert_all()
.execute(new_users)
)
table.count_rows() # 3
stats # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
# --8<-- [end:upsert_basic]
assert table.count_rows() == 3
assert stats["num_inserted_rows"] == 1
assert stats["num_updated_rows"] == 1
assert stats["num_deleted_rows"] == 0
@pytest.mark.asyncio
@@ -44,15 +48,19 @@ async def test_upsert_async(mem_db_async):
{"id": 1, "name": "Bobby"},
{"id": 2, "name": "Charlie"},
]
await (
stats = await (
table.merge_insert("id")
.when_matched_update_all()
.when_not_matched_insert_all()
.execute(new_users)
)
await table.count_rows() # 3
stats # {'num_inserted_rows': 1, 'num_updated_rows': 1, 'num_deleted_rows': 0}
# --8<-- [end:upsert_basic_async]
assert await table.count_rows() == 3
assert stats["num_inserted_rows"] == 1
assert stats["num_updated_rows"] == 1
assert stats["num_deleted_rows"] == 0
def test_insert_if_not_exists(mem_db):
@@ -69,10 +77,16 @@ def test_insert_if_not_exists(mem_db):
{"domain": "google.com", "name": "Google"},
{"domain": "facebook.com", "name": "Facebook"},
]
(table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains))
stats = (
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
)
table.count_rows() # 3
stats # {'num_inserted_rows': 1, 'num_updated_rows': 0, 'num_deleted_rows': 0}
# --8<-- [end:insert_if_not_exists]
assert table.count_rows() == 3
assert stats["num_inserted_rows"] == 1
assert stats["num_updated_rows"] == 0
assert stats["num_deleted_rows"] == 0
@pytest.mark.asyncio
@@ -90,12 +104,16 @@ async def test_insert_if_not_exists_async(mem_db_async):
{"domain": "google.com", "name": "Google"},
{"domain": "facebook.com", "name": "Facebook"},
]
await (
stats = await (
table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains)
)
await table.count_rows() # 3
stats # {'num_inserted_rows': 1, 'num_updated_rows': 0, 'num_deleted_rows': 0}
# --8<-- [end:insert_if_not_exists_async]
assert await table.count_rows() == 3
assert stats["num_inserted_rows"] == 1
assert stats["num_updated_rows"] == 0
assert stats["num_deleted_rows"] == 0
def test_replace_range(mem_db):
@@ -113,7 +131,7 @@ def test_replace_range(mem_db):
new_chunks = [
{"doc_id": 1, "chunk_id": 0, "text": "Baz"},
]
(
stats = (
table.merge_insert(["doc_id", "chunk_id"])
.when_matched_update_all()
.when_not_matched_insert_all()
@@ -121,8 +139,12 @@ def test_replace_range(mem_db):
.execute(new_chunks)
)
table.count_rows("doc_id = 1") # 1
stats # {'num_inserted_rows': 0, 'num_updated_rows': 1, 'num_deleted_rows': 1}
# --8<-- [end:replace_range]
assert table.count_rows("doc_id = 1") == 1
assert stats["num_inserted_rows"] == 0
assert stats["num_updated_rows"] == 1
assert stats["num_deleted_rows"] == 1
@pytest.mark.asyncio
@@ -141,7 +163,7 @@ async def test_replace_range_async(mem_db_async):
new_chunks = [
{"doc_id": 1, "chunk_id": 0, "text": "Baz"},
]
await (
stats = await (
table.merge_insert(["doc_id", "chunk_id"])
.when_matched_update_all()
.when_not_matched_insert_all()
@@ -149,5 +171,9 @@ async def test_replace_range_async(mem_db_async):
.execute(new_chunks)
)
await table.count_rows("doc_id = 1") # 1
stats # {'num_inserted_rows': 0, 'num_updated_rows': 1, 'num_deleted_rows': 1}
# --8<-- [end:replace_range_async]
assert await table.count_rows("doc_id = 1") == 1
assert stats["num_inserted_rows"] == 0
assert stats["num_updated_rows"] == 1
assert stats["num_deleted_rows"] == 1

View File

@@ -389,6 +389,50 @@ def test_table_wait_for_index_timeout():
table.wait_for_index(["id_idx"], timedelta(seconds=1))
def test_stats():
stats = {
"total_bytes": 38,
"num_rows": 2,
"num_indices": 0,
"fragment_stats": {
"num_fragments": 1,
"num_small_fragments": 1,
"lengths": {
"min": 2,
"max": 2,
"mean": 2,
"p25": 2,
"p50": 2,
"p75": 2,
"p99": 2,
},
},
}
def handler(request):
if request.path == "/v1/table/test/create/?mode=create":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(b"{}")
elif request.path == "/v1/table/test/stats/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
payload = json.dumps(stats)
request.wfile.write(payload.encode())
else:
print(request.path)
request.send_response(404)
request.end_headers()
with mock_lancedb_connection(handler) as db:
table = db.create_table("test", [{"id": 1}])
res = table.stats()
print(f"{res=}")
assert res == stats
@contextlib.contextmanager
def query_test_table(query_handler, *, server_version=Version("0.1.0")):
def handler(request):

View File

@@ -1695,3 +1695,31 @@ def test_replace_field_metadata(tmp_path):
schema = table.schema
field = schema[0].metadata
assert field == {b"foo": b"bar"}
def test_stats(mem_db: DBConnection):
    """Check the statistics reported for a freshly created two-row table."""
    rows = [{"text": "foo", "id": 0}, {"text": "bar", "id": 1}]
    table = mem_db.create_table("my_table", data=rows)
    assert len(table) == 2

    stats = table.stats()
    print(f"{stats=}")

    # Every entry of the fragment-length summary is expected to be 2.
    expected_lengths = dict.fromkeys(
        ("min", "max", "mean", "p25", "p50", "p75", "p99"), 2
    )
    expected = {
        "total_bytes": 38,
        "num_rows": 2,
        "num_indices": 0,
        "fragment_stats": {
            "num_fragments": 1,
            "num_small_fragments": 1,
            "lengths": expected_lengths,
        },
    }
    assert stats == expected

View File

@@ -279,6 +279,40 @@ impl Table {
})
}
/// Fetch the table statistics and expose them to Python as nested dicts.
///
/// The work runs inside the future handed to `future_into_py`. On success the
/// future yields a dict with the keys `total_bytes`, `num_rows`, `num_indices`
/// and `fragment_stats`. `fragment_stats` is itself a dict holding
/// `num_fragments`, `num_small_fragments` and `lengths`; `lengths` maps
/// `min`, `max`, `mean`, `p25`, `p50`, `p75` and `p99` to integers.
///
/// # Errors
///
/// Fails immediately if `inner_ref` fails. Errors from the table's `stats` call
/// (converted through `infer_error`) and from populating the dicts surface
/// through the future.
pub fn stats(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
    let table = self_.inner_ref()?.clone();
    future_into_py(self_.py(), async move {
        let table_stats = table.stats().await.infer_error()?;
        Python::with_gil(|py| {
            let frag = &table_stats.fragment_stats;

            // Innermost level: summary of the per-fragment lengths.
            let lengths = PyDict::new(py);
            for (key, value) in [
                ("min", frag.lengths.min),
                ("max", frag.lengths.max),
                ("mean", frag.lengths.mean),
                ("p25", frag.lengths.p25),
                ("p50", frag.lengths.p50),
                ("p75", frag.lengths.p75),
                ("p99", frag.lengths.p99),
            ] {
                lengths.set_item(key, value)?;
            }

            // Middle level: fragment counts plus the lengths summary.
            let fragments = PyDict::new(py);
            fragments.set_item("num_fragments", frag.num_fragments)?;
            fragments.set_item("num_small_fragments", frag.num_small_fragments)?;
            fragments.set_item("lengths", lengths)?;

            // Top level: table-wide totals plus the fragment statistics.
            let summary = PyDict::new(py);
            summary.set_item("total_bytes", table_stats.total_bytes)?;
            summary.set_item("num_rows", table_stats.num_rows)?;
            summary.set_item("num_indices", table_stats.num_indices)?;
            summary.set_item("fragment_stats", fragments)?;

            Ok(Some(summary.unbind()))
        })
    })
}
pub fn __repr__(&self) -> String {
match &self.inner {
None => format!("ClosedTable({})", self.name),
@@ -455,8 +489,14 @@ impl Table {
}
future_into_py(self_.py(), async move {
builder.execute(Box::new(batches)).await.infer_error()?;
Ok(())
let stats = builder.execute(Box::new(batches)).await.infer_error()?;
Python::with_gil(|py| {
let dict = PyDict::new(py);
dict.set_item("num_inserted_rows", stats.num_inserted_rows)?;
dict.set_item("num_updated_rows", stats.num_updated_rows)?;
dict.set_item("num_deleted_rows", stats.num_deleted_rows)?;
Ok(dict.unbind())
})
})
}

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.19.1-beta.0"
version = "0.19.1-beta.1"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.19.1-beta.0"
version = "0.19.1-beta.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -5,7 +5,7 @@ use crate::index::Index;
use crate::index::IndexStatistics;
use crate::query::{QueryFilter, QueryRequest, Select, VectorQueryRequest};
use crate::table::Tags;
use crate::table::{AddDataMode, AnyQuery, Filter};
use crate::table::{AddDataMode, AnyQuery, Filter, TableStatistics};
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
use crate::{DistanceType, Error, Table};
use arrow_array::{RecordBatch, RecordBatchIterator, RecordBatchReader};
@@ -47,6 +47,7 @@ use crate::{
TableDefinition, UpdateBuilder,
},
};
use lance::dataset::MergeStats;
const REQUEST_TIMEOUT_HEADER: HeaderName = HeaderName::from_static("x-request-timeout-ms");
@@ -1022,7 +1023,7 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
&self,
params: MergeInsertBuilder,
new_data: Box<dyn RecordBatchReader + Send>,
) -> Result<()> {
) -> Result<MergeStats> {
self.check_mutable().await?;
let query = MergeInsertRequest::try_from(params)?;
@@ -1034,9 +1035,11 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
let (request_id, response) = self.send_streaming(request, new_data, true).await?;
// TODO: the server can respond with these stats in the response body.
// We should test that we can handle both an empty response from an old server
// and a response with stats from a new server.
self.check_table_response(&request_id, response).await?;
Ok(())
Ok(MergeStats::default())
}
async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
@@ -1242,6 +1245,20 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
fn dataset_uri(&self) -> &str {
"NOT_SUPPORTED"
}
async fn stats(&self) -> Result<TableStatistics> {
let request = self.client.post(&format!("/v1/table/{}/stats/", self.name));
let (request_id, response) = self.send(request, true).await?;
let response = self.check_table_response(&request_id, response).await?;
let body = response.text().await.err_to_http(request_id.clone())?;
let stats = serde_json::from_str(&body).map_err(|e| Error::Http {
source: format!("Failed to parse table statistics: {}", e).into(),
request_id,
status_code: None,
})?;
Ok(stats)
}
}
#[derive(Serialize)]
@@ -1334,7 +1351,12 @@ mod tests {
Box::pin(table.count_rows(None).map_ok(|_| ())),
Box::pin(table.update().column("a", "a + 1").execute().map_ok(|_| ())),
Box::pin(table.add(example_data()).execute().map_ok(|_| ())),
Box::pin(table.merge_insert(&["test"]).execute(example_data())),
Box::pin(
table
.merge_insert(&["test"])
.execute(example_data())
.map_ok(|_| ()),
),
Box::pin(table.delete("false")),
Box::pin(table.add_columns(
NewColumnTransform::SqlExpressions(vec![("x".into(), "y".into())]),

View File

@@ -20,6 +20,7 @@ use lance::dataset::cleanup::RemovalStats;
use lance::dataset::optimize::{compact_files, CompactionMetrics, IndexRemapperOptions};
use lance::dataset::scanner::Scanner;
pub use lance::dataset::ColumnAlteration;
pub use lance::dataset::MergeStats;
pub use lance::dataset::NewColumnTransform;
pub use lance::dataset::ReadParams;
pub use lance::dataset::Version;
@@ -80,10 +81,13 @@ pub mod merge;
use crate::index::waiter::wait_for_index;
pub use chrono::Duration;
use futures::future::join_all;
pub use lance::dataset::optimize::CompactionOptions;
pub use lance::dataset::refs::{TagContents, Tags as LanceTags};
pub use lance::dataset::scanner::DatasetRecordBatchStream;
use lance::dataset::statistics::DatasetStatisticsExt;
pub use lance_index::optimize::OptimizeOptions;
use serde_with::skip_serializing_none;
/// Defines the type of column
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -484,7 +488,7 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
&self,
params: MergeInsertBuilder,
new_data: Box<dyn RecordBatchReader + Send>,
) -> Result<()>;
) -> Result<MergeStats>;
/// Gets the table tag manager.
async fn tags(&self) -> Result<Box<dyn Tags + '_>>;
/// Optimize the dataset.
@@ -523,6 +527,8 @@ pub trait BaseTable: std::fmt::Display + std::fmt::Debug + Send + Sync {
index_names: &[&str],
timeout: std::time::Duration,
) -> Result<()>;
/// Get statistics on the table
async fn stats(&self) -> Result<TableStatistics>;
}
/// A Table is a collection of strong typed Rows.
@@ -1241,6 +1247,11 @@ impl Table {
.unwrap();
Ok(Arc::new(repartitioned))
}
/// Retrieve statistics on the table.
///
/// Forwards to the `stats` implementation of the underlying table
/// (`self.inner`) and returns its result unchanged: a [`TableStatistics`]
/// holding the total byte size, the row count, the index count and
/// per-fragment statistics.
///
/// # Errors
///
/// Returns whatever error the underlying implementation reports.
pub async fn stats(&self) -> Result<TableStatistics> {
self.inner.stats().await
}
}
pub struct NativeTags {
@@ -2357,7 +2368,7 @@ impl BaseTable for NativeTable {
&self,
params: MergeInsertBuilder,
new_data: Box<dyn RecordBatchReader + Send>,
) -> Result<()> {
) -> Result<MergeStats> {
let dataset = Arc::new(self.dataset.get().await?.clone());
let mut builder = LanceMergeInsertBuilder::try_new(dataset.clone(), params.on)?;
match (
@@ -2384,9 +2395,9 @@ impl BaseTable for NativeTable {
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
}
let job = builder.try_build()?;
let (new_dataset, _stats) = job.execute_reader(new_data).await?;
let (new_dataset, stats) = job.execute_reader(new_data).await?;
self.dataset.set_latest(new_dataset.as_ref().clone()).await;
Ok(())
Ok(stats)
}
/// Delete rows from the table
@@ -2568,6 +2579,108 @@ impl BaseTable for NativeTable {
) -> Result<()> {
wait_for_index(self, index_names, timeout).await
}
/// Compute statistics for this table.
///
/// Reports the row count, the number of indices, the sum of the per-field
/// `bytes_on_disk` values, and a summary of the physical row count of every
/// fragment (min, max, mean and selected percentiles).
///
/// Percentiles are picked by index from the sorted fragment sizes
/// (`len * p / 100`, rounded down) and the mean uses integer division. A
/// table without fragments reports `0` for every summary value.
///
/// # Errors
///
/// Returns an error if counting rows, listing indices, loading the dataset,
/// computing the data statistics, or reading the physical row count of any
/// fragment fails. A fragment whose row count cannot be read is reported as
/// an error instead of being counted as an empty fragment, because a silent
/// `0` would distort the minimum, the percentiles, the mean and the
/// small-fragment count.
async fn stats(&self) -> Result<TableStatistics> {
    // Fragments with fewer physical rows than this are counted as "small".
    const SMALL_FRAGMENT_ROW_THRESHOLD: usize = 100_000;

    let num_rows = self.count_rows(None).await?;
    let num_indices = self.list_indices().await?.len();

    let ds = self.dataset.get().await?;
    let ds_clone = (*ds).clone();
    let ds_stats = Arc::new(ds_clone).calculate_data_stats().await?;
    let total_bytes = ds_stats.fields.iter().map(|f| f.bytes_on_disk).sum::<u64>() as usize;

    // Query every fragment concurrently, then fail on the first error rather
    // than substituting a fake row count.
    let frags = ds.get_fragments();
    let mut sorted_sizes = join_all(
        frags
            .iter()
            .map(|frag| async move { frag.physical_rows().await }),
    )
    .await
    .into_iter()
    .collect::<std::result::Result<Vec<usize>, _>>()?;
    // Equal sizes are indistinguishable, so an unstable sort is sufficient.
    sorted_sizes.sort_unstable();

    let num_fragments = sorted_sizes.len();
    let num_small_fragments = sorted_sizes
        .iter()
        .filter(|&&size| size < SMALL_FRAGMENT_ROW_THRESHOLD)
        .count();

    // Value at position `len * numerator / denominator` of the sorted sizes,
    // or 0 when there are no fragments.
    let percentile = |numerator: usize, denominator: usize| {
        sorted_sizes
            .get(num_fragments * numerator / denominator)
            .copied()
            .unwrap_or(0)
    };

    let lengths = FragmentSummaryStats {
        min: sorted_sizes.first().copied().unwrap_or(0),
        max: sorted_sizes.last().copied().unwrap_or(0),
        // `checked_div` yields `None` for an empty table, which maps to 0.
        mean: sorted_sizes
            .iter()
            .sum::<usize>()
            .checked_div(num_fragments)
            .unwrap_or(0),
        p25: percentile(1, 4),
        p50: percentile(1, 2),
        p75: percentile(3, 4),
        p99: percentile(99, 100),
    };

    Ok(TableStatistics {
        total_bytes,
        num_rows,
        num_indices,
        fragment_stats: FragmentStatistics {
            num_fragments,
            num_small_fragments,
            lengths,
        },
    })
}
}
/// Statistics describing a table, as returned by the `stats` methods.
///
/// Deserializable so that it can be parsed from a JSON document with the
/// same field names.
#[skip_serializing_none]
#[derive(Debug, Deserialize, PartialEq)]
pub struct TableStatistics {
/// The total number of bytes in the table
pub total_bytes: usize,
/// The number of rows in the table
pub num_rows: usize,
/// The number of indices in the table
pub num_indices: usize,
/// Statistics on table fragments
pub fragment_stats: FragmentStatistics,
}
/// Statistics about the fragments that make up a table.
#[skip_serializing_none]
#[derive(Debug, Deserialize, PartialEq)]
pub struct FragmentStatistics {
/// The number of fragments in the table
pub num_fragments: usize,
/// The number of uncompacted fragments in the table
pub num_small_fragments: usize,
/// Statistics on the number of rows in the table fragments
pub lengths: FragmentSummaryStats,
// todo: add size statistics
// /// Statistics on the number of bytes in the table fragments
// sizes: FragmentStats,
}
/// Summary (extremes, mean and selected percentiles) of a per-fragment
/// measurement such as the number of rows in each fragment.
///
/// All values are integers.
/// NOTE(review): values parsed from a remote response are taken as sent by
/// the server; confirm the server uses the same rounding conventions as the
/// native implementation.
#[skip_serializing_none]
#[derive(Debug, Deserialize, PartialEq)]
pub struct FragmentSummaryStats {
/// Smallest value across all fragments
pub min: usize,
/// Largest value across all fragments
pub max: usize,
/// Mean value across all fragments, as an integer
pub mean: usize,
/// 25th percentile
pub p25: usize,
/// 50th percentile (median)
pub p50: usize,
/// 75th percentile
pub p75: usize,
/// 99th percentile
pub p99: usize,
}
#[cfg(test)]
@@ -3945,4 +4058,108 @@ mod tests {
Some(&"test_field_val1".to_string())
);
}
/// Checks `stats()` on a table made of one 100-row write followed by ten
/// 15-row appends, and on a table created from no batches at all.
#[tokio::test]
pub async fn test_stats() {
    let tmp_dir = tempdir().unwrap();
    let conn = ConnectBuilder::new(tmp_dir.path().to_str().unwrap())
        .execute()
        .await
        .unwrap();

    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("foo", DataType::Int32, true),
    ]));

    // Builds a batch whose two Int32 columns both hold the values `0..rows`.
    let make_batch = |rows: i32| {
        RecordBatch::try_new(
            schema.clone(),
            vec![
                Arc::new(Int32Array::from_iter_values(0..rows)),
                Arc::new(Int32Array::from_iter_values(0..rows)),
            ],
        )
        .unwrap()
    };

    // Initial write: a single batch of 100 rows.
    let initial_batch = make_batch(100);
    let table = conn
        .create_table(
            "test_stats",
            RecordBatchIterator::new(vec![Ok(initial_batch.clone())], initial_batch.schema()),
        )
        .execute()
        .await
        .unwrap();

    // Ten further appends of 15 rows each.
    for _ in 0..10 {
        let small_batch = make_batch(15);
        table
            .add(RecordBatchIterator::new(
                vec![Ok(small_batch.clone())],
                small_batch.schema(),
            ))
            .execute()
            .await
            .unwrap();
    }

    // A second table with the same schema but no data.
    let empty_table = conn
        .create_table(
            "test_stats_empty",
            RecordBatchIterator::new(vec![], initial_batch.schema()),
        )
        .execute()
        .await
        .unwrap();

    // 100 + 10 * 15 = 250 rows spread over 11 fragments; 250 / 11 truncates to 22.
    let expected_populated = TableStatistics {
        num_rows: 250,
        num_indices: 0,
        total_bytes: 2000,
        fragment_stats: FragmentStatistics {
            num_fragments: 11,
            num_small_fragments: 11,
            lengths: FragmentSummaryStats {
                min: 15,
                max: 100,
                mean: 22,
                p25: 15,
                p50: 15,
                p75: 15,
                p99: 100,
            },
        },
    };
    let populated = table.stats().await.unwrap();
    println!("{:#?}", populated);
    assert_eq!(populated, expected_populated);

    // With no fragments every statistic is zero.
    let expected_empty = TableStatistics {
        num_rows: 0,
        num_indices: 0,
        total_bytes: 0,
        fragment_stats: FragmentStatistics {
            num_fragments: 0,
            num_small_fragments: 0,
            lengths: FragmentSummaryStats {
                min: 0,
                max: 0,
                mean: 0,
                p25: 0,
                p50: 0,
                p75: 0,
                p99: 0,
            },
        },
    };
    let empty = empty_table.stats().await.unwrap();
    println!("{:#?}", empty);
    assert_eq!(empty, expected_empty);
}
}

View File

@@ -4,6 +4,7 @@
use std::sync::Arc;
use arrow_array::RecordBatchReader;
use lance::dataset::MergeStats;
use crate::Result;
@@ -86,8 +87,9 @@ impl MergeInsertBuilder {
/// Executes the merge insert operation
///
/// Nothing is returned but the [`super::Table`] is updated
pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<()> {
/// Returns statistics about the merge operation including the number of rows
/// inserted, updated, and deleted.
pub async fn execute(self, new_data: Box<dyn RecordBatchReader + Send>) -> Result<MergeStats> {
self.table.clone().merge_insert(self, new_data).await
}
}