Mirror of https://github.com/lancedb/lancedb.git, synced 2025-12-23 13:29:57 +00:00

Compare commits: docs/mcp ... python-v0. (7 commits)
| SHA1 |
|---|
| 1096da09da |
| 683824f1e9 |
| db7bdefe77 |
| e41894b071 |
| e1ae2bcbd8 |
| ababc3f8ec |
| a1377afcaa |
@@ -17,6 +17,7 @@ arrow-ord = "43.0"
 arrow-schema = "43.0"
 arrow-arith = "43.0"
 arrow-cast = "43.0"
+chrono = "0.4.23"
 half = { "version" = "=2.2.1", default-features = false, features = [
     "num-traits"
 ] }
@@ -54,8 +54,7 @@ const table = await db.createTable('vectors',
     [{ id: 1, vector: [0.1, 0.2], item: "foo", price: 10 },
      { id: 2, vector: [1.1, 1.2], item: "bar", price: 50 }])

-const query = table.search([0.1, 0.3]);
-query.limit = 20;
+const query = table.search([0.1, 0.3]).limit(2);
 const results = await query.execute();
 ```

@@ -72,7 +71,7 @@ db = lancedb.connect(uri)
 table = db.create_table("my_table",
     data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
           {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
-result = table.search([100, 100]).limit(2).to_df()
+result = table.search([100, 100]).limit(2).to_pandas()
 ```

 ## Blogs, Tutorials & Videos
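Several hunks in this compare view apply the same rename: the query builder's `to_df()` becomes `to_pandas()`, with `to_df()` kept as a deprecated alias (see the `query.py` hunk further down). A minimal migration sketch, assuming an existing local table:

```python
import lancedb

db = lancedb.connect("./data")      # assumed path
table = db.open_table("my_table")   # assumes this table already exists

# Before: result = table.search([100, 100]).limit(2).to_df()
# After — same DataFrame, preferred name; to_df() still works but warns:
result = table.search([100, 100]).limit(2).to_pandas()
```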
@@ -21,6 +21,7 @@ theme:
     - navigation.tracking
     - navigation.instant
     - navigation.indexes
+    - navigation.expand
   icon:
     repo: fontawesome/brands/github
   custom_dir: overrides
@@ -68,7 +69,7 @@ nav:
   - 🏢 Home: index.md
   - 💡 Basics: basic.md
   - 📚 Guides:
-    - Tables: guides/tables.md
+    - Create Ingest Update Delete: guides/tables.md
     - Vector Search: search.md
     - SQL filters: sql.md
     - Indexing: ann_indexes.md
@@ -96,9 +97,11 @@ nav:
       - Serverless Website Chatbot: examples/serverless_website_chatbot.md
       - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
       - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
+  - ⚙️ CLI & Config: cli_config.md

   - Basics: basic.md
   - Guides:
-    - Tables: guides/tables.md
+    - Create Ingest Update Delete: guides/tables.md
     - Vector Search: search.md
     - SQL filters: sql.md
     - Indexing: ann_indexes.md
@@ -97,7 +97,7 @@ There are a couple of parameters that can be used to fine-tune the search:
           .limit(2) \
           .nprobes(20) \
           .refine_factor(10) \
-          .to_df()
+          .to_pandas()
     ```
     ```
     vector  item  _distance
@@ -124,7 +124,7 @@ You can further filter the elements returned by a search using a where clause.

 === "Python"
     ```python
-    tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_df()
+    tbl.search(np.random.random((1536))).where("item != 'item 1141'").to_pandas()
     ```

 === "Javascript"
@@ -141,7 +141,7 @@ You can select the columns returned by the query using a select clause.

 === "Python"
     ```python
-    tbl.search(np.random.random((1536))).select(["vector"]).to_df()
+    tbl.search(np.random.random((1536))).select(["vector"]).to_pandas()
     ```
     ```
     vector  _distance
@@ -146,7 +146,7 @@ Once you've embedded the query, you can find its nearest neighbors using the fol

 === "Python"
     ```python
-    tbl.search([100, 100]).limit(2).to_df()
+    tbl.search([100, 100]).limit(2).to_pandas()
     ```

 This returns a pandas DataFrame with the results.
docs/src/cli_config.md (new file, 37 lines)
@@ -0,0 +1,37 @@
+## LanceDB CLI
+
+Once LanceDB is installed, you can access the CLI using the `lancedb` command in your console:
+
+```
+lancedb
+```
+
+This lists all the available command-line options. You can get usage help for a particular command:
+
+```
+lancedb {command} --help
+```
+
+## LanceDB config
+
+LanceDB uses a global config file to store certain settings, which are configurable via the LanceDB CLI.
+
+To view your config settings, use:
+
+```
+lancedb config
+```
+
+These config parameters can be tuned using the CLI:
+
+```
+lancedb {config_name} --{argument}
+```
+
+## LanceDB Opt-in Diagnostics
+
+When enabled, LanceDB sends anonymous events to help us improve LanceDB. These diagnostics are used only for error reporting and no user data is collected. Error and usage stats let us automate parts of bug reporting and prioritize fixes and feature requests.
+
+Diagnostics are opt-in and can be enabled or disabled using the `lancedb diagnostics` command. They are enabled by default.
+
+Get usage help:
+
+```
+lancedb diagnostics --help
+```
+
+Disable diagnostics:
+
+```
+lancedb diagnostics --disabled
+```
+
+Enable diagnostics:
+
+```
+lancedb diagnostics --enabled
+```
@@ -118,7 +118,7 @@ belong in the same latent space and your results will be nonsensical.
 ```python
 query = "What's the best pizza topping?"
 query_vector = embed_func([query])[0]
-tbl.search(query_vector).limit(10).to_df()
+tbl.search(query_vector).limit(10).to_pandas()
 ```

 The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
@@ -80,14 +80,14 @@ def handler(event, context):
     # Shape of SIFT is (128,1M), d=float32
     query_vector = np.array(event['query_vector'], dtype=np.float32)

-    rs = table.search(query_vector).limit(2).to_df()
+    rs = table.search(query_vector).limit(2).to_list()

     return {
         "statusCode": status_code,
         "headers": {
             "Content-Type": "application/json"
         },
-        "body": rs.to_json()
+        "body": json.dumps(rs)
     }
 ```

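The body change follows from the result-type change: `to_list()` yields plain Python dicts rather than a DataFrame, so the handler serializes with the standard-library `json` module. A condensed sketch of the resulting handler body — `handler_body` is an illustrative name, not part of the diff:

```python
import json

import numpy as np

def handler_body(table, event, status_code=200):
    # Shape of SIFT is (128, 1M), dtype=float32
    query_vector = np.array(event["query_vector"], dtype=np.float32)
    rows = table.search(query_vector).limit(2).to_list()  # list of dicts now
    return {
        "statusCode": status_code,
        "headers": {"Content-Type": "application/json"},
        "body": json.dumps(rows),  # plain dicts serialize directly
    }
```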
@@ -43,7 +43,13 @@ table.create_fts_index("text")
 To search:

 ```python
-df = table.search("puppy").limit(10).select(["text"]).to_df()
+table.search("puppy").limit(10).select(["text"]).to_list()
+```
+
+Which returns a list of dictionaries:
+
+```python
+[{'text': 'Frodo was a happy puppy', 'score': 0.6931471824645996}]
 ```

 LanceDB automatically looks for an FTS index if the input is str.
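Condensed, the full-text search flow from this hunk looks as follows — a sketch assuming a local table with a string `text` column (FTS uses the tantivy-backed index created above):

```python
import lancedb

db = lancedb.connect("./data")     # assumed path
tbl = db.open_table("my_table")    # assumes the table exists
tbl.create_fts_index("text")

hits = tbl.search("puppy").limit(10).select(["text"]).to_list()
# e.g. [{'text': 'Frodo was a happy puppy', 'score': 0.693...}]
```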
@@ -364,6 +364,48 @@ Use the `delete()` method on tables to delete rows from a table. To choose which
     await tbl.countRows() // Returns 1
     ```

+### Updating a Table [Experimental]
+EXPERIMENTAL: Update rows in the table (not threadsafe).
+
+This can be used to update zero to all rows depending on how many rows match the where clause.
+
+| Parameter | Type | Description |
+|---|---|---|
+| `where` | `str` | The SQL where clause to use when updating rows. For example, `'x = 2'` or `'x IN (1, 2, 3)'`. The filter must not be empty, or it will error. |
+| `values` | `dict` | The values to update. The keys are the column names and the values are the values to set. |
+
+=== "Python"
+
+    ```python
+    import lancedb
+    import pandas as pd
+
+    # Create a lancedb connection
+    db = lancedb.connect("./.lancedb")
+
+    # Create a table from a pandas DataFrame
+    data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]})
+    table = db.create_table("my_table", data)
+
+    # Update the table where x = 2
+    table.update(where="x = 2", values={"vector": [10, 10]})
+
+    # Get the updated table as a pandas DataFrame
+    df = table.to_pandas()
+
+    # Print the DataFrame
+    print(df)
+    ```
+
+Output
+```shell
+   x        vector
+0  1    [1.0, 2.0]
+1  3    [5.0, 6.0]
+2  2  [10.0, 10.0]
+```
+
 ## What's Next?

 Learn how to Query your tables and create indices
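A note on the sample output above: the updated row (`x = 2`) is printed last because, per the `table.py` hunk later in this compare view, `update()` deletes the matching rows and re-appends the modified rows, so they end up at the end of the table.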
@@ -36,7 +36,7 @@ LanceDB's core is written in Rust 🦀 and is built using <a href="https://githu
     table = db.create_table("my_table",
         data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
               {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}])
-    result = table.search([100, 100]).limit(2).to_df()
+    result = table.search([100, 100]).limit(2).to_list()
 ```

 === "Javascript"
@@ -19,11 +19,11 @@
     "output_type": "stream",
     "text": [
      "\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n",
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
      "\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip available: \u001B[0m\u001B[31;49m22.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.2\u001B[0m\n",
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
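This hunk (and the matching one in the second notebook below) changes only the case of the hex digits in the Unicode escapes, `\u001b` → `\u001B`. Both spellings denote the same ESC character in JSON, so the rendered pip-notice output is unchanged.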
@@ -39,6 +39,7 @@
    "outputs": [],
    "source": [
     "import io\n",
+    "\n",
     "import PIL\n",
     "import duckdb\n",
     "import lancedb"
@@ -158,18 +159,18 @@
     "        \"db = lancedb.connect('~/datasets/demo')\\n\"\n",
     "        \"tbl = db.open_table('diffusiondb')\\n\\n\"\n",
     "        f\"embedding = embed_func('{query}')\\n\"\n",
-    "        \"tbl.search(embedding).limit(9).to_df()\"\n",
+    "        \"tbl.search(embedding).limit(9).to_pandas()\"\n",
     "    )\n",
-    "    return (_extract(tbl.search(emb).limit(9).to_df()), code)\n",
+    "    return (_extract(tbl.search(emb).limit(9).to_pandas()), code)\n",
     "\n",
     "def find_image_keywords(query):\n",
     "    code = (\n",
     "        \"import lancedb\\n\"\n",
     "        \"db = lancedb.connect('~/datasets/demo')\\n\"\n",
     "        \"tbl = db.open_table('diffusiondb')\\n\\n\"\n",
-    "        f\"tbl.search('{query}').limit(9).to_df()\"\n",
+    "        f\"tbl.search('{query}').limit(9).to_pandas()\"\n",
     "    )\n",
-    "    return (_extract(tbl.search(query).limit(9).to_df()), code)\n",
+    "    return (_extract(tbl.search(query).limit(9).to_pandas()), code)\n",
     "\n",
     "def find_image_sql(query):\n",
     "    code = (\n",
@@ -27,11 +27,11 @@
     "output_type": "stream",
     "text": [
      "\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n",
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
      "\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
-     "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.1.1\u001B[0m\n",
+     "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n"
     ]
    }
   ],
@@ -184,7 +184,7 @@
     "df = (contextualize(data.to_pandas())\n",
     "      .groupby(\"title\").text_col(\"text\")\n",
     "      .window(20).stride(4)\n",
-    "      .to_df())\n",
+    "      .to_pandas())\n",
     "df.head(1)"
    ]
   },
@@ -603,7 +603,7 @@
    "outputs": [],
    "source": [
     "# Use LanceDB to get top 3 most relevant context\n",
-    "context = tbl.search(emb).limit(3).to_df()"
+    "context = tbl.search(emb).limit(3).to_pandas()"
    ]
   },
   {
@@ -74,7 +74,7 @@ table = db.open_table("pd_table")

 query_vector = [100, 100]
 # Pandas DataFrame
-df = table.search(query_vector).limit(1).to_df()
+df = table.search(query_vector).limit(1).to_pandas()
 print(df)
 ```

@@ -89,12 +89,12 @@ If you have more complex criteria, you can always apply the filter to the result
 ```python

 # Apply the filter via LanceDB
-results = table.search([100, 100]).where("price < 15").to_df()
+results = table.search([100, 100]).where("price < 15").to_pandas()
 assert len(results) == 1
 assert results["item"].iloc[0] == "foo"

 # Apply the filter via Pandas
-df = results = table.search([100, 100]).to_df()
+df = results = table.search([100, 100]).to_pandas()
 results = df[df.price < 15]
 assert len(results) == 1
 assert results["item"].iloc[0] == "foo"
@@ -67,7 +67,7 @@ await db_setup.createTable('my_vectors', data)

     df = tbl.search(np.random.random((1536))) \
         .limit(10) \
-        .to_df()
+        .to_list()
     ```

 === "JavaScript"
@@ -92,7 +92,7 @@ as well.
     df = tbl.search(np.random.random((1536))) \
         .metric("cosine") \
         .limit(10) \
-        .to_df()
+        .to_list()
     ```


@@ -23,7 +23,7 @@ import { Query } from './query'
 import { isEmbeddingFunction } from './embedding/embedding_function'

 // eslint-disable-next-line @typescript-eslint/no-var-requires
-const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete } = require('../native.js')
+const { databaseNew, databaseTableNames, databaseOpenTable, databaseDropTable, tableCreate, tableAdd, tableCreateVectorIndex, tableCountRows, tableDelete, tableCleanupOldVersions, tableCompactFiles } = require('../native.js')

 export { Query }
 export type { EmbeddingFunction }
@@ -459,6 +459,111 @@ export class LocalTable<T = number[]> implements Table<T> {
   async delete (filter: string): Promise<void> {
     return tableDelete.call(this._tbl, filter).then((newTable: any) => { this._tbl = newTable })
   }
+
+  /**
+   * Clean up old versions of the table, freeing disk space.
+   *
+   * @param olderThan The minimum age in minutes of the versions to delete. If not
+   *                  provided, defaults to two weeks.
+   * @param deleteUnverified Because they may be part of an in-progress
+   *                         transaction, uncommitted files newer than 7 days old are
+   *                         not deleted by default. This means that failed transactions
+   *                         can leave around data that takes up disk space for up to
+   *                         7 days. You can override this safety mechanism by setting
+   *                         this option to `true`, only if you promise there are no
+   *                         in progress writes while you run this operation. Failure to
+   *                         uphold this promise can lead to corrupted tables.
+   * @returns
+   */
+  async cleanupOldVersions (olderThan?: number, deleteUnverified?: boolean): Promise<CleanupStats> {
+    return tableCleanupOldVersions.call(this._tbl, olderThan, deleteUnverified)
+      .then((res: { newTable: any, metrics: CleanupStats }) => {
+        this._tbl = res.newTable
+        return res.metrics
+      })
+  }
+
+  /**
+   * Run the compaction process on the table.
+   *
+   * This can be run after making several small appends to optimize the table
+   * for faster reads.
+   *
+   * @param options Advanced options configuring compaction. In most cases, you
+   *                can omit this arguments, as the default options are sensible
+   *                for most tables.
+   * @returns Metrics about the compaction operation.
+   */
+  async compactFiles (options?: CompactionOptions): Promise<CompactionMetrics> {
+    const optionsArg = options ?? {}
+    return tableCompactFiles.call(this._tbl, optionsArg)
+      .then((res: { newTable: any, metrics: CompactionMetrics }) => {
+        this._tbl = res.newTable
+        return res.metrics
+      })
+  }
+}
+
+export interface CleanupStats {
+  /**
+   * The number of bytes removed from disk.
+   */
+  bytesRemoved: number
+  /**
+   * The number of old table versions removed.
+   */
+  oldVersions: number
+}
+
+export interface CompactionOptions {
+  /**
+   * The number of rows per fragment to target. Fragments that have fewer rows
+   * will be compacted into adjacent fragments to produce larger fragments.
+   * Defaults to 1024 * 1024.
+   */
+  targetRowsPerFragment?: number
+  /**
+   * The maximum number of rows per group. Defaults to 1024.
+   */
+  maxRowsPerGroup?: number
+  /**
+   * If true, fragments that have rows that are deleted may be compacted to
+   * remove the deleted rows. This can improve the performance of queries.
+   * Default is true.
+   */
+  materializeDeletions?: boolean
+  /**
+   * A number between 0 and 1, representing the proportion of rows that must be
+   * marked deleted before a fragment is a candidate for compaction to remove
+   * the deleted rows. Default is 10%.
+   */
+  materializeDeletionsThreshold?: number
+  /**
+   * The number of threads to use for compaction. If not provided, defaults to
+   * the number of cores on the machine.
+   */
+  numThreads?: number
+}
+
+export interface CompactionMetrics {
+  /**
+   * The number of fragments that were removed.
+   */
+  fragmentsRemoved: number
+  /**
+   * The number of new fragments that were created.
+   */
+  fragmentsAdded: number
+  /**
+   * The number of files that were removed. Each fragment may have more than one
+   * file.
+   */
+  filesRemoved: number
+  /**
+   * The number of files added. This is typically equal to the number of
+   * fragments added.
+   */
+  filesAdded: number
 }

 /// Config to build IVF_PQ index.
@@ -18,7 +18,7 @@ import * as chai from 'chai'
 import * as chaiAsPromised from 'chai-as-promised'

 import * as lancedb from '../index'
-import { type AwsCredentials, type EmbeddingFunction, MetricType, Query, WriteMode, DefaultWriteOptions, isWriteOptions } from '../index'
+import { type AwsCredentials, type EmbeddingFunction, MetricType, Query, WriteMode, DefaultWriteOptions, isWriteOptions, type LocalTable } from '../index'
 import { FixedSizeList, Field, Int32, makeVector, Schema, Utf8, Table as ArrowTable, vectorFromArray, Float32 } from 'apache-arrow'

 const expect = chai.expect
@@ -446,3 +446,45 @@ describe('WriteOptions', function () {
     })
   })
 })
+
+describe('Compact and cleanup', function () {
+  it('can cleanup after compaction', async function () {
+    const dir = await track().mkdir('lancejs')
+    const con = await lancedb.connect(dir)
+
+    const data = [
+      { price: 10, name: 'foo', vector: [1, 2, 3] },
+      { price: 50, name: 'bar', vector: [4, 5, 6] }
+    ]
+    const table = await con.createTable('t1', data) as LocalTable
+
+    const newData = [
+      { price: 30, name: 'baz', vector: [7, 8, 9] }
+    ]
+    await table.add(newData)
+
+    const compactionMetrics = await table.compactFiles({
+      numThreads: 2
+    })
+    assert.equal(compactionMetrics.fragmentsRemoved, 2)
+    assert.equal(compactionMetrics.fragmentsAdded, 1)
+    assert.equal(await table.countRows(), 3)
+
+    await table.cleanupOldVersions()
+    assert.equal(await table.countRows(), 3)
+
+    // should have no effect, but this validates the arguments are parsed.
+    await table.compactFiles({
+      targetRowsPerFragment: 1024 * 10,
+      maxRowsPerGroup: 1024,
+      materializeDeletions: true,
+      materializeDeletionsThreshold: 0.5,
+      numThreads: 2
+    })
+
+    const cleanupMetrics = await table.cleanupOldVersions(0, true)
+    assert.isAtLeast(cleanupMetrics.bytesRemoved, 1)
+    assert.isAtLeast(cleanupMetrics.oldVersions, 1)
+    assert.equal(await table.countRows(), 3)
+  })
+})
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0
+current_version = 0.3.1
 commit = True
 message = [python] Bump version: {current_version} → {new_version}
 tag = True
@@ -16,7 +16,7 @@ pip install lancedb
 import lancedb
 db = lancedb.connect('<PATH_TO_LANCEDB_DATASET>')
 table = db.open_table('my_table')
-results = table.search([0.1, 0.3]).limit(20).to_df()
+results = table.search([0.1, 0.3]).limit(20).to_list()
 print(results)
 ```

@@ -14,11 +14,12 @@
 import importlib.metadata
 from typing import Optional

+__version__ = importlib.metadata.version("lancedb")
+
 from .db import URI, DBConnection, LanceDBConnection
 from .remote.db import RemoteDBConnection
 from .schema import vector
-
-__version__ = importlib.metadata.version("lancedb")
+from .utils import sentry_log


 def connect(
python/lancedb/cli/__init__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
python/lancedb/cli/cli.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import click
+
+from lancedb.utils import CONFIG
+
+
+@click.group()
+@click.version_option(help="LanceDB command line interface entry point")
+def cli():
+    "LanceDB command line interface"
+
+
+diagnostics_help = """
+Enable or disable LanceDB diagnostics. When enabled, LanceDB will send anonymous events to help us improve LanceDB.
+These diagnostics are used only for error reporting and no data is collected. You can find more about diagnosis on
+our docs: https://lancedb.github.io/lancedb/cli_config/
+"""
+
+
+@cli.command(help=diagnostics_help)
+@click.option("--enabled/--disabled", default=True)
+def diagnostics(enabled):
+    CONFIG.update({"diagnostics": True if enabled else False})
+    click.echo("LanceDB diagnostics is %s" % ("enabled" if enabled else "disabled"))
+
+
+@cli.command(help="Show current LanceDB configuration")
+def config():
+    # TODO: pretty print as table with colors and formatting
+    click.echo("Current LanceDB configuration:")
+    cfg = CONFIG.copy()
+    cfg.pop("uuid")  # Don't show uuid as it is not configurable
+    for item, amount in cfg.items():
+        click.echo("{} ({})".format(item, amount))
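A quick way to exercise the new click group without installing the console script is click's built-in test runner — a minimal sketch, assuming the package above is importable:

```python
from click.testing import CliRunner

from lancedb.cli.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["diagnostics", "--disabled"])
print(result.output)  # "LanceDB diagnostics is disabled"

result = runner.invoke(cli, ["config"])
print(result.output)  # current config values, uuid omitted
```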
@@ -12,6 +12,9 @@
 # limitations under the License.
 from __future__ import annotations

+import deprecation
+
+from . import __version__
 from .exceptions import MissingColumnError, MissingValueError
 from .util import safe_import_pandas

@@ -43,7 +46,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
     this how many tokens, but depending on the input data, it could be sentences,
     paragraphs, messages, etc.

-    >>> contextualize(data).window(3).stride(1).text_col('token').to_df()
+    >>> contextualize(data).window(3).stride(1).text_col('token').to_pandas()
                 token  document_id
     0     The quick brown            1
     1     quick brown fox            1
@@ -56,7 +59,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
     8          dog I love            1
     9   I love sandwiches            2
     10    love sandwiches            2
-    >>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_df()
+    >>> contextualize(data).window(7).stride(1).min_window_size(7).text_col('token').to_pandas()
                 token  document_id
     0   The quick brown fox jumped over the            1
     1  quick brown fox jumped over the lazy            1
@@ -68,7 +71,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
     ``stride`` determines how many rows to skip between each window start. This can
     be used to reduce the total number of windows generated.

-    >>> contextualize(data).window(4).stride(2).text_col('token').to_df()
+    >>> contextualize(data).window(4).stride(2).text_col('token').to_pandas()
                 token  document_id
     0    The quick brown fox            1
     2  brown fox jumped over            1
@@ -81,7 +84,7 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
     context windows that don't cross document boundaries. In this case, we can
     pass ``document_id`` as the group by.

-    >>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_df()
+    >>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_pandas()
                 token  document_id
     0    The quick brown fox            1
     2  brown fox jumped over            1
@@ -93,14 +96,14 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer:
     This can be used to trim the last few context windows which have size less than
     ``min_window_size``. By default context windows of size 1 are skipped.

-    >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_df()
+    >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_pandas()
                 token  document_id
     0  The quick brown fox jumped over            1
     3     fox jumped over the lazy dog            1
     6                     the lazy dog            1
     9                I love sandwiches            2

-    >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_df()
+    >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_pandas()
                 token  document_id
     0  The quick brown fox jumped over            1
     3     fox jumped over the lazy dog            1
@@ -176,7 +179,16 @@ class Contextualizer:
         self._min_window_size = min_window_size
         return self

+    @deprecation.deprecated(
+        deprecated_in="0.3.1",
+        removed_in="0.4.0",
+        current_version=__version__,
+        details="Use the bar function instead",
+    )
     def to_df(self) -> "pd.DataFrame":
+        return self.to_pandas()
+
+    def to_pandas(self) -> "pd.DataFrame":
         """Create the context windows and return a DataFrame."""
         if pd is None:
             raise ImportError(
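The decorator above comes from the `deprecation` package on PyPI (also wired into `query.py` below): it emits a `DeprecationWarning` when the wrapped method is called and appends the removal schedule to its docstring. A standalone sketch of the pattern with hypothetical `old_name`/`new_name` methods:

```python
import deprecation

__version__ = "0.3.1"  # normally importlib.metadata.version("lancedb")

class Example:
    @deprecation.deprecated(
        deprecated_in="0.3.1",
        removed_in="0.4.0",
        current_version=__version__,
        details="Use new_name() instead",
    )
    def old_name(self):
        # Deprecated alias that delegates to the replacement.
        return self.new_name()

    def new_name(self):
        return 42
```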
@@ -12,6 +12,7 @@
 # limitations under the License.


+from .cohere import CohereEmbeddingFunction
 from .functions import (
     EmbeddingFunction,
     EmbeddingFunctionConfig,
python/lancedb/embeddings/cohere.py (new file, 86 lines)
@@ -0,0 +1,86 @@
+# Copyright (c) 2023. LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import ClassVar, List, Union
+
+import numpy as np
+
+from .functions import TextEmbeddingFunction, register
+from .utils import api_key_not_found_help
+
+
+@register("cohere")
+class CohereEmbeddingFunction(TextEmbeddingFunction):
+    """
+    An embedding function that uses the Cohere API
+
+    https://docs.cohere.com/docs/multilingual-language-models
+
+    Parameters
+    ----------
+    name: str, default "embed-multilingual-v2.0"
+        The name of the model to use. See the Cohere documentation for a list of available models.
+
+    Examples
+    --------
+    import lancedb
+    from lancedb.pydantic import LanceModel, Vector
+    from lancedb.embeddings import EmbeddingFunctionRegistry
+
+    cohere = EmbeddingFunctionRegistry.get_instance().get("cohere").create(name="embed-multilingual-v2.0")
+
+    class TextModel(LanceModel):
+        text: str = cohere.SourceField()
+        vector: Vector(cohere.ndims()) = cohere.VectorField()
+
+    data = [ { "text": "hello world" },
+             { "text": "goodbye world" }]
+
+    db = lancedb.connect("~/.lancedb")
+    tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+    tbl.add(data)
+    """
+
+    name: str = "embed-multilingual-v2.0"
+    client: ClassVar = None
+
+    def ndims(self):
+        # TODO: fix hardcoding
+        return 768
+
+    def generate_embeddings(
+        self, texts: Union[List[str], np.ndarray]
+    ) -> List[np.array]:
+        """
+        Get the embeddings for the given texts
+
+        Parameters
+        ----------
+        texts: list[str] or np.ndarray (of str)
+            The texts to embed
+        """
+        # TODO retry, rate limit, token limit
+        self._init_client()
+        rs = CohereEmbeddingFunction.client.embed(texts=texts, model=self.name)
+
+        return [emb for emb in rs.embeddings]
+
+    def _init_client(self):
+        cohere = self.safe_import("cohere")
+        if CohereEmbeddingFunction.client is None:
+            if os.environ.get("COHERE_API_KEY") is None:
+                api_key_not_found_help("cohere")
+            CohereEmbeddingFunction.client = cohere.Client(os.environ["COHERE_API_KEY"])
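A condensed usage sketch mirroring the class docstring above — requires `pip install cohere` and a `COHERE_API_KEY` in the environment:

```python
import os

from lancedb.embeddings import EmbeddingFunctionRegistry

assert os.environ.get("COHERE_API_KEY"), "set COHERE_API_KEY first"

cohere = EmbeddingFunctionRegistry.get_instance().get("cohere").create()
embeddings = cohere.generate_embeddings(["hello world", "goodbye world"])
print(len(embeddings), len(embeddings[0]))  # 2 vectors, 768 dims each
```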
@@ -21,6 +21,7 @@ from lance.vector import vec_to_table
 from retry import retry

 from ..util import safe_import_pandas
+from ..utils.general import LOGGER

 pd = safe_import_pandas()
 DATA = Union[pa.Table, "pd.DataFrame"]

@@ -152,3 +153,8 @@ class FunctionWrapper:
             yield from tqdm(_chunker(arr), total=math.ceil(length / self._batch_size))
         else:
             yield from _chunker(arr)
+
+
+def api_key_not_found_help(provider):
+    LOGGER.error(f"Could not find API key for {provider}.")
+    raise ValueError(f"Please set the {provider.upper()}_API_KEY environment variable.")
@@ -16,10 +16,12 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import List, Literal, Optional, Type, Union

+import deprecation
 import numpy as np
 import pyarrow as pa
 import pydantic

+from . import __version__
 from .common import VECTOR_COLUMN_NAME
 from .pydantic import LanceModel
 from .util import safe_import_pandas
@@ -127,7 +129,24 @@ class LanceQueryBuilder(ABC):
         self._columns = None
         self._where = None

+    @deprecation.deprecated(
+        deprecated_in="0.3.1",
+        removed_in="0.4.0",
+        current_version=__version__,
+        details="Use the bar function instead",
+    )
     def to_df(self) -> "pd.DataFrame":
+        """
+        Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.
+
+        Execute the query and return the results as a pandas DataFrame.
+        In addition to the selected columns, LanceDB also returns a vector
+        and also the "_distance" column which is the distance between the query
+        vector and the returned vector.
+        """
+        return self.to_pandas()
+
+    def to_pandas(self) -> "pd.DataFrame":
         """
         Execute the query and return the results as a pandas DataFrame.
         In addition to the selected columns, LanceDB also returns a vector
@@ -148,6 +167,16 @@ class LanceQueryBuilder(ABC):
         """
         raise NotImplementedError

+    def to_list(self) -> List[dict]:
+        """
+        Execute the query and return the results as a list of dictionaries.
+
+        Each list entry is a dictionary with the selected column names as keys,
+        or all table columns if `select` is not called. The vector and the "_distance"
+        fields are returned whether or not they're explicitly selected.
+        """
+        return self.to_arrow().to_pylist()
+
     def to_pydantic(self, model: Type[LanceModel]) -> List[LanceModel]:
         """Return the table as a list of pydantic models.

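With this addition a query builder has both a DataFrame and a plain-Python sink; a minimal sketch assuming an existing table:

```python
import lancedb

db = lancedb.connect("./data")     # assumed path
tbl = db.open_table("my_table")    # assumes the table exists
query = tbl.search([0.1, 0.3]).limit(5)

df = query.to_pandas()  # pandas DataFrame; to_df() is now a deprecated alias
rows = query.to_list()  # list of dicts via Arrow, incl. "vector" and "_distance"
```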
@@ -232,7 +261,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
     ...           .where("b < 10")
     ...           .select(["b"])
     ...           .limit(2)
-    ...           .to_df())
+    ...           .to_pandas())
        b      vector  _distance
     0  6  [0.4, 0.4]        0.0
     """
@@ -16,6 +16,7 @@ from __future__ import annotations
 import inspect
 import os
 from abc import ABC, abstractmethod
+from datetime import timedelta
 from functools import cached_property
 from typing import Any, Iterable, List, Optional, Union

@@ -24,7 +25,7 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 from lance import LanceDataset
-from lance.dataset import ReaderLike
+from lance.dataset import CleanupStats, ReaderLike
 from lance.vector import vec_to_table

 from .common import DATA, VEC, VECTOR_COLUMN_NAME
@@ -33,6 +34,7 @@ from .embeddings.functions import EmbeddingFunctionConfig
 from .pydantic import LanceModel
 from .query import LanceQueryBuilder, Query
 from .util import fs_from_uri, safe_import_pandas
+from .utils.events import register_event

 pd = safe_import_pandas()

@@ -136,7 +138,7 @@ class Table(ABC):

     Can query the table with [Table.search][lancedb.table.Table.search].

-    >>> table.search([0.4, 0.4]).select(["b"]).to_df()
+    >>> table.search([0.4, 0.4]).select(["b"]).to_pandas()
        b      vector  _distance
     0  4  [0.5, 1.3]       0.82
     1  2  [1.1, 1.2]       1.13
@@ -394,6 +396,17 @@ class LanceTable(Table):
             raise ValueError(f"Invalid version {version}")
         self._reset_dataset(version=version)

+        try:
+            # Accessing the property updates the cached value
+            _ = self._dataset
+        except Exception as e:
+            if "not found" in str(e):
+                raise ValueError(
+                    f"Version {version} no longer exists. Was it cleaned up?"
+                )
+            else:
+                raise e
+
     def restore(self, version: int = None):
         """Restore a version of the table. This is an in-place operation.

@@ -496,6 +509,7 @@ class LanceTable(Table):
             accelerator=accelerator,
         )
         self._reset_dataset()
+        register_event("create_index")

     def create_fts_index(self, field_names: Union[str, List[str]]):
         """Create a full-text search index on the table.
@@ -514,6 +528,7 @@ class LanceTable(Table):
             field_names = [field_names]
         index = create_index(self._get_fts_index_path(), field_names)
         populate_index(index, self, field_names)
+        register_event("create_fts_index")

     def _get_fts_index_path(self):
         return os.path.join(self._dataset_uri, "_indices", "tantivy")
@@ -566,6 +581,7 @@ class LanceTable(Table):
             )
         lance.write_dataset(data, self._dataset_uri, schema=self.schema, mode=mode)
         self._reset_dataset()
+        register_event("add")

     def merge(
         self,
@@ -629,6 +645,7 @@ class LanceTable(Table):
             other_table, left_on=left_on, right_on=right_on, schema=schema
         )
         self._reset_dataset()
+        register_event("merge")

     @cached_property
     def embedding_functions(self) -> dict:
@@ -679,6 +696,7 @@ class LanceTable(Table):
         and also the "_distance" column which is the distance between the query
         vector and the returned vector.
         """
+        register_event("search")
         return LanceQueryBuilder.create(
             self, query, query_type, vector_column_name=vector_column_name
         )
@@ -782,6 +800,7 @@ class LanceTable(Table):
         if data is not None:
             table.add(data)

+        register_event("create_table")
         return table

     @classmethod
@@ -847,6 +866,7 @@ class LanceTable(Table):
         self.delete(where)
         self.add(orig_data, mode="append")
         self._reset_dataset()
+        register_event("update")

     def _execute_query(self, query: Query) -> pa.Table:
         ds = self.to_lance()
@@ -870,6 +890,48 @@ class LanceTable(Table):
             },
         )

+    def cleanup_old_versions(
+        self,
+        older_than: Optional[timedelta] = None,
+        *,
+        delete_unverified: bool = False,
+    ) -> CleanupStats:
+        """
+        Clean up old versions of the table, freeing disk space.
+
+        Parameters
+        ----------
+        older_than: timedelta, default None
+            The minimum age of the version to delete. If None, then this defaults
+            to two weeks.
+        delete_unverified: bool, default False
+            Because they may be part of an in-progress transaction, files newer
+            than 7 days old are not deleted by default. If you are sure that
+            there are no in-progress transactions, then you can set this to True
+            to delete all files older than `older_than`.
+
+        Returns
+        -------
+        CleanupStats
+            The stats of the cleanup operation, including how many bytes were
+            freed.
+        """
+        return self.to_lance().cleanup_old_versions(
+            older_than, delete_unverified=delete_unverified
+        )
+
+    def compact_files(self, *args, **kwargs):
+        """
+        Run the compaction process on the table.
+
+        This can be run after making several small appends to optimize the table
+        for faster reads.
+
+        Arguments are passed onto :meth:`lance.dataset.DatasetOptimizer.compact_files`.
+        For most cases, the default should be fine.
+        """
+        return self.to_lance().optimize.compact_files(*args, **kwargs)
+
+
 def _sanitize_schema(
     data: pa.Table,
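A hedged sketch of the two new Python maintenance calls, assuming a local table that has accumulated several small appends; the `CleanupStats` field names follow the underlying `lance` dataset API and are an assumption here:

```python
from datetime import timedelta

import lancedb

db = lancedb.connect("./data")    # assumed path
tbl = db.open_table("my_table")   # assumes the table exists

tbl.compact_files()                # merge small fragments for faster reads
stats = tbl.cleanup_old_versions(  # reclaim space from versions > 1 hour old
    older_than=timedelta(hours=1)
)
print(stats.bytes_removed, stats.old_versions)  # assumed field names
```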
python/lancedb/utils/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+# Copyright 2023 LanceDB Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .config import Config
+
+CONFIG = Config()
116
python/lancedb/utils/config.py
Normal file
116
python/lancedb/utils/config.py
Normal file
@@ -0,0 +1,116 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import hashlib
import os
import platform
import uuid
from pathlib import Path

from .general import LOGGER, is_dir_writeable, yaml_load, yaml_save


def get_user_config_dir(sub_dir="lancedb"):
    """
    Get the user config directory.

    Args:
        sub_dir (str): The name of the subdirectory to create.

    Returns:
        (Path): The path to the user config directory.
    """
    # Return the appropriate config directory for each operating system
    if platform.system() == "Windows":
        path = Path.home() / "AppData" / "Roaming" / sub_dir
    elif platform.system() == "Darwin":
        path = Path.home() / "Library" / "Application Support" / sub_dir
    elif platform.system() == "Linux":
        path = Path.home() / ".config" / sub_dir
    else:
        raise ValueError(f"Unsupported operating system: {platform.system()}")

    # GCP and AWS Lambda fix: only /tmp is writeable
    if not is_dir_writeable(path.parent):
        LOGGER.warning(
            f"WARNING ⚠️ user config directory '{path}' is not writeable, defaulting to '/tmp' or CWD. "
            "Alternatively, you can define a LANCEDB_CONFIG_DIR environment variable for this path."
        )
        path = (
            Path("/tmp") / sub_dir
            if is_dir_writeable("/tmp")
            else Path.cwd() / sub_dir
        )

    # Create the subdirectory if it does not exist
    path.mkdir(parents=True, exist_ok=True)

    return path


USER_CONFIG_DIR = Path(os.getenv("LANCEDB_CONFIG_DIR") or get_user_config_dir())
CONFIG_FILE = USER_CONFIG_DIR / "config.yaml"


class Config(dict):
    """
    Manages the lancedb config stored in a YAML file.

    Args:
        file (str | Path): Path to the lancedb config YAML file.
            Default is USER_CONFIG_DIR / 'config.yaml'.
    """

    def __init__(self, file=CONFIG_FILE):
        self.file = Path(file)
        self.defaults = {  # default global config values
            "diagnostics": True,
            "uuid": hashlib.sha256(str(uuid.getnode()).encode()).hexdigest(),
        }

        super().__init__(copy.deepcopy(self.defaults))

        if not self.file.exists():
            self.save()

        self.load()
        correct_keys = self.keys() == self.defaults.keys()
        correct_types = all(
            type(a) is type(b) for a, b in zip(self.values(), self.defaults.values())
        )
        if not (correct_keys and correct_types):
            LOGGER.warning(
                "WARNING ⚠️ LanceDB settings reset to default values. This may be due to a "
                "problem with your settings or a recent package update. "
                f"\nView settings & usage with 'lancedb config' or at '{self.file}'"
            )
            self.reset()

    def load(self):
        """Loads settings from the YAML file."""
        super().update(yaml_load(self.file))

    def save(self):
        """Saves the current settings to the YAML file."""
        yaml_save(self.file, dict(self))

    def update(self, *args, **kwargs):
        """Updates a setting value in the current settings."""
        super().update(*args, **kwargs)
        self.save()

    def reset(self):
        """Resets the settings to defaults and saves them."""
        self.clear()
        self.update(self.defaults)
        self.save()
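Since `update()` and `reset()` both end in `save()`, every mutation is persisted immediately. A short round-trip sketch (the path shown is the Linux default):

```python
from lancedb.utils.config import Config

cfg = Config()  # loads or creates e.g. ~/.config/lancedb/config.yaml
cfg.update({"diagnostics": False})  # written straight back to the YAML file
assert Config()["diagnostics"] is False  # a fresh instance sees the change
```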

python/lancedb/utils/events.py (new file)
@@ -0,0 +1,161 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import importlib.metadata
import platform
import random
import sys
import time

from lancedb.utils import CONFIG
from lancedb.utils.general import TryExcept

from .general import (
    PLATFORMS,
    get_git_origin_url,
    is_git_dir,
    is_github_actions_ci,
    is_online,
    is_pip_package,
    is_pytest_running,
    threaded_request,
)


class _Events:
    """
    A class for collecting anonymous event analytics. Event analytics are enabled
    when ``diagnostics=True`` in the config and disabled when ``diagnostics=False``.

    You can enable or disable diagnostics by running ``lancedb diagnostics --enabled``
    or ``lancedb diagnostics --disabled``.

    Attributes
    ----------
    url : str
        The URL to send anonymous events to.
    rate_limit : float
        The rate limit in seconds for sending events.
    metadata : dict
        A dictionary containing metadata about the environment.
    enabled : bool
        A flag that enables or disables events based on certain conditions.
    """

    _instance = None

    url = "https://app.posthog.com/capture/"
    headers = {"Content-Type": "application/json"}
    api_key = "phc_oENDjGgHtmIDrV6puUiFem2RB4JA8gGWulfdulmMdZP"
    # This API key is write-only and is safe to expose in the codebase.

    def __init__(self):
        """
        Initializes the _Events object with default values for events, rate_limit,
        and metadata.
        """
        self.events = []  # events list
        self.max_events = 25  # max events to store in memory
        self.rate_limit = 60.0  # rate limit (seconds)
        self.time = 0.0

        if is_git_dir():
            install = "git"
        elif is_pip_package():
            install = "pip"
        else:
            install = "other"
        self.metadata = {
            "cli": sys.argv[0],
            "install": install,
            "python": ".".join(platform.python_version_tuple()[:2]),
            "version": importlib.metadata.version("lancedb"),
            "platforms": PLATFORMS,
            "session_id": round(random.random() * 1e15),
            # 'engagement_time_msec': 1000  # TODO: we may be interested in this metric later
        }

        TESTS_RUNNING = is_pytest_running() or is_github_actions_ci()
        ONLINE = is_online()
        self.enabled = (
            CONFIG["diagnostics"]
            and not TESTS_RUNNING
            and ONLINE
            and (
                is_pip_package()
                or get_git_origin_url() == "https://github.com/lancedb/lancedb.git"
            )
        )

    def __call__(self, event_name, params=None):
        """
        Attempts to add a new event to the events list, and sends the batch if the
        rate limit is reached.

        Args
        ----
        event_name : str
            The name of the event to be logged.
        params : dict, optional
            A dictionary of additional parameters to be logged with the event.
        """
        # NOTE: We might need a way to tag a session with a label to check usage
        # from a source. Setting the label should be exposed to the user.
        if not self.enabled:
            return
        params = params or {}
        if (
            len(self.events) < self.max_events
        ):  # Events list limited to 25 events (drop any events past this)
            params.update(self.metadata)
            self.events.append(
                {
                    "event": event_name,
                    "properties": params,
                    "timestamp": datetime.datetime.now(
                        tz=datetime.timezone.utc
                    ).isoformat(),
                    "distinct_id": CONFIG["uuid"],
                }
            )

        # Check rate limit
        t = time.time()
        if (t - self.time) < self.rate_limit:
            return

        # Over the rate limit; send now
        data = {
            "api_key": self.api_key,
            "distinct_id": CONFIG["uuid"],  # PostHog needs this to accept the event
            "batch": self.events,
        }

        # POST equivalent to requests.post(self.url, json=data).
        # A threaded request is used to avoid blocking, retries are disabled, and
        # verbose is disabled to avoid any possible disruption in the console.
        threaded_request(
            method="post",
            url=self.url,
            headers=self.headers,
            json=data,
            retry=0,
            verbose=False,
        )

        # Flush & reset
        self.events = []
        self.time = t


@TryExcept(verbose=False)
def register_event(name: str, **kwargs):
    if _Events._instance is None:
        _Events._instance = _Events()

    _Events._instance(name, **kwargs)
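For reference, instrumented call sites invoke the hook like this; the event name below is illustrative, and the call is a no-op when diagnostics are disabled, offline, or under pytest/CI:

```python
from lancedb.utils.events import register_event

register_event("create_table")  # @TryExcept also swallows any reporting failure
```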

python/lancedb/utils/general.py (new file)
@@ -0,0 +1,445 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import importlib.util
import logging.config
import os
import platform
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import Union

import requests
import yaml

LOGGING_NAME = "lancedb"
VERBOSE = (
    str(os.getenv("LANCEDB_VERBOSE", True)).lower() == "true"
)  # global verbose mode


def set_logging(name=LOGGING_NAME, verbose=True):
    """Sets up logging for the given name.

    Parameters
    ----------
    name : str, optional
        The name of the logger. Default is 'lancedb'.
    verbose : bool, optional
        Whether to enable verbose logging. Default is True.
    """
    rank = int(os.getenv("RANK", -1))  # rank in world for multi-GPU training
    level = logging.INFO if verbose and rank in {-1, 0} else logging.ERROR
    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {name: {"format": "%(message)s"}},
            "handlers": {
                name: {
                    "class": "logging.StreamHandler",
                    "formatter": name,
                    "level": level,
                }
            },
            "loggers": {name: {"level": level, "handlers": [name], "propagate": False}},
        }
    )


set_logging(LOGGING_NAME, verbose=VERBOSE)
LOGGER = logging.getLogger(LOGGING_NAME)

def is_pip_package(filepath: str = __name__) -> bool:
    """Determines if the file at the given filepath is part of a pip package.

    Parameters
    ----------
    filepath : str, optional
        The filepath to check. Default is the current file.

    Returns
    -------
    bool
        True if the file is part of a pip package, False otherwise.
    """
    # Get the spec for the module
    spec = importlib.util.find_spec(filepath)

    # Return whether the spec and its origin are both present (indicating a package)
    return spec is not None and spec.origin is not None


def is_pytest_running():
    """Determines whether pytest is currently running.

    Returns
    -------
    bool
        True if pytest is running, False otherwise.
    """
    return (
        ("PYTEST_CURRENT_TEST" in os.environ)
        or ("pytest" in sys.modules)
        or ("pytest" in Path(sys.argv[0]).stem)
    )


def is_github_actions_ci() -> bool:
    """
    Determine if the current environment is a GitHub Actions CI Python runner.

    Returns
    -------
    bool
        True if the current environment is a GitHub Actions CI Python runner,
        False otherwise.
    """
    return (
        "GITHUB_ACTIONS" in os.environ
        and "RUNNER_OS" in os.environ
        and "RUNNER_TOOL_CACHE" in os.environ
    )


def is_git_dir():
    """
    Determines whether the current file is part of a git repository.

    Returns
    -------
    bool
        True if the current file is part of a git repository, False otherwise.
    """
    return get_git_dir() is not None


def is_online() -> bool:
    """
    Check internet connectivity by attempting to connect to a known online host.

    Returns
    -------
    bool
        True if the connection is successful, False otherwise.
    """
    import socket

    for host in "1.1.1.1", "8.8.8.8", "223.5.5.5":  # Cloudflare, Google, AliDNS
        try:
            test_connection = socket.create_connection(address=(host, 53), timeout=2)
        except (socket.timeout, socket.gaierror, OSError):
            continue
        else:
            # If the connection was successful, close it to avoid a ResourceWarning
            test_connection.close()
            return True
    return False


def is_dir_writeable(dir_path: Union[str, Path]) -> bool:
    """Check if a directory is writeable.

    Parameters
    ----------
    dir_path : Union[str, Path]
        The path to the directory.

    Returns
    -------
    bool
        True if the directory is writeable, False otherwise.
    """
    return os.access(str(dir_path), os.W_OK)


def is_colab():
    """Check if the current script is running inside a Google Colab notebook.

    Returns
    -------
    bool
        True if running inside a Colab notebook, False otherwise.
    """
    return "COLAB_RELEASE_TAG" in os.environ or "COLAB_BACKEND_VERSION" in os.environ


def is_kaggle():
    """Check if the current script is running inside a Kaggle kernel.

    Returns
    -------
    bool
        True if running inside a Kaggle kernel, False otherwise.
    """
    return (
        os.environ.get("PWD") == "/kaggle/working"
        and os.environ.get("KAGGLE_URL_BASE") == "https://www.kaggle.com"
    )


def is_jupyter():
    """Check if the current script is running inside a Jupyter Notebook.

    Returns
    -------
    bool
        True if running inside a Jupyter Notebook, False otherwise.
    """
    with contextlib.suppress(Exception):
        from IPython import get_ipython

        return get_ipython() is not None
    return False


def is_docker() -> bool:
    """Determine if the script is running inside a Docker container.

    Returns
    -------
    bool
        True if the script is running inside a Docker container, False otherwise.
    """
    file = Path("/proc/self/cgroup")
    if file.exists():
        with open(file) as f:
            return "docker" in f.read()
    else:
        return False


def get_git_dir():
    """Determine whether the current file is part of a git repository and, if so,
    return the repository root directory. If the current file is not part of a
    git repository, returns None.

    Returns
    -------
    Path | None
        Git root directory if found, None otherwise.
    """
    for d in Path(__file__).parents:
        if (d / ".git").is_dir():
            return d


def get_git_origin_url():
    """Retrieve the origin URL of a git repository.

    Returns
    -------
    str | None
        The origin URL of the git repository, or None if not in a git directory.
    """
    if is_git_dir():
        with contextlib.suppress(subprocess.CalledProcessError):
            origin = subprocess.check_output(
                ["git", "config", "--get", "remote.origin.url"]
            )
            return origin.decode().strip()

def yaml_save(file="data.yaml", data=None, header=""):
    """Save YAML data to a file.

    Parameters
    ----------
    file : str, optional
        File name, by default 'data.yaml'.
    data : dict, optional
        Data to save in YAML format, by default None.
    header : str, optional
        YAML header to add, by default "".
    """
    if data is None:
        data = {}
    file = Path(file)
    if not file.parent.exists():
        # Create parent directories if they don't exist
        file.parent.mkdir(parents=True, exist_ok=True)

    # Convert Path objects to strings
    for k, v in data.items():
        if isinstance(v, Path):
            data[k] = str(v)

    # Dump data to file in YAML format
    with open(file, "w", errors="ignore", encoding="utf-8") as f:
        if header:
            f.write(header)
        yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)


def yaml_load(file="data.yaml", append_filename=False):
    """
    Load YAML data from a file.

    Parameters
    ----------
    file : str, optional
        File name. Default is 'data.yaml'.
    append_filename : bool, optional
        Add the YAML filename to the returned dictionary. Default is False.

    Returns
    -------
    dict
        YAML data, optionally with the file name under the 'yaml_file' key.
    """
    assert Path(file).suffix in (
        ".yaml",
        ".yml",
    ), f"Attempting to load non-YAML file {file} with yaml_load()"
    with open(file, errors="ignore", encoding="utf-8") as f:
        s = f.read()  # string

    # Always return a dict (yaml.safe_load() may return None for empty files)
    data = yaml.safe_load(s) or {}
    if append_filename:
        data["yaml_file"] = str(file)
    return data
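A quick round-trip sketch of the two YAML helpers (the temporary path is illustrative):

```python
from lancedb.utils.general import yaml_load, yaml_save

yaml_save("/tmp/demo.yaml", {"diagnostics": True, "uuid": "abc123"})
data = yaml_load("/tmp/demo.yaml", append_filename=True)
assert data["diagnostics"] is True and data["yaml_file"] == "/tmp/demo.yaml"
```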

def yaml_print(yaml_file: Union[str, Path, dict]) -> None:
    """
    Pretty-prints a YAML file or a YAML-formatted dictionary.

    Parameters
    ----------
    yaml_file : Union[str, Path, dict]
        The file path of the YAML file or a YAML-formatted dictionary.

    Returns
    -------
    None
    """
    yaml_dict = (
        yaml_load(yaml_file) if isinstance(yaml_file, (str, Path)) else yaml_file
    )
    dump = yaml.dump(yaml_dict, sort_keys=False, allow_unicode=True)
    LOGGER.info(f"Printing '{yaml_file}'\n\n{dump}")


PLATFORMS = [platform.system()]
if is_colab():
    PLATFORMS.append("Colab")
if is_kaggle():
    PLATFORMS.append("Kaggle")
if is_jupyter():
    PLATFORMS.append("Jupyter")
if is_docker():
    PLATFORMS.append("Docker")

PLATFORMS = "|".join(PLATFORMS)

class TryExcept(contextlib.ContextDecorator):
    """
    TryExcept context manager.
    Usage: @TryExcept() decorator or 'with TryExcept():' context manager.
    """

    def __init__(self, msg="", verbose=True):
        """
        Parameters
        ----------
        msg : str, optional
            Custom message to display in case of exception, by default "".
        verbose : bool, optional
            Whether to display the message, by default True.
        """
        self.msg = msg
        self.verbose = verbose

    def __enter__(self):
        pass

    def __exit__(self, exc_type, value, traceback):
        if self.verbose and value:
            LOGGER.info(f"{self.msg}{': ' if self.msg else ''}{value}")
        return True
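Both usage forms from the docstring, sketched; note that `__exit__` returning True is what suppresses the exception:

```python
@TryExcept(msg="request failed")
def flaky():
    raise RuntimeError("boom")

flaky()  # logs "request failed: boom" instead of raising

with TryExcept(verbose=False):
    raise ValueError("silently swallowed")
```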

def threaded_request(
    method, url, retry=3, timeout=30, thread=True, code=-1, verbose=True, **kwargs
):
    """
    Makes an HTTP request using the 'requests' library, with exponential backoff
    retries up to a specified timeout.

    Parameters
    ----------
    method : str
        The HTTP method to use for the request. Choices are 'post' and 'get'.
    url : str
        The URL to make the request to.
    retry : int, optional
        Number of retries to attempt before giving up, by default 3.
    timeout : int, optional
        Timeout in seconds after which the function will give up retrying, by default 30.
    thread : bool, optional
        Whether to execute the request in a separate daemon thread, by default True.
    code : int, optional
        An identifier for the request, used for logging purposes, by default -1.
    verbose : bool, optional
        A flag to determine whether to print out to console or not, by default True.

    Returns
    -------
    requests.Response | threading.Thread
        The HTTP response object, or the daemon thread itself if the request is
        executed in a separate thread.
    """
    retry_codes = ()  # retry only these codes. TODO: add codes if needed in future (500, 408)

    @TryExcept(verbose=verbose)
    def func(method, url, **kwargs):
        """Make HTTP requests with retries and timeouts, with optional progress tracking."""
        response = None
        t0 = time.time()
        for i in range(retry + 1):
            if (time.time() - t0) > timeout:
                break
            response = requests.request(method, url, **kwargs)
            if response.status_code < 300:  # good return codes are in the 2xx range
                break
            try:
                m = response.json().get("message", "No JSON message.")
            except AttributeError:
                m = "Unable to read JSON."
            if i == 0:
                if response.status_code in retry_codes:
                    m += f" Retrying {retry}x for {timeout}s." if retry else ""
                elif response.status_code == 429:  # rate limit
                    m = "Rate limit reached."
                if verbose:
                    LOGGER.warning(f"{m} {response.status_code} #{code}")
                if response.status_code not in retry_codes:
                    return response
            time.sleep(2**i)  # exponential backoff
        return response

    args = method, url
    if thread:
        request_thread = threading.Thread(
            target=func, args=args, kwargs=kwargs, daemon=True
        )
        request_thread.start()
        return request_thread
    else:
        return func(*args, **kwargs)
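A sketch of the retry envelope: with `retry=3`, failed attempts sleep 1 s, 2 s, then 4 s between tries, bounded by `timeout`. Run synchronously here for clarity; the URL is a placeholder:

```python
resp = threaded_request(
    "get", "https://example.com/health", retry=3, timeout=30, thread=False
)
if resp is not None and resp.status_code < 300:
    print("healthy")
```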

python/lancedb/utils/sentry_log.py (new file)
@@ -0,0 +1,112 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bdb
import importlib.metadata
import logging
import sys
from pathlib import Path

from lancedb.utils import CONFIG

from .general import (
    PLATFORMS,
    TryExcept,
    is_git_dir,
    is_github_actions_ci,
    is_online,
    is_pip_package,
    is_pytest_running,
)


@TryExcept(verbose=False)
def set_sentry():
    """
    Initialize the Sentry SDK for error tracking and reporting. Only used if the
    sentry_sdk package is installed and diagnostics=True in the config. Run
    'lancedb config' to see the settings YAML file.

    Conditions required to send errors (ALL conditions must be met, or no errors
    will be reported):
    - sentry_sdk package is installed
    - diagnostics=True in the config
    - pytest is not running
    - running from a pip package installation
    - online environment

    The function also configures the Sentry SDK to ignore KeyboardInterrupt and
    FileNotFoundError exceptions for now.

    Additionally, the function sets custom tags and user information for Sentry events.
    """

    def before_send(event, hint):
        """
        Modify the event before sending it to Sentry, based on specific exception
        types and messages.

        Args:
            event (dict): The event dictionary containing information about the error.
            hint (dict): A dictionary containing additional information about the error.

        Returns:
            dict: The modified event, or None if the event should not be sent to Sentry.
        """
        if "exc_info" in hint:
            exc_type, exc_value, tb = hint["exc_info"]
            if "out of memory" in str(exc_value).lower():
                return None

        if is_git_dir():
            install = "git"
        elif is_pip_package():
            install = "pip"
        else:
            install = "other"

        event["tags"] = {
            "sys_argv": sys.argv[0],
            "sys_argv_name": Path(sys.argv[0]).name,
            "install": install,
            "platforms": PLATFORMS,
            "version": importlib.metadata.version("lancedb"),
        }
        return event

    TESTS_RUNNING = is_pytest_running() or is_github_actions_ci()
    ONLINE = is_online()
    if CONFIG["diagnostics"] and not TESTS_RUNNING and ONLINE and is_pip_package():
        # and not is_git_dir():  # not running inside a git dir. Maybe too restrictive?

        # If the sentry_sdk package is not installed, return and do not use Sentry
        try:
            import sentry_sdk  # noqa
        except ImportError:
            return

        sentry_sdk.init(
            dsn="https://c63ef8c64e05d1aa1a96513361f3ca2f@o4505950840946688.ingest.sentry.io/4505950933614592",
            debug=False,
            include_local_variables=False,
            traces_sample_rate=1.0,
            environment="production",  # 'dev' or 'production'
            before_send=before_send,
            ignore_errors=[KeyboardInterrupt, FileNotFoundError, bdb.BdbQuit],
        )
        sentry_sdk.set_user({"id": CONFIG["uuid"]})  # SHA-256 anonymized UUID hash

        # Disable all Sentry logging
        for logger in "sentry_sdk", "sentry_sdk.errors":
            logging.getLogger(logger).setLevel(logging.CRITICAL)


set_sentry()
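Both the PostHog events and the Sentry hook key off the same `diagnostics` flag, so a single setting opts out of everything. A sketch, equivalent to `lancedb diagnostics --disabled` on the CLI:

```python
from lancedb.utils import CONFIG

CONFIG.update({"diagnostics": False})  # Config.update() persists to the YAML file
```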

@@ -1,7 +1,8 @@
 [project]
 name = "lancedb"
-version = "0.3.0"
+version = "0.3.1"
 dependencies = [
+    "deprecation",
     "pylance==0.8.3",
     "ratelimiter~=1.0",
     "retry>=0.9.2",
@@ -10,7 +11,10 @@ dependencies = [
     "pydantic>=1.10",
     "attrs>=21.3.0",
     "semver>=3.0",
-    "cachetools"
+    "cachetools",
+    "pyyaml>=6.0",
+    "click>=8.1.7",
+    "requests>=2.31.0"
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
@@ -48,7 +52,10 @@ tests = ["pandas>=1.4", "pytest", "pytest-mock", "pytest-asyncio", "requests"]
 dev = ["ruff", "pre-commit", "black"]
 docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
 clip = ["torch", "pillow", "open-clip"]
-embeddings = ["openai", "sentence-transformers", "torch", "pillow", "open-clip"]
+embeddings = ["openai", "sentence-transformers", "torch", "pillow", "open-clip", "cohere"]
+
+[project.scripts]
+lancedb = "lancedb.cli.cli:cli"

 [build-system]
 requires = ["setuptools", "wheel"]

python/tests/test_cli.py (new file)
@@ -0,0 +1,35 @@
from click.testing import CliRunner

from lancedb.cli.cli import cli
from lancedb.utils import CONFIG


def test_entry():
    runner = CliRunner()
    result = runner.invoke(cli)
    assert result.exit_code == 0  # main check
    assert "lancedb" in result.output.lower()  # lazy check


def test_diagnostics():
    runner = CliRunner()
    result = runner.invoke(cli, ["diagnostics", "--disabled"])
    assert result.exit_code == 0  # main check
    assert CONFIG["diagnostics"] is False

    result = runner.invoke(cli, ["diagnostics", "--enabled"])
    assert result.exit_code == 0  # main check
    assert CONFIG["diagnostics"] is True


def test_config():
    runner = CliRunner()
    result = runner.invoke(cli, ["config"])
    assert result.exit_code == 0  # main check
    cfg = CONFIG.copy()
    cfg.pop("uuid")
    for item in cfg:  # check keys only, as output formatting is subject to change
        assert item in result.output

@@ -47,7 +47,7 @@ def test_contextualizer(raw_df: pd.DataFrame):
         .stride(3)
         .text_col("token")
         .groupby("document_id")
-        .to_df()["token"]
+        .to_pandas()["token"]
         .to_list()
     )

@@ -67,7 +67,7 @@ def test_contextualizer_with_threshold(raw_df: pd.DataFrame):
         .text_col("token")
         .groupby("document_id")
         .min_window_size(4)
-        .to_df()["token"]
+        .to_pandas()["token"]
         .to_list()
     )

@@ -33,11 +33,11 @@ def test_basic(tmp_path):
             {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
         ],
     )
-    rs = table.search([100, 100]).limit(1).to_df()
+    rs = table.search([100, 100]).limit(1).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "bar"

-    rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
+    rs = table.search([100, 100]).where("price < 15").limit(2).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "foo"

@@ -62,11 +62,11 @@ def test_ingest_pd(tmp_path):
         }
     )
     table = db.create_table("test", data=data)
-    rs = table.search([100, 100]).limit(1).to_df()
+    rs = table.search([100, 100]).limit(1).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "bar"

-    rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
+    rs = table.search([100, 100]).where("price < 15").limit(2).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "foo"

@@ -137,8 +137,8 @@ def test_ingest_iterator(tmp_path):
     db = lancedb.connect(tmp_path)
     tbl = db.create_table("table2", make_batches(), schema=schema, mode="overwrite")
     tbl.to_pandas()
-    assert tbl.search([3.1, 4.1]).limit(1).to_df()["_distance"][0] == 0.0
-    assert tbl.search([5.9, 26.5]).limit(1).to_df()["_distance"][0] == 0.0
+    assert tbl.search([3.1, 4.1]).limit(1).to_pandas()["_distance"][0] == 0.0
+    assert tbl.search([5.9, 26.5]).limit(1).to_pandas()["_distance"][0] == 0.0
     tbl_len = len(tbl)
     tbl.add(make_batches())
     assert tbl_len == 50

@@ -23,5 +23,5 @@ from lancedb import LanceDBConnection
 def test_against_local_server():
     conn = LanceDBConnection("lancedb+http://localhost:10024")
     table = conn.open_table("sift1m_ivf1024_pq16")
-    df = table.search(np.random.rand(128)).to_df()
+    df = table.search(np.random.rand(128)).to_pandas()
     assert len(df) == 10

@@ -11,6 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
+import os

 import numpy as np
 import pandas as pd
@@ -123,3 +124,26 @@ def test_openclip(tmp_path):
         arrow_table["vector"].combine_chunks().values.to_numpy(),
         arrow_table["vec_from_bytes"].combine_chunks().values.to_numpy(),
     )
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("COHERE_API_KEY") is None, reason="COHERE_API_KEY not set"
+)  # also skip if cohere is not installed
+def test_cohere_embedding_function():
+    cohere = (
+        EmbeddingFunctionRegistry.get_instance()
+        .get("cohere")
+        .create(name="embed-multilingual-v2.0")
+    )
+
+    class TextModel(LanceModel):
+        text: str = cohere.SourceField()
+        vector: Vector(cohere.ndims()) = cohere.VectorField()
+
+    df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+    db = lancedb.connect("~/lancedb")
+    tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+    tbl.add(df)
+    assert len(tbl.to_pandas()["vector"][0]) == cohere.ndims()

@@ -71,14 +71,14 @@ def test_search_index(tmp_path, table):


 def test_create_index_from_table(tmp_path, table):
     table.create_fts_index("text")
-    df = table.search("puppy").limit(10).select(["text"]).to_df()
+    df = table.search("puppy").limit(10).select(["text"]).to_pandas()
     assert len(df) == 10
     assert "text" in df.columns


 def test_create_index_multiple_columns(tmp_path, table):
     table.create_fts_index(["text", "text2"])
-    df = table.search("puppy").limit(10).to_df()
+    df = table.search("puppy").limit(10).to_pandas()
     assert len(df) == 10
     assert "text" in df.columns
     assert "text2" in df.columns
@@ -87,5 +87,5 @@ def test_create_index_multiple_columns(tmp_path, table):
 def test_empty_rs(tmp_path, table, mocker):
     table.create_fts_index(["text", "text2"])
     mocker.patch("lancedb.fts.search_index", return_value=([], []))
-    df = table.search("puppy").limit(10).to_df()
+    df = table.search("puppy").limit(10).to_pandas()
     assert len(df) == 0

@@ -36,11 +36,11 @@ def test_s3_io():
             {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
         ],
     )
-    rs = table.search([100, 100]).limit(1).to_df()
+    rs = table.search([100, 100]).limit(1).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "bar"

-    rs = table.search([100, 100]).where("price < 15").limit(2).to_df()
+    rs = table.search([100, 100]).where("price < 15").limit(2).to_pandas()
     assert len(rs) == 1
     assert rs["item"].iloc[0] == "foo"

@@ -85,17 +85,20 @@ def test_cast(table):


 def test_query_builder(table):
-    df = (
-        LanceVectorQueryBuilder(table, [0, 0], "vector").limit(1).select(["id"]).to_df()
+    rs = (
+        LanceVectorQueryBuilder(table, [0, 0], "vector")
+        .limit(1)
+        .select(["id"])
+        .to_list()
     )
-    assert df["id"].values[0] == 1
-    assert all(df["vector"].values[0] == [1, 2])
+    assert rs[0]["id"] == 1
+    assert all(np.array(rs[0]["vector"]) == [1, 2])


 def test_query_builder_with_filter(table):
-    df = LanceVectorQueryBuilder(table, [0, 0], "vector").where("id = 2").to_df()
-    assert df["id"].values[0] == 2
-    assert all(df["vector"].values[0] == [3, 4])
+    rs = LanceVectorQueryBuilder(table, [0, 0], "vector").where("id = 2").to_list()
+    assert rs[0]["id"] == 2
+    assert all(np.array(rs[0]["vector"]) == [3, 4])


 def test_query_builder_with_prefilter(table):
@@ -103,7 +106,7 @@ def test_query_builder_with_prefilter(table):
         LanceVectorQueryBuilder(table, [0, 0], "vector")
         .where("id = 2")
         .limit(1)
-        .to_df()
+        .to_pandas()
     )
     assert len(df) == 0

@@ -111,7 +114,7 @@ def test_query_builder_with_prefilter(table):
         LanceVectorQueryBuilder(table, [0, 0], "vector")
         .where("id = 2", prefilter=True)
         .limit(1)
-        .to_df()
+        .to_pandas()
     )
     assert df["id"].values[0] == 2
     assert all(df["vector"].values[0] == [3, 4])

@@ -120,9 +123,11 @@ def test_query_builder_with_prefilter(table):
 def test_query_builder_with_metric(table):
     query = [4, 8]
     vector_column_name = "vector"
-    df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_df()
+    df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_pandas()
     df_l2 = (
-        LanceVectorQueryBuilder(table, query, vector_column_name).metric("L2").to_df()
+        LanceVectorQueryBuilder(table, query, vector_column_name)
+        .metric("L2")
+        .to_pandas()
     )
     tm.assert_frame_equal(df_default, df_l2)

@@ -130,7 +135,7 @@ def test_query_builder_with_metric(table):
         LanceVectorQueryBuilder(table, query, vector_column_name)
         .metric("cosine")
         .limit(1)
-        .to_df()
+        .to_pandas()
     )
     assert df_cosine._distance[0] == pytest.approx(
         cosine_distance(query, df_cosine.vector[0]),
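The changes above illustrate the two result formats now exercised side by side. A minimal sketch, assuming an open `table` handle:

```python
query = table.search([0.1, 0.2]).limit(5)
df = query.to_pandas()  # pandas DataFrame, one row per hit (formerly to_df())
rows = query.to_list()  # list of dicts, one per hit, e.g. rows[0]["_distance"]
```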

@@ -86,7 +86,7 @@ async def test_e2e_with_mock_server():
             columns=["id", "vector"],
         ),
     )
-    ).to_df()
+    ).to_pandas()

     assert "vector" in df.columns
     assert "id" in df.columns

@@ -32,4 +32,4 @@ def test_remote_db():
     setattr(conn, "_client", FakeLanceDBClient())

     table = conn["test"]
-    table.search([1.0, 2.0]).to_df()
+    table.search([1.0, 2.0]).to_pandas()

@@ -12,6 +12,7 @@
 # limitations under the License.

 import functools
+from datetime import timedelta
 from pathlib import Path
 from typing import List
 from unittest.mock import PropertyMock, patch
@@ -427,8 +428,8 @@ def test_multiple_vector_columns(db):
     table.add(df)

     q = np.random.randn(10)
-    result1 = table.search(q, vector_column_name="vector1").limit(1).to_df()
-    result2 = table.search(q, vector_column_name="vector2").limit(1).to_df()
+    result1 = table.search(q, vector_column_name="vector1").limit(1).to_pandas()
+    result2 = table.search(q, vector_column_name="vector2").limit(1).to_pandas()

     assert result1["text"].iloc[0] != result2["text"].iloc[0]

@@ -439,6 +440,34 @@ def test_empty_query(db):
         "my_table",
         data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
     )
-    df = table.search().select(["id"]).where("text='bar'").limit(1).to_df()
+    df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
     val = df.id.iloc[0]
     assert val == 1
+
+
+def test_compact_cleanup(db):
+    table = LanceTable.create(
+        db,
+        "my_table",
+        data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
+    )
+
+    table.add([{"text": "baz", "id": 2}])
+    assert len(table) == 3
+    assert table.version == 3
+
+    stats = table.compact_files()
+    assert len(table) == 3
+    assert table.version == 4
+    assert stats.fragments_removed > 0
+    assert stats.fragments_added == 1
+
+    stats = table.cleanup_old_versions()
+    assert stats.bytes_removed == 0
+
+    stats = table.cleanup_old_versions(older_than=timedelta(0), delete_unverified=True)
+    assert stats.bytes_removed > 0
+    assert table.version == 4
+
+    with pytest.raises(Exception, match="Version 3 no longer exists"):
+        table.checkout(3)

python/tests/test_telemetry.py (new file)
@@ -0,0 +1,60 @@
import json

import pytest

import lancedb
from lancedb.utils.events import _Events


@pytest.fixture(autouse=True)
def request_log_path(tmp_path):
    return tmp_path / "request.json"


def mock_register_event(name: str, **kwargs):
    if _Events._instance is None:
        _Events._instance = _Events()

    _Events._instance.enabled = True
    _Events._instance.rate_limit = 0
    _Events._instance(name, **kwargs)


def test_event_reporting(monkeypatch, request_log_path, tmp_path) -> None:
    def mock_request(**kwargs):
        json_data = kwargs.get("json", {})
        with open(request_log_path, "w") as f:
            json.dump(json_data, f)

    monkeypatch.setattr(
        lancedb.table, "register_event", mock_register_event
    )  # Force-enable event registration and strip exception handling
    monkeypatch.setattr(lancedb.utils.events, "threaded_request", mock_request)

    db = lancedb.connect(tmp_path)
    db.create_table(
        "test",
        data=[
            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
        ],
        mode="overwrite",
    )

    assert request_log_path.exists()  # test that the event was registered

    with open(request_log_path, "r") as f:
        json_data = json.load(f)

    # TODO: don't hardcode these here. Instead, create a module-level JSON schema
    # in lancedb.utils.events for better evolvability.
    batch_keys = ["api_key", "distinct_id", "batch"]
    event_keys = ["event", "properties", "timestamp", "distinct_id"]
    property_keys = ["cli", "install", "platforms", "version", "session_id"]

    assert all(key in json_data for key in batch_keys)
    assert all(key in json_data["batch"][0] for key in event_keys)
    assert all(key in json_data["batch"][0]["properties"] for key in property_keys)

    # Cleanup & reset
    monkeypatch.undo()
    _Events._instance = None

@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
 arrow-array = { workspace = true }
 arrow-ipc = { workspace = true }
 arrow-schema = { workspace = true }
+chrono = { workspace = true }
 conv = "0.3.3"
 once_cell = "1"
 futures = "0.3"

@@ -78,9 +78,11 @@ fn get_index_params_builder(

     num_partitions.map(|np| {
         let max_iters = max_iters.unwrap_or(50);
-        let mut ivf_params = IvfBuildParams::default();
-        ivf_params.num_partitions = np;
-        ivf_params.max_iters = max_iters;
+        let ivf_params = IvfBuildParams {
+            num_partitions: np,
+            max_iters,
+            ..Default::default()
+        };
         index_builder.ivf_params(ivf_params)
     });

@@ -237,6 +237,8 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
     cx.export_function("tableAdd", JsTable::js_add)?;
     cx.export_function("tableCountRows", JsTable::js_count_rows)?;
     cx.export_function("tableDelete", JsTable::js_delete)?;
+    cx.export_function("tableCleanupOldVersions", JsTable::js_cleanup)?;
+    cx.export_function("tableCompactFiles", JsTable::js_compact)?;
     cx.export_function(
         "tableCreateVectorIndex",
         index::vector::table_create_vector_index,

@@ -13,6 +13,7 @@
 // limitations under the License.

 use arrow_array::RecordBatchIterator;
+use lance::dataset::optimize::CompactionOptions;
 use lance::dataset::{WriteMode, WriteParams};
 use lance::io::object_store::ObjectStoreParams;

@@ -163,4 +164,116 @@ impl JsTable {
         });
         Ok(promise)
     }
+
+    pub(crate) fn js_cleanup(mut cx: FunctionContext) -> JsResult<JsPromise> {
+        let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
+        let rt = runtime(&mut cx)?;
+        let (deferred, promise) = cx.promise();
+        let table = js_table.table.clone();
+        let channel = cx.channel();
+
+        let older_than: i64 = cx
+            .argument_opt(0)
+            .and_then(|val| val.downcast::<JsNumber, _>(&mut cx).ok())
+            .map(|val| val.value(&mut cx) as i64)
+            .unwrap_or_else(|| 2 * 7 * 24 * 60); // 2 weeks, in minutes
+        let older_than = chrono::Duration::minutes(older_than);
+        let delete_unverified: bool = cx
+            .argument_opt(1)
+            .and_then(|val| val.downcast::<JsBoolean, _>(&mut cx).ok())
+            .map(|val| val.value(&mut cx))
+            .unwrap_or_default();
+
+        rt.spawn(async move {
+            let stats = table
+                .cleanup_old_versions(older_than, Some(delete_unverified))
+                .await;
+
+            deferred.settle_with(&channel, move |mut cx| {
+                let stats = stats.or_throw(&mut cx)?;
+
+                let output_metrics = JsObject::new(&mut cx);
+                let bytes_removed = cx.number(stats.bytes_removed as f64);
+                output_metrics.set(&mut cx, "bytesRemoved", bytes_removed)?;
+
+                let old_versions = cx.number(stats.old_versions as f64);
+                output_metrics.set(&mut cx, "oldVersions", old_versions)?;
+
+                let output_table = cx.boxed(JsTable::from(table));
+
+                let output = JsObject::new(&mut cx);
+                output.set(&mut cx, "metrics", output_metrics)?;
+                output.set(&mut cx, "newTable", output_table)?;
+
+                Ok(output)
+            })
+        });
+        Ok(promise)
+    }
+
+    pub(crate) fn js_compact(mut cx: FunctionContext) -> JsResult<JsPromise> {
+        let js_table = cx.this().downcast_or_throw::<JsBox<JsTable>, _>(&mut cx)?;
+        let rt = runtime(&mut cx)?;
+        let (deferred, promise) = cx.promise();
+        let mut table = js_table.table.clone();
+        let channel = cx.channel();
+
+        let js_options = cx.argument::<JsObject>(0)?;
+        let mut options = CompactionOptions::default();
+
+        if let Some(target_rows) =
+            js_options.get_opt::<JsNumber, _, _>(&mut cx, "targetRowsPerFragment")?
+        {
+            options.target_rows_per_fragment = target_rows.value(&mut cx) as usize;
+        }
+        if let Some(max_per_group) =
+            js_options.get_opt::<JsNumber, _, _>(&mut cx, "maxRowsPerGroup")?
+        {
+            options.max_rows_per_group = max_per_group.value(&mut cx) as usize;
+        }
+        if let Some(materialize_deletions) =
+            js_options.get_opt::<JsBoolean, _, _>(&mut cx, "materializeDeletions")?
+        {
+            options.materialize_deletions = materialize_deletions.value(&mut cx);
+        }
+        if let Some(materialize_deletions_threshold) =
+            js_options.get_opt::<JsNumber, _, _>(&mut cx, "materializeDeletionsThreshold")?
+        {
+            options.materialize_deletions_threshold =
+                materialize_deletions_threshold.value(&mut cx) as f32;
+        }
+        if let Some(num_threads) = js_options.get_opt::<JsNumber, _, _>(&mut cx, "numThreads")? {
+            options.num_threads = num_threads.value(&mut cx) as usize;
+        }
+
+        rt.spawn(async move {
+            let stats = table.compact_files(options).await;
+
+            deferred.settle_with(&channel, move |mut cx| {
+                let stats = stats.or_throw(&mut cx)?;
+
+                let output_metrics = JsObject::new(&mut cx);
+                let fragments_removed = cx.number(stats.fragments_removed as f64);
+                output_metrics.set(&mut cx, "fragmentsRemoved", fragments_removed)?;
+
+                let fragments_added = cx.number(stats.fragments_added as f64);
+                output_metrics.set(&mut cx, "fragmentsAdded", fragments_added)?;
+
+                let files_removed = cx.number(stats.files_removed as f64);
+                output_metrics.set(&mut cx, "filesRemoved", files_removed)?;
+
+                let files_added = cx.number(stats.files_added as f64);
+                output_metrics.set(&mut cx, "filesAdded", files_added)?;
+
+                let output_table = cx.boxed(JsTable::from(table));
+
+                let output = JsObject::new(&mut cx);
+                output.set(&mut cx, "metrics", output_metrics)?;
+                output.set(&mut cx, "newTable", output_table)?;
+
+                Ok(output)
+            })
+        });
+        Ok(promise)
+    }
 }

@@ -16,6 +16,7 @@ arrow-data = { workspace = true }
 arrow-schema = { workspace = true }
 arrow-ord = { workspace = true }
 arrow-cast = { workspace = true }
+chrono = { workspace = true }
 object_store = { workspace = true }
 snafu = { workspace = true }
 half = { workspace = true }

@@ -14,7 +14,6 @@

 //! A mirroring object store that mirrors writes to a secondary object store
-
 use std::{
     fmt::Formatter,
     pin::Pin,

@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use chrono::Duration;
 use std::sync::Arc;

 use arrow_array::{Float32Array, RecordBatchReader};
 use arrow_schema::SchemaRef;
+use lance::dataset::cleanup::RemovalStats;
+use lance::dataset::optimize::{compact_files, CompactionMetrics, CompactionOptions};
 use lance::dataset::{Dataset, WriteParams};
 use lance::index::IndexType;
 use lance::io::object_store::WrappingObjectStore;
@@ -305,6 +308,41 @@ impl Table {
         self.dataset = Arc::new(dataset);
         Ok(())
     }
+
+    /// Remove old versions of the dataset from disk.
+    ///
+    /// # Arguments
+    /// * `older_than` - The duration of time to keep versions of the dataset.
+    /// * `delete_unverified` - Because they may be part of an in-progress
+    ///   transaction, files newer than 7 days old are not deleted by default.
+    ///   If you are sure that there are no in-progress transactions, you can
+    ///   set this to `true` to delete all files older than `older_than`.
+    ///
+    /// This calls into [lance::dataset::Dataset::cleanup_old_versions] and
+    /// returns the result.
+    pub async fn cleanup_old_versions(
+        &self,
+        older_than: Duration,
+        delete_unverified: Option<bool>,
+    ) -> Result<RemovalStats> {
+        Ok(self
+            .dataset
+            .cleanup_old_versions(older_than, delete_unverified)
+            .await?)
+    }
+
+    /// Compact files in the dataset.
+    ///
+    /// This can be run after making several small appends to optimize the table
+    /// for faster reads.
+    ///
+    /// This calls into [lance::dataset::optimize::compact_files].
+    pub async fn compact_files(&mut self, options: CompactionOptions) -> Result<CompactionMetrics> {
+        let mut dataset = self.dataset.as_ref().clone();
+        let metrics = compact_files(&mut dataset, options).await?;
+        self.dataset = Arc::new(dataset);
+        Ok(metrics)
+    }
 }

 #[cfg(test)]