arrow table/f16 example (#907)

This commit is contained in:
QianZhu
2024-01-31 14:41:28 -08:00
committed by Weston Pace
parent 567c5f6d01
commit 1f2eafca75
3 changed files with 159 additions and 109 deletions

View File

@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 2,
"id": "c1b4e34b-a49c-471d-a343-a5940bb5138a",
"metadata": {},
"outputs": [],
@@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "4e5a8d07-d9a1-48c1-913a-8e0629289579",
"metadata": {},
"outputs": [],
@@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "5df12f66-8d99-43ad-8d0b-22189ec0a6b9",
"metadata": {},
"outputs": [
@@ -62,7 +62,7 @@
"long: [[-122.7,-74.1]]"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -90,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "f4d87ae9-0ccb-48eb-b31d-bb8f2370e47e",
"metadata": {},
"outputs": [
@@ -108,7 +108,7 @@
"long: [[-122.7,-74.1]]"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -135,10 +135,17 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"id": "25f34bcf-fca0-4431-8601-eac95d1bd347",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2024-01-31T18:59:33Z WARN lance::dataset] No existing dataset at /Users/qian/Work/LanceDB/lancedb/docs/src/notebooks/.lancedb/table3.lance, it will be created\n"
]
},
{
"data": {
"text/plain": [
@@ -148,7 +155,7 @@
"long: float"
]
},
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -171,45 +178,51 @@
"id": "4df51925-7ca2-4005-9c72-38b3d26240c6",
"metadata": {},
"source": [
"### From PyArrow Tables\n",
"### From an Arrow Table\n",
"\n",
"You can also create LanceDB tables directly from pyarrow tables"
"You can also create LanceDB tables directly from pyarrow tables. LanceDB supports float16 type."
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"id": "90a880f6-be43-4c9d-ba65-0b05197c0f6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"vector: fixed_size_list<item: float>[2]\n",
" child 0, item: float\n",
"item: string\n",
"price: double"
"vector: fixed_size_list<item: halffloat>[16]\n",
" child 0, item: halffloat\n",
"text: string"
]
},
"execution_count": 12,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table = pa.Table.from_arrays(\n",
" [\n",
" pa.array([[3.1, 4.1], [5.9, 26.5]],\n",
" pa.list_(pa.float32(), 2)),\n",
" pa.array([\"foo\", \"bar\"]),\n",
" pa.array([10.0, 20.0]),\n",
" ],\n",
" [\"vector\", \"item\", \"price\"],\n",
" )\n",
"import numpy as np\n",
"\n",
"db = lancedb.connect(\"db\")\n",
"dim = 16\n",
"total = 2\n",
"schema = pa.schema(\n",
" [\n",
" pa.field(\"vector\", pa.list_(pa.float16(), dim)),\n",
" pa.field(\"text\", pa.string())\n",
" ]\n",
")\n",
"data = pa.Table.from_arrays(\n",
" [\n",
" pa.array([np.random.randn(dim).astype(np.float16) for _ in range(total)],\n",
" pa.list_(pa.float16(), dim)),\n",
" pa.array([\"foo\", \"bar\"])\n",
" ],\n",
" [\"vector\", \"text\"],\n",
")\n",
"\n",
"tbl = db.create_table(\"test1\", table, mode=\"overwrite\")\n",
"tbl = db.create_table(\"f16_tbl\", data, schema=schema)\n",
"tbl.schema"
]
},
@@ -225,7 +238,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 8,
"id": "d81121d7-e4b7-447c-a48c-974b6ebb464a",
"metadata": {},
"outputs": [
@@ -240,7 +253,7 @@
"imdb_id: int64 not null"
]
},
"execution_count": 13,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -282,7 +295,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"id": "bc247142-4e3c-41a2-b94c-8e00d2c2a508",
"metadata": {},
"outputs": [
@@ -292,7 +305,7 @@
"LanceTable(table4)"
]
},
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -333,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"id": "25ad3523-e0c9-4c28-b3df-38189c4e0e5f",
"metadata": {},
"outputs": [
@@ -346,7 +359,7 @@
"price: double not null"
]
},
"execution_count": 16,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -385,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 11,
"id": "2814173a-eacc-4dd8-a64d-6312b44582cc",
"metadata": {},
"outputs": [],
@@ -411,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 12,
"id": "df9e13c0-41f6-437f-9dfa-2fd71d3d9c45",
"metadata": {},
"outputs": [
@@ -421,7 +434,7 @@
"['table6', 'table4', 'table5', 'movielens_small']"
]
},
"execution_count": 18,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -432,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 13,
"id": "9343f5ad-6024-42ee-ac2f-6c1471df8679",
"metadata": {},
"outputs": [
@@ -541,7 +554,7 @@
"9 [5.9, 26.5] bar 20.0"
]
},
"execution_count": 20,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -564,7 +577,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 14,
"id": "8a56250f-73a1-4c26-a6ad-5c7a0ce3a9ab",
"metadata": {},
"outputs": [],
@@ -590,7 +603,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 15,
"id": "030c7057-b98e-4e2f-be14-b8c1f927f83c",
"metadata": {},
"outputs": [],
@@ -621,7 +634,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 16,
"id": "e7a17de2-08d2-41b7-bd05-f63d1045ab1f",
"metadata": {},
"outputs": [
@@ -629,16 +642,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"32\n"
"22\n"
]
},
{
"data": {
"text/plain": [
"17"
"12"
]
},
"execution_count": 24,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -661,7 +674,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 17,
"id": "fe3310bd-08f4-4a22-a63b-b3127d22f9f7",
"metadata": {},
"outputs": [
@@ -681,25 +694,20 @@
"8 [3.1, 4.1] foo 10.0\n",
"9 [3.1, 4.1] foo 10.0\n",
"10 [3.1, 4.1] foo 10.0\n",
"11 [3.1, 4.1] foo 10.0\n",
"12 [3.1, 4.1] foo 10.0\n",
"13 [3.1, 4.1] foo 10.0\n",
"14 [3.1, 4.1] foo 10.0\n",
"15 [3.1, 4.1] foo 10.0\n",
"16 [3.1, 4.1] foo 10.0\n"
"11 [3.1, 4.1] foo 10.0\n"
]
},
{
"ename": "OSError",
"evalue": "LanceError(IO): Error during planning: column foo does not exist",
"evalue": "LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[30], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m tbl\u001b[38;5;241m.\u001b[39mto_pandas()\n",
"File \u001b[0;32m~/Documents/lancedb/lancedb/python/lancedb/table.py:610\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 609\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 610\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Documents/lancedb/lancedb/env/lib/python3.11/site-packages/lance/dataset.py:489\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, predicate)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 488\u001b[0m predicate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 489\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist"
"Cell \u001b[0;32mIn[17], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m to_remove \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(v) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m to_remove)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(tbl\u001b[38;5;241m.\u001b[39mto_pandas())\n\u001b[0;32m----> 4\u001b[0m \u001b[43mtbl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mitem IN (\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mto_remove\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lancedb/table.py:872\u001b[0m, in \u001b[0;36mLanceTable.delete\u001b[0;34m(self, where)\u001b[0m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdelete\u001b[39m(\u001b[38;5;28mself\u001b[39m, where: \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 872\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwhere\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Work/LanceDB/lancedb/docs/doc-venv/lib/python3.11/site-packages/lance/dataset.py:596\u001b[0m, in \u001b[0;36mLanceDataset.delete\u001b[0;34m(self, predicate)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(predicate, pa\u001b[38;5;241m.\u001b[39mcompute\u001b[38;5;241m.\u001b[39mExpression):\n\u001b[1;32m 595\u001b[0m predicate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(predicate)\n\u001b[0;32m--> 596\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mOSError\u001b[0m: LanceError(IO): Error during planning: column foo does not exist, /Users/runner/work/lance/lance/rust/lance-core/src/error.rs:212:23"
]
}
],
@@ -712,7 +720,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": null,
"id": "87d5bc21-847f-4c81-b56e-f6dbe5d05aac",
"metadata": {},
"outputs": [],
@@ -729,7 +737,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": null,
"id": "9cba4519-eb3a-4941-ab7e-873d762e750f",
"metadata": {},
"outputs": [],
@@ -742,7 +750,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": null,
"id": "5bdc9801-d5ed-4871-92d0-88b27108e788",
"metadata": {},
"outputs": [
@@ -817,7 +825,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.11.7"
}
},
"nbformat": 4,