feat: Prototype of the storage engine (#107)

* feat: memtable flush (#63)

* wip: memtable flush

* optimize schema conversion

* remove unnecessary import

* add parquet file verification

* add backtrace to error

* chore: upgrade opendal to 0.9 and fixed some problems

* rename error

* fix: error description

Co-authored-by: Dennis Zhuang <killme2008@gmail.com>

* feat: region manifest service (#57)

* feat: adds Manifest API

* feat: impl region manifest service

* refactor: by CR comments

* fix: storage error mod test

* fix: tweak storage cargo

* fix: tweak storage cargo

* refactor: by CR comments

* refactor: rename current_version

* feat: add wal writer (#60)

* feat: add Wal

* upgrade engine for wal

* fix: unit test for wal

* feat: wal into region

* fix: unit test

* fix clippy

* chore: by cr

* chore: by cr

* chore: prevent test data pollution

* chore: by cr

* minor fix

* chore: by cr

* feat: Implement flush (#65)

* feat: Flush framework

- feat: Add id to memtable
- refactor: Rename MemtableSet/MutableMemtables to MemtableVersion/MemtableSet
- feat: Freeze memtable
- feat: Trigger flush
- feat: Background job pool
- feat: flush job
- feat: Sst access layer
- feat: Custom Deserialize for StringBytes
- feat: Use RegionWriter to apply file metas
- feat: Apply version edit
- chore: Remove unused imports

refactor: Use ParquetWriter to replace FlushTask

refactor: FsAccessLayer takes object store as param

chore: Remove todo from doc comments

feat: Move wal to WriterContext

chore: Fix clippy

chore: Add backtrace to WriteWal error

* feat: adds manifest to region and refactor sst/manifest dir config (#72)

* feat: adds manifest to region and refactor sst/manifest dir with EngineConfig

* refactor: ensure path ends with '/' in ManifestLogStorage

* fix: style

* refactor: normalize storage directory path and minor changes by CR

* refactor: doesn't need slash any more

* feat: Implement apply_edit() and add timestamp index to schema (#73)

* feat: Implement VersionControl::apply_edit()

* feat: Add timestamp index to schema

* feat: Implement Schema::timestamp_column()

* feat: persist region metadata to manifest (#74)

* feat: persist metadata when creating region or sst files

* fix: revert FileMeta comment

* feat: resolve todo

* fix: clippy warning

* fix: revert files_to_remove type in RegionEdit

* feat: impl SizeBasedStrategy for flush (#76)

* feat: impl SizeBasedStrategy for flush

* doc: get_mutable_limitation

* fix: code style and comment

* feat: align timestamp (#75)

* feat: align timestamps in write batch

* fix cr comments

* fix timestamp overflow

* simplify overflow check

* fix cr comments

* fix clippy issues

* test: Fix region tests (comment out some unsupported tests) (#82)

* feat: flush job (#80)

* feat: flush job

* fix cr comments

* move file name instead of clone

* comment log file test (#84)

* feat: improve MemtableVersion (#78)

* feat: improve MemtableVersion

* feat: remove flushed immutable memtables and test MemtableVersion

* refactor: by CR comments

* refactor: clone kv in iterator

* fix: clippy warning

* refactor: Make BatchIterator supertrait of Iterator (#85)

* refactor: rename Version to ManifestVersion and move out manifest from ShareData (#83)

* feat: Insert multiple memtables by time range (#77)

* feat: memtable::Inserter supports insert multiple memtables by time range

* chore: Update timestamp comment

* test: Add tests for Inserter

* test: Fix region tests (comment out some unsupported tests)

* refactor: align_timestamp() use TimestampMillis::aligned_by_bucket()

* chore: rename aligned_by_bucket to align_by_bucket

* fix: Fix compile errors

* fix: sst and manifest dir (#86)

* Set RowKeyDescriptor::enable_version_column to false by default

* feat: Implement write stall (#90)

* feat: Implement write stall

* chore: Update comments

* feat: Support reading multiple memtables (#93)

* feat: Support reading multiple memtables

* test: uncomment tests that rely on snapshot read

* feat: wal format (#70)

* feat: wal codec

* chore: minor fix

* chore: comment

* chore: by cr

* chore: write_batch_codec mod

* chore: by cr

* chore: upgrade proto

* chore: by cr

* fix failing test

* fix failing test

* feat: manifest to wal (#100)

* feat: write manifest to wal

* chore: sequence into wal

* chore: by cr

* chore: by cr

* refactor: create log store (#104)

Co-authored-by: dennis zhuang <killme2008@gmail.com>
Co-authored-by: Lei, Huang <6406592+v0y4g3r@users.noreply.github.com>
Co-authored-by: fariygirl <clickmetoday@163.com>
Co-authored-by: Jiachun Feng <jiachun_feng@proton.me>
Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>

* chore: Fix clippy

Co-authored-by: Lei, Huang <6406592+v0y4g3r@users.noreply.github.com>
Co-authored-by: Dennis Zhuang <killme2008@gmail.com>
Co-authored-by: Jiachun Feng <jiachun_feng@proton.me>
Co-authored-by: fariygirl <clickmetoday@163.com>
Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>
Author: evenyag
Date: 2022-07-25 15:26:00 +08:00
Committed by: GitHub
Parent: 2b064265bf
Commit: bf5975ca3e
95 changed files with 5675 additions and 543 deletions
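
Before the per-file diff, a minimal sketch of how the pieces introduced here fit together, mirroring the datanode instance.rs and SQL handler changes further below. This is an illustration only; the EngineConfig, EngineImpl, NoopLogStore, and MitoEngine paths are taken from this diff and may change in later revisions.

// Sketch: open the prototype storage engine and build the table engine on top of it,
// the way the datanode wires it up in this PR (NoopLogStore stands in for the real WAL).
use std::sync::Arc;

use log_store::fs::noop::NoopLogStore;
use storage::{config::EngineConfig, EngineImpl};
use table_engine::engine::MitoEngine;

async fn build_engine(store_dir: &str) -> MitoEngine<EngineImpl<NoopLogStore>> {
    let storage_engine = EngineImpl::new(
        EngineConfig::with_store_dir(store_dir),
        Arc::new(NoopLogStore::default()),
    )
    .await
    .expect("failed to open storage engine");
    MitoEngine::new(storage_engine)
}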

.gitignore

@@ -8,6 +8,9 @@
# These are backup files generated by rustfmt # These are backup files generated by rustfmt
**/*.rs.bk **/*.rs.bk
# Mac DS_Store
**/*.DS_Store
debug/ debug/
# MSVC Windows builds of rustc generate these, which store debugging information # MSVC Windows builds of rustc generate these, which store debugging information

Cargo.lock

@@ -142,6 +142,17 @@ dependencies = [
"strength_reduce", "strength_reduce",
] ]
[[package]]
name = "async-channel"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2114d64672151c0c5eaa5e131ec84a74f06e1e559830dabba01ca30605d66319"
dependencies = [
"concurrent-queue",
"event-listener",
"futures-core",
]
[[package]] [[package]]
name = "async-compat" name = "async-compat"
version = "0.2.1" version = "0.2.1"
@@ -308,6 +319,18 @@ dependencies = [
"tower-service", "tower-service",
] ]
[[package]]
name = "backon"
version = "0.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f334d8b7d003e7d4e17844b81ffbfcd24ad955777997440701c08a834e407105"
dependencies = [
"futures",
"pin-project",
"rand 0.8.5",
"tokio",
]
[[package]] [[package]]
name = "backtrace" name = "backtrace"
version = "0.3.65" version = "0.3.65"
@@ -329,6 +352,12 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
[[package]]
name = "bit-vec"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "1.3.2"
@@ -446,6 +475,15 @@ name = "bytes"
version = "1.1.0" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8"
dependencies = [
"serde",
]
[[package]]
name = "cache-padded"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c"
[[package]] [[package]]
name = "cast" name = "cast"
@@ -456,6 +494,12 @@ dependencies = [
"rustc_version", "rustc_version",
] ]
[[package]]
name = "castaway"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2698f953def977c68f935bb0dfa959375ad4638570e969e2f1e9f433cbf1af6"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.73" version = "1.0.73"
@@ -671,7 +715,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"common-error", "common-error",
"common-telemetry", "common-telemetry",
"metrics", "metrics 0.18.1",
"once_cell", "once_cell",
"paste", "paste",
"snafu", "snafu",
@@ -686,7 +730,7 @@ dependencies = [
"backtrace", "backtrace",
"common-error", "common-error",
"console-subscriber", "console-subscriber",
"metrics", "metrics 0.18.1",
"metrics-exporter-prometheus", "metrics-exporter-prometheus",
"once_cell", "once_cell",
"opentelemetry", "opentelemetry",
@@ -705,6 +749,15 @@ dependencies = [
name = "common-time" name = "common-time"
version = "0.1.0" version = "0.1.0"
[[package]]
name = "concurrent-queue"
version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30ed07550be01594c6026cff2a1d7fe9c8f683caa798e12b68694ac9e88286a3"
dependencies = [
"cache-padded",
]
[[package]] [[package]]
name = "console-api" name = "console-api"
version = "0.2.0" version = "0.2.0"
@@ -949,6 +1002,37 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "curl"
version = "0.4.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d855aeef205b43f65a5001e0997d81f8efca7badad4fad7d897aa7f0d0651f"
dependencies = [
"curl-sys",
"libc",
"openssl-probe",
"openssl-sys",
"schannel",
"socket2",
"winapi",
]
[[package]]
name = "curl-sys"
version = "0.4.55+curl-7.83.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23734ec77368ec583c2e61dd3f0b0e5c98b93abe6d2a004ca06b91dd7e3e2762"
dependencies = [
"cc",
"libc",
"libnghttp2-sys",
"libz-sys",
"openssl-sys",
"pkg-config",
"vcpkg",
"winapi",
]
[[package]] [[package]]
name = "datafusion" name = "datafusion"
version = "7.0.0" version = "7.0.0"
@@ -1041,7 +1125,8 @@ dependencies = [
"common-telemetry", "common-telemetry",
"datatypes", "datatypes",
"hyper", "hyper",
"metrics", "log-store",
"metrics 0.18.1",
"query", "query",
"serde", "serde",
"serde_json", "serde_json",
@@ -1051,6 +1136,7 @@ dependencies = [
"store-api", "store-api",
"table", "table",
"table-engine", "table-engine",
"tempdir",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tonic", "tonic",
@@ -1146,6 +1232,12 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "event-listener"
version = "2.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77f3309417938f28bf8228fcff79a4a37103981e3e186d2ccd19c74b38f4eb71"
[[package]] [[package]]
name = "fallible-streaming-iterator" name = "fallible-streaming-iterator"
version = "0.1.9" version = "0.1.9"
@@ -1264,6 +1356,21 @@ version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b"
[[package]]
name = "futures-lite"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48"
dependencies = [
"fastrand",
"futures-core",
"futures-io",
"memchr",
"parking",
"pin-project-lite",
"waker-fn",
]
[[package]] [[package]]
name = "futures-macro" name = "futures-macro"
version = "0.3.21" version = "0.3.21"
@@ -1594,6 +1701,33 @@ dependencies = [
"nom", "nom",
] ]
[[package]]
name = "isahc"
version = "1.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "334e04b4d781f436dc315cb1e7515bd96826426345d498149e4bde36b67f8ee9"
dependencies = [
"async-channel",
"castaway",
"crossbeam-utils",
"curl",
"curl-sys",
"encoding_rs",
"event-listener",
"futures-lite",
"http",
"log",
"mime",
"once_cell",
"polling",
"slab",
"sluice",
"tracing",
"tracing-futures",
"url",
"waker-fn",
]
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.10.3" version = "0.10.3"
@@ -1723,6 +1857,28 @@ version = "0.2.125"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b" checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b"
[[package]]
name = "libnghttp2-sys"
version = "0.1.7+1.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57ed28aba195b38d5ff02b9170cbff627e336a20925e43b4945390401c5dc93f"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "libz-sys"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.7" version = "0.4.7"
@@ -1870,6 +2026,16 @@ dependencies = [
"metrics-macros", "metrics-macros",
] ]
[[package]]
name = "metrics"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "142c53885123b68d94108295a09d4afe1a1388ed95b54d5dacd9a454753030f2"
dependencies = [
"ahash",
"metrics-macros",
]
[[package]] [[package]]
name = "metrics-exporter-prometheus" name = "metrics-exporter-prometheus"
version = "0.9.0" version = "0.9.0"
@@ -1877,7 +2043,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b93b470b04c005178058e18ac8bb2eb3fda562cf87af5ea05ba8d44190d458c" checksum = "8b93b470b04c005178058e18ac8bb2eb3fda562cf87af5ea05ba8d44190d458c"
dependencies = [ dependencies = [
"indexmap", "indexmap",
"metrics", "metrics 0.18.1",
"metrics-util", "metrics-util",
"parking_lot 0.11.2", "parking_lot 0.11.2",
"quanta", "quanta",
@@ -1905,7 +2071,7 @@ dependencies = [
"crossbeam-epoch", "crossbeam-epoch",
"crossbeam-utils", "crossbeam-utils",
"hashbrown 0.11.2", "hashbrown 0.11.2",
"metrics", "metrics 0.18.1",
"num_cpus", "num_cpus",
"parking_lot 0.11.2", "parking_lot 0.11.2",
"quanta", "quanta",
@@ -2178,9 +2344,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]] [[package]]
name = "opendal" name = "opendal"
version = "0.6.2" version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3649ace5a99d388ac9d02459135ad0425941e8cf6c33f418c4ded80483563ce3" checksum = "e9e982034fd0b4f142efba461604f5ccb1fb1f962c4e84c8e44ce369f0e3d1f2"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-compat", "async-compat",
@@ -2193,15 +2359,14 @@ dependencies = [
"hyper-tls", "hyper-tls",
"log", "log",
"md5", "md5",
"metrics", "metrics 0.19.0",
"minitrace", "minitrace",
"once_cell", "once_cell",
"parking_lot 0.12.0", "parking_lot 0.12.0",
"percent-encoding",
"pin-project", "pin-project",
"quick-xml", "quick-xml",
"reqsign", "reqsign",
"reqwest",
"roxmltree",
"serde", "serde",
"thiserror", "thiserror",
"time 0.3.9", "time 0.3.9",
@@ -2323,6 +2488,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bcbab4bfea7a59c2c0fe47211a1ac4e3e96bea6eb446d704f310bc5c732ae2" checksum = "96bcbab4bfea7a59c2c0fe47211a1ac4e3e96bea6eb446d704f310bc5c732ae2"
dependencies = [ dependencies = [
"num-traits", "num-traits",
"serde",
] ]
[[package]] [[package]]
@@ -2341,6 +2507,12 @@ version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
[[package]]
name = "parking"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72"
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.11.2" version = "0.11.2"
@@ -2577,6 +2749,19 @@ dependencies = [
"plotters-backend", "plotters-backend",
] ]
[[package]]
name = "polling"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "685404d509889fade3e86fe3a5803bca2ec09b0c0778d5ada6ec8bf7a8de5259"
dependencies = [
"cfg-if",
"libc",
"log",
"wepoll-ffi",
"winapi",
]
[[package]] [[package]]
name = "ppv-lite86" name = "ppv-lite86"
version = "0.2.16" version = "0.2.16"
@@ -2585,9 +2770,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
[[package]] [[package]]
name = "prettyplease" name = "prettyplease"
version = "0.1.14" version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3662417e650bd6af740f5b8b3501776aa10c3d5cbd10b40263ed250d3770884" checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"syn", "syn",
@@ -2714,7 +2899,7 @@ dependencies = [
"datatypes", "datatypes",
"futures", "futures",
"futures-util", "futures-util",
"metrics", "metrics 0.18.1",
"num", "num",
"num-traits", "num-traits",
"rand 0.8.5", "rand 0.8.5",
@@ -2727,9 +2912,9 @@ dependencies = [
[[package]] [[package]]
name = "quick-xml" name = "quick-xml"
version = "0.22.0" version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" checksum = "9279fbdacaad3baf559d8cabe0acc3d06e30ea14931af31af79578ac0946decc"
dependencies = [ dependencies = [
"memchr", "memchr",
"serde", "serde",
@@ -2910,12 +3095,12 @@ dependencies = [
[[package]] [[package]]
name = "reqsign" name = "reqsign"
version = "0.0.3" version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8931679eac96ffc8eee4e5507c4b91fbc0799f29a6535707ee3ef89c0d0de426" checksum = "9a6b48d7d1f390bcb0149b4d7a3022f5a927fca173c19413ba17e74936716e39"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "backon",
"base64", "base64",
"bytes", "bytes",
"dirs", "dirs",
@@ -2923,18 +3108,17 @@ dependencies = [
"hex", "hex",
"hmac", "hmac",
"http", "http",
"isahc",
"jsonwebtoken", "jsonwebtoken",
"log", "log",
"once_cell", "once_cell",
"percent-encoding", "percent-encoding",
"reqwest", "quick-xml",
"roxmltree",
"rust-ini", "rust-ini",
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
"time 0.3.9", "time 0.3.9",
"tokio",
] ]
[[package]] [[package]]
@@ -2996,15 +3180,6 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "roxmltree"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "921904a62e410e37e215c40381b7117f830d9d89ba60ab5236170541dd25646b"
dependencies = [
"xmlparser",
]
[[package]] [[package]]
name = "rust-ini" name = "rust-ini"
version = "0.18.0" version = "0.18.0"
@@ -3216,10 +3391,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32"
[[package]] [[package]]
name = "smallvec" name = "sluice"
version = "1.8.0" version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" checksum = "6d7400c0eff44aa2fcb5e31a5f24ba9716ed90138769e4977a2ba6014ae63eb5"
dependencies = [
"async-channel",
"futures-core",
"futures-io",
]
[[package]]
name = "smallvec"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1"
[[package]] [[package]]
name = "snafu" name = "snafu"
@@ -3295,16 +3481,35 @@ name = "storage"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"arrow-format",
"async-trait", "async-trait",
"atomic_float", "atomic_float",
"bit-vec",
"bytes",
"common-error", "common-error",
"common-runtime",
"common-telemetry", "common-telemetry",
"common-time",
"criterion", "criterion",
"datatypes", "datatypes",
"futures",
"futures-util",
"lazy_static",
"log-store",
"object-store",
"planus",
"prost",
"rand 0.8.5", "rand 0.8.5",
"regex",
"serde",
"serde_json",
"snafu", "snafu",
"store-api", "store-api",
"tempdir",
"tokio", "tokio",
"tonic",
"tonic-build",
"uuid",
] ]
[[package]] [[package]]
@@ -3316,8 +3521,11 @@ dependencies = [
"bytes", "bytes",
"common-base", "common-base",
"common-error", "common-error",
"common-time",
"datatypes", "datatypes",
"futures", "futures",
"object-store",
"serde",
"snafu", "snafu",
"tokio", "tokio",
] ]
@@ -3422,10 +3630,12 @@ dependencies = [
"datafusion-common", "datafusion-common",
"datatypes", "datatypes",
"futures", "futures",
"log-store",
"snafu", "snafu",
"storage", "storage",
"store-api", "store-api",
"table", "table",
"tempdir",
"tokio", "tokio",
] ]
@@ -4004,9 +4214,9 @@ dependencies = [
[[package]] [[package]]
name = "uuid" name = "uuid"
version = "1.0.0" version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cfcd319456c4d6ea10087ed423473267e1a071f3bc0aa89f80d60997843c6f0" checksum = "dd6469f4314d5f1ffec476e05f17cc9a78bc7a27a6a857842170bdf8d6f98d2f"
dependencies = [ dependencies = [
"getrandom", "getrandom",
] ]
@@ -4029,6 +4239,12 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "waker-fn"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
[[package]] [[package]]
name = "walkdir" name = "walkdir"
version = "2.3.2" version = "2.3.2"
@@ -4144,6 +4360,15 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "wepoll-ffi"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "which" name = "which"
version = "4.2.5" version = "4.2.5"
@@ -4238,12 +4463,6 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "xmlparser"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "114ba2b24d2167ef6d67d7d04c8cc86522b87f490025f39f0303b7db5bf5e3d8"
[[package]] [[package]]
name = "zstd" name = "zstd"
version = "0.10.0+zstd.1.5.2" version = "0.10.0+zstd.1.5.2"


@@ -5,11 +5,11 @@ members = [
"src/common/base", "src/common/base",
"src/common/error", "src/common/error",
"src/common/function", "src/common/function",
"src/common/query",
"src/common/recordbatch",
"src/common/runtime", "src/common/runtime",
"src/common/telemetry", "src/common/telemetry",
"src/common/time", "src/common/time",
"src/common/query",
"src/common/recordbatch",
"src/cmd", "src/cmd",
"src/datanode", "src/datanode",
"src/datatypes", "src/datatypes",


@@ -1,5 +1,5 @@
use clap::Parser; use clap::Parser;
use datanode::{Datanode, DatanodeOptions}; use datanode::datanode::{Datanode, DatanodeOptions};
use snafu::ResultExt; use snafu::ResultExt;
use crate::error::{Result, StartDatanodeSnafu}; use crate::error::{Result, StartDatanodeSnafu};
@@ -40,6 +40,7 @@ struct StartCommand {
impl StartCommand { impl StartCommand {
async fn run(self) -> Result<()> { async fn run(self) -> Result<()> {
Datanode::new(self.into()) Datanode::new(self.into())
.await
.context(StartDatanodeSnafu)? .context(StartDatanodeSnafu)?
.start() .start()
.await .await
@@ -52,6 +53,7 @@ impl From<StartCommand> for DatanodeOptions {
DatanodeOptions { DatanodeOptions {
http_addr: cmd.http_addr, http_addr: cmd.http_addr,
rpc_addr: cmd.rpc_addr, rpc_addr: cmd.rpc_addr,
..Default::default()
} }
} }
} }


@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
bytes = "1.1" bytes = { version = "1.1", features = ["serde"] }
common-error = { path = "../error" } common-error = { path = "../error" }
paste = "1.0" paste = "1.0"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }


@@ -1,9 +1,9 @@
use std::ops::Deref; use std::ops::Deref;
use serde::{Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
/// Bytes buffer. /// Bytes buffer.
#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
pub struct Bytes(bytes::Bytes); pub struct Bytes(bytes::Bytes);
impl From<bytes::Bytes> for Bytes { impl From<bytes::Bytes> for Bytes {
@@ -56,15 +56,6 @@ impl PartialEq<Bytes> for [u8] {
} }
} }
impl Serialize for Bytes {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.0.serialize(serializer)
}
}
/// String buffer that can hold arbitrary encoding string (only support UTF-8 now). /// String buffer that can hold arbitrary encoding string (only support UTF-8 now).
/// ///
/// Now this buffer is restricted to only hold valid UTF-8 string (only allow constructing `StringBytes` /// Now this buffer is restricted to only hold valid UTF-8 string (only allow constructing `StringBytes`
@@ -128,6 +119,17 @@ impl Serialize for StringBytes {
} }
} }
// Custom Deserialize to ensure UTF-8 check is always done.
impl<'de> Deserialize<'de> for StringBytes {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Ok(StringBytes::from(s))
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;


@@ -34,6 +34,11 @@ pub enum StatusCode {
TableNotFound, TableNotFound,
TableColumnNotFound, TableColumnNotFound,
// ====== End of catalog related status code ======= // ====== End of catalog related status code =======
// ====== Begin of storage related status code =====
/// Storage is temporarily unable to handle the request
StorageUnavailable,
// ====== End of storage related status code =======
} }
impl fmt::Display for StatusCode { impl fmt::Display for StatusCode {


@@ -9,4 +9,4 @@ pub use global::{
spawn_read, spawn_write, write_runtime, spawn_read, spawn_write, write_runtime,
}; };
pub use crate::runtime::{Builder, JoinHandle, Runtime}; pub use crate::runtime::{Builder, JoinError, JoinHandle, Runtime};


@@ -6,13 +6,13 @@ use metrics::{decrement_gauge, increment_gauge};
use snafu::ResultExt; use snafu::ResultExt;
use tokio::runtime::{Builder as RuntimeBuilder, Handle}; use tokio::runtime::{Builder as RuntimeBuilder, Handle};
use tokio::sync::oneshot; use tokio::sync::oneshot;
pub use tokio::task::JoinHandle; pub use tokio::task::{JoinError, JoinHandle};
use crate::error::*; use crate::error::*;
use crate::metric::*; use crate::metric::*;
/// A runtime to run future tasks /// A runtime to run future tasks
#[derive(Clone)] #[derive(Clone, Debug)]
pub struct Runtime { pub struct Runtime {
handle: Handle, handle: Handle,
// Used to receive a drop signal when dropper is dropped, inspired by databend // Used to receive a drop signal when dropper is dropped, inspired by databend
@@ -20,6 +20,7 @@ pub struct Runtime {
} }
/// Dropping the dropper will cause runtime to shutdown. /// Dropping the dropper will cause runtime to shutdown.
#[derive(Debug)]
pub struct Dropper { pub struct Dropper {
close: Option<oneshot::Sender<()>>, close: Option<oneshot::Sender<()>>,
} }


@@ -11,7 +11,7 @@ pub struct TimeRange<T> {
} }
impl<T> TimeRange<T> { impl<T> TimeRange<T> {
/// Create a new range that contains timestamp in `[start, end)`. /// Creates a new range that contains timestamp in `[start, end)`.
/// ///
/// Returns `None` if `start` > `end`. /// Returns `None` if `start` > `end`.
pub fn new<U: PartialOrd + Into<T>>(start: U, end: U) -> Option<TimeRange<T>> { pub fn new<U: PartialOrd + Into<T>>(start: U, end: U) -> Option<TimeRange<T>> {
@@ -23,6 +23,14 @@ impl<T> TimeRange<T> {
} }
} }
/// Given a value, creates an empty time range where `start == end == value`.
pub fn empty_with_value<U: Clone + Into<T>>(value: U) -> TimeRange<T> {
TimeRange {
start: value.clone().into(),
end: value.into(),
}
}
/// Returns the lower bound of the range (inclusive). /// Returns the lower bound of the range (inclusive).
#[inline] #[inline]
pub fn start(&self) -> &T { pub fn start(&self) -> &T {
@@ -71,6 +79,10 @@ mod tests {
assert_eq!(range_eq.start(), range_eq.end()); assert_eq!(range_eq.start(), range_eq.end());
assert_eq!(None, RangeMillis::new(1, 0)); assert_eq!(None, RangeMillis::new(1, 0));
let range = RangeMillis::empty_with_value(1024);
assert_eq!(range.start(), range.end());
assert_eq!(1024, *range.start());
} }
#[test] #[test]


@@ -1,6 +1,8 @@
use std::cmp::Ordering; use std::cmp::Ordering;
/// Unix timestamp in millisecond resolution. /// Unix timestamp in millisecond resolution.
///
/// Negative timestamps are allowed; they represent instants before '1970-01-01T00:00:00'.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TimestampMillis(i64); pub struct TimestampMillis(i64);
@@ -18,6 +20,29 @@ impl TimestampMillis {
pub const fn new(ms: i64) -> TimestampMillis { pub const fn new(ms: i64) -> TimestampMillis {
TimestampMillis(ms) TimestampMillis(ms)
} }
/// Returns the timestamp aligned by `bucket_duration` in milliseconds or
/// `None` if overflow occurred.
///
/// # Panics
/// Panics if `bucket_duration <= 0`.
pub fn align_by_bucket(self, bucket_duration: i64) -> Option<TimestampMillis> {
assert!(bucket_duration > 0);
let ts = if self.0 >= 0 {
self.0
} else {
// `bucket_duration > 0` implies `bucket_duration - 1` won't overflow.
self.0.checked_sub(bucket_duration - 1)?
};
Some(TimestampMillis(ts / bucket_duration * bucket_duration))
}
/// Returns the timestamp value as i64.
pub fn as_i64(&self) -> i64 {
self.0
}
} }
impl From<i64> for TimestampMillis { impl From<i64> for TimestampMillis {
@@ -60,6 +85,7 @@ mod tests {
let timestamp = TimestampMillis::from(ts); let timestamp = TimestampMillis::from(ts);
assert_eq!(timestamp, ts); assert_eq!(timestamp, ts);
assert_eq!(ts, timestamp); assert_eq!(ts, timestamp);
assert_eq!(ts, timestamp.as_i64());
assert_ne!(TimestampMillis::new(0), timestamp); assert_ne!(TimestampMillis::new(0), timestamp);
assert!(TimestampMillis::new(-123) < TimestampMillis::new(0)); assert!(TimestampMillis::new(-123) < TimestampMillis::new(0));
@@ -70,4 +96,28 @@ mod tests {
assert_eq!(i64::MAX - 1, TimestampMillis::MAX); assert_eq!(i64::MAX - 1, TimestampMillis::MAX);
assert_eq!(i64::MIN, TimestampMillis::MIN); assert_eq!(i64::MIN, TimestampMillis::MIN);
} }
#[test]
fn test_align_by_bucket() {
let bucket = 100;
assert_eq!(0, TimestampMillis::new(0).align_by_bucket(bucket).unwrap());
assert_eq!(0, TimestampMillis::new(1).align_by_bucket(bucket).unwrap());
assert_eq!(0, TimestampMillis::new(99).align_by_bucket(bucket).unwrap());
assert_eq!(
100,
TimestampMillis::new(100).align_by_bucket(bucket).unwrap()
);
assert_eq!(
100,
TimestampMillis::new(199).align_by_bucket(bucket).unwrap()
);
assert_eq!(0, TimestampMillis::MAX.align_by_bucket(i64::MAX).unwrap());
assert_eq!(
i64::MAX,
TimestampMillis::INF.align_by_bucket(i64::MAX).unwrap()
);
assert_eq!(None, TimestampMillis::MIN.align_by_bucket(bucket));
}
} }
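
A short usage sketch of the bucket alignment added above, as used when routing writes to memtables by time range; the import path is assumed (the diff only shows the TimestampMillis type inside the common-time crate):

// Aligning timestamps down to fixed-size buckets.
use common_time::timestamp_millis::TimestampMillis; // path assumed from the crate layout

fn main() {
    let bucket = 100; // bucket duration in milliseconds
    // Non-negative timestamps are floored toward zero: 199 falls into [100, 200).
    assert_eq!(
        Some(TimestampMillis::new(100)),
        TimestampMillis::new(199).align_by_bucket(bucket)
    );
    // Negative timestamps are floored away from zero: -1 falls into [-100, 0).
    assert_eq!(
        Some(TimestampMillis::new(-100)),
        TimestampMillis::new(-1).align_by_bucket(bucket)
    );
    // Flooring i64::MIN would underflow, so alignment reports overflow as None.
    assert_eq!(None, TimestampMillis::MIN.align_by_bucket(bucket));
}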


@@ -15,6 +15,7 @@ common-recordbatch = { path = "../common/recordbatch" }
common-telemetry = { path = "../common/telemetry" } common-telemetry = { path = "../common/telemetry" }
datatypes = { path = "../datatypes"} datatypes = { path = "../datatypes"}
hyper = { version = "0.14", features = ["full"] } hyper = { version = "0.14", features = ["full"] }
log-store = { path = "../log-store" }
metrics = "0.18" metrics = "0.18"
query = { path = "../query" } query = { path = "../query" }
serde = "1.0" serde = "1.0"
@@ -34,6 +35,7 @@ tower-http = { version ="0.3", features = ["full"]}
[dev-dependencies] [dev-dependencies]
axum-test-helper = "0.1" axum-test-helper = "0.1"
common-query = { path = "../common/query" } common-query = { path = "../common/query" }
tempdir = "0.3"
[dev-dependencies.arrow] [dev-dependencies.arrow]
package = "arrow2" package = "arrow2"


@@ -8,11 +8,23 @@ use crate::error::{NewCatalogSnafu, Result};
use crate::instance::{Instance, InstanceRef}; use crate::instance::{Instance, InstanceRef};
use crate::server::Services; use crate::server::Services;
#[derive(Debug)] #[derive(Clone, Debug)]
pub struct DatanodeOptions { pub struct DatanodeOptions {
pub http_addr: String, pub http_addr: String,
pub rpc_addr: String, pub rpc_addr: String,
pub wal_dir: String,
} }
impl Default for DatanodeOptions {
fn default() -> Self {
Self {
http_addr: Default::default(),
rpc_addr: Default::default(),
wal_dir: "/tmp/wal".to_string(),
}
}
}
/// Datanode service. /// Datanode service.
pub struct Datanode { pub struct Datanode {
opts: DatanodeOptions, opts: DatanodeOptions,
@@ -22,9 +34,9 @@ pub struct Datanode {
} }
impl Datanode { impl Datanode {
pub fn new(opts: DatanodeOptions) -> Result<Datanode> { pub async fn new(opts: DatanodeOptions) -> Result<Datanode> {
let catalog_list = memory::new_memory_catalog_list().context(NewCatalogSnafu)?; let catalog_list = memory::new_memory_catalog_list().context(NewCatalogSnafu)?;
let instance = Arc::new(Instance::new(catalog_list.clone())); let instance = Arc::new(Instance::new(&opts, catalog_list.clone()).await?);
Ok(Self { Ok(Self {
opts, opts,


@@ -3,6 +3,7 @@ use std::any::Any;
use common_error::ext::BoxedError; use common_error::ext::BoxedError;
use common_error::prelude::*; use common_error::prelude::*;
use datatypes::prelude::ConcreteDataType; use datatypes::prelude::ConcreteDataType;
use storage::error::Error as StorageError;
use table::error::Error as TableError; use table::error::Error as TableError;
use table_engine::error::Error as TableEngineError; use table_engine::error::Error as TableEngineError;
@@ -92,6 +93,15 @@ pub enum Error {
#[snafu(display("Fail to start gRPC server, source: {}", source))] #[snafu(display("Fail to start gRPC server, source: {}", source))]
StartGrpc { source: tonic::transport::Error }, StartGrpc { source: tonic::transport::Error },
#[snafu(display("Failed to create directory {}, source: {}", dir, source))]
CreateDir { dir: String, source: std::io::Error },
#[snafu(display("Failed to open log store, source: {}", source))]
OpenLogStore { source: log_store::error::Error },
#[snafu(display("Failed to storage engine, source: {}", source))]
OpenStorageEngine { source: StorageError },
} }
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
@@ -112,7 +122,10 @@ impl ErrorExt for Error {
Error::StartHttp { .. } Error::StartHttp { .. }
| Error::ParseAddr { .. } | Error::ParseAddr { .. }
| Error::TcpBind { .. } | Error::TcpBind { .. }
| Error::StartGrpc { .. } => StatusCode::Internal, | Error::StartGrpc { .. }
| Error::CreateDir { .. } => StatusCode::Internal,
Error::OpenLogStore { source } => source.status_code(),
Error::OpenStorageEngine { source } => source.status_code(),
} }
} }


@@ -1,21 +1,24 @@
use std::sync::Arc; use std::{fs, path, sync::Arc};
use common_telemetry::logging::info;
use datatypes::prelude::ConcreteDataType; use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema}; use datatypes::schema::{ColumnSchema, Schema};
use log_store::fs::{config::LogConfig, log::LocalFileLogStore};
use query::catalog::{CatalogListRef, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use query::catalog::{CatalogListRef, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use query::query_engine::{Output, QueryEngineFactory, QueryEngineRef}; use query::query_engine::{Output, QueryEngineFactory, QueryEngineRef};
use snafu::ResultExt; use snafu::ResultExt;
use sql::statements::statement::Statement; use sql::statements::statement::Statement;
use storage::EngineImpl; use storage::{config::EngineConfig, EngineImpl};
use table::engine::EngineContext; use table::engine::EngineContext;
use table::engine::TableEngine; use table::engine::TableEngine;
use table::requests::CreateTableRequest; use table::requests::CreateTableRequest;
use table_engine::engine::MitoEngine; use table_engine::engine::MitoEngine;
use crate::error::{CreateTableSnafu, ExecuteSqlSnafu, Result}; use crate::datanode::DatanodeOptions;
use crate::error::{self, CreateTableSnafu, ExecuteSqlSnafu, Result};
use crate::sql::SqlHandler; use crate::sql::SqlHandler;
type DefaultEngine = MitoEngine<EngineImpl>; type DefaultEngine = MitoEngine<EngineImpl<LocalFileLogStore>>;
// An abstraction to read/write services. // An abstraction to read/write services.
pub struct Instance { pub struct Instance {
@@ -30,17 +33,22 @@ pub struct Instance {
pub type InstanceRef = Arc<Instance>; pub type InstanceRef = Arc<Instance>;
impl Instance { impl Instance {
pub fn new(catalog_list: CatalogListRef) -> Self { pub async fn new(opts: &DatanodeOptions, catalog_list: CatalogListRef) -> Result<Self> {
let log_store = create_local_file_log_store(opts).await?;
let factory = QueryEngineFactory::new(catalog_list.clone()); let factory = QueryEngineFactory::new(catalog_list.clone());
let query_engine = factory.query_engine().clone(); let query_engine = factory.query_engine().clone();
let table_engine = DefaultEngine::new(EngineImpl::new()); let table_engine = DefaultEngine::new(
EngineImpl::new(EngineConfig::default(), Arc::new(log_store))
.await
.context(error::OpenStorageEngineSnafu)?,
);
Self { Ok(Self {
query_engine, query_engine,
sql_handler: SqlHandler::new(table_engine.clone()), sql_handler: SqlHandler::new(table_engine.clone()),
table_engine, table_engine,
catalog_list, catalog_list,
} })
} }
pub async fn execute_sql(&self, sql: &str) -> Result<Output> { pub async fn execute_sql(&self, sql: &str) -> Result<Output> {
@@ -95,7 +103,10 @@ impl Instance {
CreateTableRequest { CreateTableRequest {
name: table_name.to_string(), name: table_name.to_string(),
desc: Some(" a test table".to_string()), desc: Some(" a test table".to_string()),
schema: Arc::new(Schema::new(column_schemas)), schema: Arc::new(
Schema::with_timestamp_index(column_schemas, 3)
.expect("ts is expected to be timestamp column"),
),
}, },
) )
.await .await
@@ -116,6 +127,25 @@ impl Instance {
} }
} }
async fn create_local_file_log_store(opts: &DatanodeOptions) -> Result<LocalFileLogStore> {
// create WAL directory
fs::create_dir_all(path::Path::new(&opts.wal_dir))
.context(error::CreateDirSnafu { dir: &opts.wal_dir })?;
info!("The WAL directory is: {}", &opts.wal_dir);
let log_config = LogConfig {
log_file_dir: opts.wal_dir.clone(),
..Default::default()
};
let log_store = LocalFileLogStore::open(&log_config)
.await
.context(error::OpenLogStoreSnafu)?;
Ok(log_store)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use arrow::array::UInt64Array; use arrow::array::UInt64Array;
@@ -123,12 +153,13 @@ mod tests {
use query::catalog::memory; use query::catalog::memory;
use super::*; use super::*;
use crate::test_util;
#[tokio::test] #[tokio::test]
async fn test_execute_insert() { async fn test_execute_insert() {
let catalog_list = memory::new_memory_catalog_list().unwrap(); let catalog_list = memory::new_memory_catalog_list().unwrap();
let (opts, _tmp_dir) = test_util::create_tmp_dir_and_datanode_opts();
let instance = Instance::new(catalog_list); let instance = Instance::new(&opts, catalog_list).await.unwrap();
instance.start().await.unwrap(); instance.start().await.unwrap();
let output = instance let output = instance
@@ -147,8 +178,8 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_execute_query() { async fn test_execute_query() {
let catalog_list = memory::new_memory_catalog_list().unwrap(); let catalog_list = memory::new_memory_catalog_list().unwrap();
let (opts, _tmp_dir) = test_util::create_tmp_dir_and_datanode_opts();
let instance = Instance::new(catalog_list); let instance = Instance::new(&opts, catalog_list).await.unwrap();
let output = instance let output = instance
.execute_sql("select sum(number) from numbers limit 20") .execute_sql("select sum(number) from numbers limit 20")


@@ -6,5 +6,7 @@ mod metric;
pub mod server; pub mod server;
mod sql; mod sql;
pub use crate::datanode::Datanode; #[cfg(test)]
pub use crate::datanode::DatanodeOptions; pub mod test_util;
#[cfg(test)]
mod tests;


@@ -48,6 +48,7 @@ mod tests {
use super::*; use super::*;
use crate::instance::Instance; use crate::instance::Instance;
use crate::server::http::JsonOutput; use crate::server::http::JsonOutput;
use crate::test_util;
fn create_params() -> Query<HashMap<String, String>> { fn create_params() -> Query<HashMap<String, String>> {
let mut map = HashMap::new(); let mut map = HashMap::new();
@@ -58,15 +59,16 @@ mod tests {
Query(map) Query(map)
} }
fn create_extension() -> Extension<InstanceRef> { async fn create_extension() -> Extension<InstanceRef> {
let catalog_list = memory::new_memory_catalog_list().unwrap(); let catalog_list = memory::new_memory_catalog_list().unwrap();
let instance = Arc::new(Instance::new(catalog_list)); let (opts, _tmp_dir) = test_util::create_tmp_dir_and_datanode_opts();
let instance = Arc::new(Instance::new(&opts, catalog_list).await.unwrap());
Extension(instance) Extension(instance)
} }
#[tokio::test] #[tokio::test]
async fn test_sql_not_provided() { async fn test_sql_not_provided() {
let extension = create_extension(); let extension = create_extension().await;
let json = sql(extension, Query(HashMap::default())).await; let json = sql(extension, Query(HashMap::default())).await;
match json { match json {
@@ -82,7 +84,7 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_sql_output_rows() { async fn test_sql_output_rows() {
let query = create_params(); let query = create_params();
let extension = create_extension(); let extension = create_extension().await;
let json = sql(extension, query).await; let json = sql(extension, query).await;
@@ -110,7 +112,7 @@ mod tests {
counter!("test_metrics", 1); counter!("test_metrics", 1);
let query = create_params(); let query = create_params();
let extension = create_extension(); let extension = create_extension().await;
let text = metrics(extension, query).await; let text = metrics(extension, query).await;
match text { match text {


@@ -63,14 +63,17 @@ mod tests {
use datatypes::prelude::ConcreteDataType; use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::value::Value; use datatypes::value::Value;
use log_store::fs::noop::NoopLogStore;
use query::catalog::memory; use query::catalog::memory;
use query::catalog::schema::SchemaProvider; use query::catalog::schema::SchemaProvider;
use query::error::Result as QueryResult; use query::error::Result as QueryResult;
use query::QueryEngineFactory; use query::QueryEngineFactory;
use storage::config::EngineConfig;
use storage::EngineImpl; use storage::EngineImpl;
use table::error::Result as TableResult; use table::error::Result as TableResult;
use table::{Table, TableRef}; use table::{Table, TableRef};
use table_engine::engine::MitoEngine; use table_engine::engine::MitoEngine;
use tempdir::TempDir;
use super::*; use super::*;
@@ -90,7 +93,7 @@ mod tests {
ColumnSchema::new("ts", ConcreteDataType::int64_datatype(), true), ColumnSchema::new("ts", ConcreteDataType::int64_datatype(), true),
]; ];
Arc::new(Schema::new(column_schemas)) Arc::new(Schema::with_timestamp_index(column_schemas, 3).unwrap())
} }
async fn scan( async fn scan(
&self, &self,
@@ -129,8 +132,11 @@ mod tests {
} }
} }
#[test] #[tokio::test]
fn test_statement_to_request() { async fn test_statement_to_request() {
let dir = TempDir::new("setup_test_engine_and_table").unwrap();
let store_dir = dir.path().to_string_lossy();
let catalog_list = memory::new_memory_catalog_list().unwrap(); let catalog_list = memory::new_memory_catalog_list().unwrap();
let factory = QueryEngineFactory::new(catalog_list); let factory = QueryEngineFactory::new(catalog_list);
let query_engine = factory.query_engine().clone(); let query_engine = factory.query_engine().clone();
@@ -140,7 +146,14 @@ mod tests {
('host2', 88.8, 333.3, 1655276558000) ('host2', 88.8, 333.3, 1655276558000)
"#; "#;
let table_engine = MitoEngine::<EngineImpl>::new(EngineImpl::new()); let table_engine = MitoEngine::<EngineImpl<NoopLogStore>>::new(
EngineImpl::new(
EngineConfig::with_store_dir(&store_dir),
Arc::new(NoopLogStore::default()),
)
.await
.unwrap(),
);
let sql_handler = SqlHandler::new(table_engine); let sql_handler = SqlHandler::new(table_engine);
let stmt = query_engine.sql_to_statement(sql).unwrap(); let stmt = query_engine.sql_to_statement(sql).unwrap();


@@ -0,0 +1,17 @@
use tempdir::TempDir;
use crate::datanode::DatanodeOptions;
/// Creates a tmp dir (deleted once it goes out of scope) and a default `DatanodeOptions`.
/// Only for tests.
///
/// TODO: Add a test feature
pub fn create_tmp_dir_and_datanode_opts() -> (DatanodeOptions, TempDir) {
let tmp_dir = TempDir::new("/tmp/greptimedb_test").unwrap();
let opts = DatanodeOptions {
wal_dir: tmp_dir.path().to_str().unwrap().to_string(),
..Default::default()
};
(opts, tmp_dir)
}


@@ -0,0 +1 @@
mod http_test;


@@ -5,12 +5,16 @@ use std::sync::Arc;
use axum::http::StatusCode; use axum::http::StatusCode;
use axum::Router; use axum::Router;
use axum_test_helper::TestClient; use axum_test_helper::TestClient;
use datanode::{instance::Instance, server::http::HttpServer};
use query::catalog::memory; use query::catalog::memory;
fn make_test_app() -> Router { use crate::instance::Instance;
use crate::server::http::HttpServer;
use crate::test_util;
async fn make_test_app() -> Router {
let catalog_list = memory::new_memory_catalog_list().unwrap(); let catalog_list = memory::new_memory_catalog_list().unwrap();
let instance = Arc::new(Instance::new(catalog_list)); let (opts, _tmp_dir) = test_util::create_tmp_dir_and_datanode_opts();
let instance = Arc::new(Instance::new(&opts, catalog_list).await.unwrap());
let http_server = HttpServer::new(instance); let http_server = HttpServer::new(instance);
http_server.make_app() http_server.make_app()
} }
@@ -18,7 +22,7 @@ fn make_test_app() -> Router {
#[tokio::test] #[tokio::test]
async fn test_sql_api() { async fn test_sql_api() {
common_telemetry::init_default_ut_logging(); common_telemetry::init_default_ut_logging();
let app = make_test_app(); let app = make_test_app().await;
let client = TestClient::new(app); let client = TestClient::new(app);
let res = client.get("/sql").send().await; let res = client.get("/sql").send().await;
assert_eq!(res.status(), StatusCode::OK); assert_eq!(res.status(), StatusCode::OK);
@@ -46,7 +50,7 @@ async fn test_sql_api() {
async fn test_metrics_api() { async fn test_metrics_api() {
common_telemetry::init_default_ut_logging(); common_telemetry::init_default_ut_logging();
common_telemetry::init_default_metrics_recorder(); common_telemetry::init_default_metrics_recorder();
let app = make_test_app(); let app = make_test_app().await;
let client = TestClient::new(app); let client = TestClient::new(app);
// Send a sql // Send a sql


@@ -13,10 +13,10 @@ common-base = { path = "../common/base" }
common-error = { path = "../common/error" } common-error = { path = "../common/error" }
datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git" , branch = "arrow2" } datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git" , branch = "arrow2" }
enum_dispatch = "0.3" enum_dispatch = "0.3"
ordered-float = "3.0"
paste = "1.0"
num = "0.4" num = "0.4"
num-traits = "0.2" num-traits = "0.2"
serde = { version = "1.0.136", features = ["derive"] } ordered-float = { version = "3.0", features = ["serde"]}
paste = "1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = "1.0"
snafu = { version = "0.7", features = ["backtraces"] } snafu = { version = "0.7", features = ["backtraces"] }


@@ -2,6 +2,7 @@ use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
use paste::paste; use paste::paste;
use serde::{Deserialize, Serialize};
use crate::error::{self, Error, Result}; use crate::error::{self, Error, Result};
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
@@ -11,7 +12,7 @@ use crate::types::{
}; };
use crate::value::Value; use crate::value::Value;
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[enum_dispatch::enum_dispatch(DataType)] #[enum_dispatch::enum_dispatch(DataType)]
pub enum ConcreteDataType { pub enum ConcreteDataType {
Null(NullType), Null(NullType),
@@ -72,6 +73,10 @@ impl ConcreteDataType {
) )
} }
pub fn is_timestamp(&self) -> bool {
matches!(self, ConcreteDataType::Int64(_))
}
pub fn numerics() -> Vec<ConcreteDataType> { pub fn numerics() -> Vec<ConcreteDataType> {
vec![ vec![
ConcreteDataType::int8_datatype(), ConcreteDataType::int8_datatype(),


@@ -30,6 +30,20 @@ pub enum Error {
arrow_type: arrow::datatypes::DataType, arrow_type: arrow::datatypes::DataType,
backtrace: Backtrace, backtrace: Backtrace,
}, },
#[snafu(display(
"Failed to parse index in schema meta, value: {}, source: {}",
value,
source
))]
ParseSchemaIndex {
value: String,
source: std::num::ParseIntError,
backtrace: Backtrace,
},
#[snafu(display("Invalid timestamp index: {}", index))]
InvalidTimestampIndex { index: usize, backtrace: Backtrace },
} }
impl ErrorExt for Error { impl ErrorExt for Error {


@@ -1,15 +1,19 @@
use std::collections::HashMap; use std::collections::{BTreeMap, HashMap};
use std::sync::Arc; use std::sync::Arc;
use arrow::datatypes::{Field, Schema as ArrowSchema}; use arrow::datatypes::{Field, Metadata, Schema as ArrowSchema};
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use crate::data_type::{ConcreteDataType, DataType}; use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{Error, Result}; use crate::error::{self, Error, Result};
const TIMESTAMP_INDEX_KEY: &str = "greptime:timestamp_index";
// TODO(yingwen): consider assign a version to schema so compare schema can be // TODO(yingwen): consider assign a version to schema so compare schema can be
// done by compare version. // done by compare version.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ColumnSchema { pub struct ColumnSchema {
pub name: String, pub name: String,
pub data_type: ConcreteDataType, pub data_type: ConcreteDataType,
@@ -30,31 +34,49 @@ impl ColumnSchema {
} }
} }
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Schema { pub struct Schema {
column_schemas: Vec<ColumnSchema>, column_schemas: Vec<ColumnSchema>,
name_to_index: HashMap<String, usize>, name_to_index: HashMap<String, usize>,
arrow_schema: Arc<ArrowSchema>, arrow_schema: Arc<ArrowSchema>,
/// Index of the timestamp key column.
///
/// The timestamp key column is the column that holds the timestamp and forms part of
/// the primary key. None means there is no timestamp key column.
timestamp_index: Option<usize>,
} }
impl Schema { impl Schema {
pub fn new(column_schemas: Vec<ColumnSchema>) -> Schema { pub fn new(column_schemas: Vec<ColumnSchema>) -> Schema {
let mut fields = Vec::with_capacity(column_schemas.len()); let (arrow_schema, name_to_index) = collect_column_schemas(&column_schemas);
let mut name_to_index = HashMap::with_capacity(column_schemas.len());
for (index, column_schema) in column_schemas.iter().enumerate() {
let field = Field::from(column_schema);
fields.push(field);
name_to_index.insert(column_schema.name.clone(), index);
}
let arrow_schema = Arc::new(ArrowSchema::from(fields));
Schema { Schema {
column_schemas, column_schemas,
name_to_index, name_to_index,
arrow_schema, arrow_schema: Arc::new(arrow_schema),
timestamp_index: None,
} }
} }
pub fn with_timestamp_index(
column_schemas: Vec<ColumnSchema>,
timestamp_index: usize,
) -> Result<Schema> {
let (arrow_schema, name_to_index) = collect_column_schemas(&column_schemas);
let mut metadata = BTreeMap::new();
metadata.insert(TIMESTAMP_INDEX_KEY.to_string(), timestamp_index.to_string());
let arrow_schema = Arc::new(arrow_schema.with_metadata(metadata));
validate_timestamp_index(&column_schemas, timestamp_index)?;
Ok(Schema {
column_schemas,
name_to_index,
arrow_schema,
timestamp_index: Some(timestamp_index),
})
}
pub fn arrow_schema(&self) -> &Arc<ArrowSchema> { pub fn arrow_schema(&self) -> &Arc<ArrowSchema> {
&self.arrow_schema &self.arrow_schema
} }
@@ -68,6 +90,55 @@ impl Schema {
.get(name) .get(name)
.map(|index| &self.column_schemas[*index]) .map(|index| &self.column_schemas[*index])
} }
#[inline]
pub fn num_columns(&self) -> usize {
self.column_schemas.len()
}
/// Returns index of the timestamp key column.
#[inline]
pub fn timestamp_index(&self) -> Option<usize> {
self.timestamp_index
}
#[inline]
pub fn timestamp_column(&self) -> Option<&ColumnSchema> {
self.timestamp_index.map(|idx| &self.column_schemas[idx])
}
}
fn collect_column_schemas(
column_schemas: &[ColumnSchema],
) -> (ArrowSchema, HashMap<String, usize>) {
let mut fields = Vec::with_capacity(column_schemas.len());
let mut name_to_index = HashMap::with_capacity(column_schemas.len());
for (index, column_schema) in column_schemas.iter().enumerate() {
let field = Field::from(column_schema);
fields.push(field);
name_to_index.insert(column_schema.name.clone(), index);
}
(ArrowSchema::from(fields), name_to_index)
}
fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: usize) -> Result<()> {
ensure!(
timestamp_index < column_schemas.len(),
error::InvalidTimestampIndexSnafu {
index: timestamp_index,
}
);
let column_schema = &column_schemas[timestamp_index];
ensure!(
column_schema.data_type.is_timestamp(),
error::InvalidTimestampIndexSnafu {
index: timestamp_index,
}
);
Ok(())
} }
pub type SchemaRef = Arc<Schema>; pub type SchemaRef = Arc<Schema>;
@@ -108,14 +179,32 @@ impl TryFrom<Arc<ArrowSchema>> for Schema {
column_schemas.push(column_schema); column_schemas.push(column_schema);
} }
let timestamp_index = try_parse_index(&arrow_schema.metadata, TIMESTAMP_INDEX_KEY)?;
if let Some(index) = timestamp_index {
validate_timestamp_index(&column_schemas, index)?;
}
Ok(Self { Ok(Self {
column_schemas, column_schemas,
name_to_index, name_to_index,
arrow_schema, arrow_schema,
timestamp_index,
}) })
} }
} }
fn try_parse_index(metadata: &Metadata, key: &str) -> Result<Option<usize>> {
if let Some(value) = metadata.get(key) {
let index = value
.parse()
.context(error::ParseSchemaIndexSnafu { value })?;
Ok(Some(index))
} else {
Ok(None)
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
@@ -135,13 +224,17 @@ mod tests {
} }
#[test] #[test]
fn test_schema() { fn test_schema_no_timestamp() {
let column_schemas = vec![ let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), false), ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), false),
ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), true), ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), true),
]; ];
let schema = Schema::new(column_schemas.clone()); let schema = Schema::new(column_schemas.clone());
assert_eq!(2, schema.num_columns());
assert!(schema.timestamp_index().is_none());
assert!(schema.timestamp_column().is_none());
for column_schema in &column_schemas { for column_schema in &column_schemas {
let found = schema.column_schema_by_name(&column_schema.name).unwrap(); let found = schema.column_schema_by_name(&column_schema.name).unwrap();
assert_eq!(column_schema, found); assert_eq!(column_schema, found);
@@ -158,4 +251,31 @@ mod tests {
assert_eq!(arrow_schema, *schema.arrow_schema()); assert_eq!(arrow_schema, *schema.arrow_schema());
assert_eq!(arrow_schema, *new_schema.arrow_schema()); assert_eq!(arrow_schema, *new_schema.arrow_schema());
} }
#[test]
fn test_schema_with_timestamp() {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new("ts", ConcreteDataType::int64_datatype(), false),
];
let schema = Schema::with_timestamp_index(column_schemas.clone(), 1).unwrap();
assert_eq!(1, schema.timestamp_index().unwrap());
assert_eq!(&column_schemas[1], schema.timestamp_column().unwrap());
let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap();
assert_eq!(1, schema.timestamp_index().unwrap());
assert_eq!(schema, new_schema);
}
#[test]
fn test_schema_wrong_timestamp() {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), false),
];
assert!(Schema::with_timestamp_index(column_schemas.clone(), 0).is_err());
assert!(Schema::with_timestamp_index(column_schemas.clone(), 1).is_err());
assert!(Schema::with_timestamp_index(column_schemas, 2).is_err());
}
} }


@@ -2,12 +2,13 @@ use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::StringBytes; use common_base::bytes::StringBytes;
use serde::{Deserialize, Serialize};
use crate::data_type::{DataType, DataTypeRef}; use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
use crate::value::Value; use crate::value::Value;
#[derive(Debug, Default, Clone, PartialEq)] #[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct BinaryType; pub struct BinaryType;
impl BinaryType { impl BinaryType {


@@ -1,12 +1,13 @@
use std::sync::Arc; use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
use serde::{Deserialize, Serialize};
use crate::data_type::{DataType, DataTypeRef}; use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
use crate::value::Value; use crate::value::Value;
#[derive(Debug, Default, Clone, PartialEq)] #[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct BooleanType; pub struct BooleanType;
impl BooleanType { impl BooleanType {


@@ -1,10 +1,11 @@
use arrow::datatypes::{DataType as ArrowDataType, Field}; use arrow::datatypes::{DataType as ArrowDataType, Field};
use serde::{Deserialize, Serialize};
use crate::prelude::*; use crate::prelude::*;
use crate::value::ListValue; use crate::value::ListValue;
/// Used to represent the List datatype. /// Used to represent the List datatype.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ListType { pub struct ListType {
/// The type of List's inner data. /// The type of List's inner data.
inner: Box<ConcreteDataType>, inner: Box<ConcreteDataType>,


@@ -1,12 +1,13 @@
use std::sync::Arc; use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
use serde::{Deserialize, Serialize};
use crate::data_type::{DataType, DataTypeRef}; use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
use crate::value::Value; use crate::value::Value;
#[derive(Debug, Default, Clone, PartialEq)] #[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct NullType; pub struct NullType;
impl NullType { impl NullType {


@@ -2,14 +2,16 @@ use std::marker::PhantomData;
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
use paste::paste; use paste::paste;
use serde::{Deserialize, Serialize};
use crate::data_type::{ConcreteDataType, DataType}; use crate::data_type::{ConcreteDataType, DataType};
use crate::type_id::LogicalTypeId; use crate::type_id::LogicalTypeId;
use crate::types::primitive_traits::Primitive; use crate::types::primitive_traits::Primitive;
use crate::value::Value; use crate::value::Value;
#[derive(Clone, PartialEq)] #[derive(Clone, PartialEq, Serialize, Deserialize)]
pub struct PrimitiveType<T: Primitive> { pub struct PrimitiveType<T: Primitive> {
#[serde(skip)]
_phantom: PhantomData<T>, _phantom: PhantomData<T>,
} }


@@ -2,11 +2,12 @@ use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType; use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::StringBytes; use common_base::bytes::StringBytes;
use serde::{Deserialize, Serialize};
use crate::data_type::DataType; use crate::data_type::DataType;
use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; use crate::prelude::{DataTypeRef, LogicalTypeId, Value};
#[derive(Debug, Default, Clone, PartialEq)] #[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct StringType; pub struct StringType;
impl StringType { impl StringType {


@@ -3,7 +3,7 @@ use std::cmp::Ordering;
use common_base::bytes::{Bytes, StringBytes}; use common_base::bytes::{Bytes, StringBytes};
use datafusion_common::ScalarValue; use datafusion_common::ScalarValue;
pub use ordered_float::OrderedFloat; pub use ordered_float::OrderedFloat;
use serde::{Serialize, Serializer}; use serde::{Deserialize, Serialize, Serializer};
use crate::prelude::*; use crate::prelude::*;
@@ -15,7 +15,7 @@ pub type OrderedF64 = OrderedFloat<f64>;
/// Although compare Value with different data type is allowed, it is recommended to only /// Although compare Value with different data type is allowed, it is recommended to only
/// compare Value with same data type. Comparing Value with different data type may not /// compare Value with same data type. Comparing Value with different data type may not
/// behaves as what you expect. /// behaves as what you expect.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize)]
pub enum Value { pub enum Value {
Null, Null,
@@ -187,7 +187,7 @@ impl From<Value> for ScalarValue {
} }
} }
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ListValue { pub struct ListValue {
/// List of nested Values (boxed to reduce size_of(Value)) /// List of nested Values (boxed to reduce size_of(Value))
#[allow(clippy::box_collection)] #[allow(clippy::box_collection)]


@@ -1,14 +1,15 @@
use store_api::logstore::entry::{Id, Offset}; use store_api::logstore::entry::{Id, Offset};
use store_api::logstore::AppendResponse; use store_api::logstore::AppendResponse;
mod config; pub mod config;
mod crc; mod crc;
mod entry; mod entry;
mod file; mod file;
mod file_name; mod file_name;
mod index; mod index;
mod log; pub mod log;
mod namespace; mod namespace;
pub mod noop;
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub struct AppendResponseImpl { pub struct AppendResponseImpl {


@@ -463,81 +463,82 @@ impl AppendRequest {
} }
} }
#[cfg(test)] // TODO(hl): uncomment this test once log file read visibility issue fixed.
mod tests { // #[cfg(test)]
use std::io::Read; // mod tests {
// use std::io::Read;
use common_telemetry::logging; //
use futures_util::StreamExt; // use common_telemetry::logging;
use tempdir::TempDir; // use futures_util::StreamExt;
// use tempdir::TempDir;
use super::*; //
use crate::fs::namespace::LocalNamespace; // use super::*;
// use crate::fs::namespace::LocalNamespace;
#[tokio::test] //
pub async fn test_create_entry_stream() { // #[tokio::test]
logging::init_default_ut_logging(); // pub async fn test_create_entry_stream() {
let config = LogConfig::default(); // logging::init_default_ut_logging();
// let config = LogConfig::default();
let dir = TempDir::new("greptimedb-store-test").unwrap(); //
let path_buf = dir.path().join("0010.log"); // let dir = TempDir::new("greptimedb-store-test").unwrap();
let path = path_buf.to_str().unwrap().to_string(); // let path_buf = dir.path().join("0010.log");
File::create(path.as_str()).await.unwrap(); // let path = path_buf.to_str().unwrap().to_string();
// File::create(path.as_str()).await.unwrap();
let mut file = LogFile::open(path.clone(), &config) //
.await // let mut file = LogFile::open(path.clone(), &config)
.unwrap_or_else(|_| panic!("Failed to open file: {}", path)); // .await
file.start().await.expect("Failed to start log file"); // .unwrap_or_else(|_| panic!("Failed to open file: {}", path));
// file.start().await.expect("Failed to start log file");
assert_eq!( //
10, // assert_eq!(
file.append(&mut EntryImpl::new("test1".as_bytes())) // 10,
.await // file.append(&mut EntryImpl::new("test1".as_bytes()))
.expect("Failed to append entry 1") // .await
.entry_id // .expect("Failed to append entry 1")
); // .entry_id
// );
assert_eq!( //
11, // assert_eq!(
file.append(&mut EntryImpl::new("test-2".as_bytes())) // 11,
.await // file.append(&mut EntryImpl::new("test-2".as_bytes()))
.expect("Failed to append entry 2") // .await
.entry_id // .expect("Failed to append entry 2")
); // .entry_id
// );
let mut log_file = std::fs::File::open(path.clone()).expect("Test log file does not exist"); //
let metadata = log_file.metadata().expect("Failed to read file metadata"); // let mut log_file = std::fs::File::open(path.clone()).expect("Test log file does not exist");
info!("Log file metadata: {:?}", metadata); // let metadata = log_file.metadata().expect("Failed to read file metadata");
// info!("Log file metadata: {:?}", metadata);
assert_eq!(59, metadata.len()); // 24+5+24+6 //
let mut content = vec![0; metadata.len() as usize]; // assert_eq!(59, metadata.len()); // 24+5+24+6
log_file // let mut content = vec![0; metadata.len() as usize];
.read_exact(&mut content) // log_file
.expect("Read log file failed"); // .read_exact(&mut content)
// .expect("Read log file failed");
info!( //
"Log file {:?} content: {}, size:{}", // info!(
dir, // "Log file {:?} content: {}, size:{}",
hex::encode(content), // dir,
metadata.len() // hex::encode(content),
); // metadata.len()
// );
let mut stream = file.create_stream(LocalNamespace::default(), 0); //
// let mut stream = file.create_stream(LocalNamespace::default(), 0);
let mut data = vec![]; //
// let mut data = vec![];
while let Some(v) = stream.next().await { //
let entries = v.unwrap(); // while let Some(v) = stream.next().await {
let content = entries[0].data(); // let entries = v.unwrap();
let vec = content.to_vec(); // let content = entries[0].data();
info!("Read entry: {}", String::from_utf8_lossy(&vec)); // let vec = content.to_vec();
data.push(String::from_utf8(vec).unwrap()); // info!("Read entry: {}", String::from_utf8_lossy(&vec));
} // data.push(String::from_utf8(vec).unwrap());
// }
assert_eq!(vec!["test1".to_string(), "test-2".to_string()], data); //
drop(stream); // assert_eq!(vec!["test1".to_string(), "test-2".to_string()], data);
// drop(stream);
let result = file.stop().await; //
info!("Stop file res: {:?}", result); // let result = file.stop().await;
} // info!("Stop file res: {:?}", result);
} // }
// }


@@ -5,7 +5,7 @@ use std::sync::Arc;
use arc_swap::ArcSwap; use arc_swap::ArcSwap;
use common_telemetry::{error, info, warn}; use common_telemetry::{error, info, warn};
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use store_api::logstore::entry::Id; use store_api::logstore::entry::{Encode, Id};
use store_api::logstore::LogStore; use store_api::logstore::LogStore;
use tokio::sync::RwLock; use tokio::sync::RwLock;
@@ -167,17 +167,20 @@ impl LogStore for LocalFileLogStore {
async fn append( async fn append(
&self, &self,
_ns: Self::Namespace, _ns: Self::Namespace,
mut e: Self::Entry, mut entry: Self::Entry,
) -> Result<Self::AppendResponse> { ) -> Result<Self::AppendResponse> {
// TODO(hl): configurable retry times // TODO(hl): configurable retry times
for _ in 0..3 { for _ in 0..3 {
let current_active_file = self.active_file(); let current_active_file = self.active_file();
match current_active_file.append(&mut e).await { match current_active_file.append(&mut entry).await {
Ok(r) => return Ok(r), Ok(r) => return Ok(r),
Err(e) => match e { Err(e) => match e {
Error::Eof => { Error::Eof => {
self.roll_next(current_active_file.clone()).await?; self.roll_next(current_active_file.clone()).await?;
info!("Rolled to next file, retry append"); info!(
"Rolled to next file, retry append, entry size: {}",
entry.encoded_size()
);
continue; continue;
} }
Error::Internal { .. } => { Error::Internal { .. } => {


@@ -19,6 +19,14 @@ struct LocalNamespaceInner {
} }
impl Namespace for LocalNamespace { impl Namespace for LocalNamespace {
fn new(name: &str, id: u64) -> Self {
let inner = Arc::new(LocalNamespaceInner {
name: name.to_string(),
id,
});
Self { inner }
}
fn name(&self) -> &str { fn name(&self) -> &str {
self.inner.name.as_str() self.inner.name.as_str()
} }
@@ -29,12 +37,4 @@ impl LocalNamespace {
fn id(&self) -> u64 { fn id(&self) -> u64 {
self.inner.id self.inner.id
} }
pub fn new(name: &str, id: u64) -> Self {
let inner = Arc::new(LocalNamespaceInner {
name: name.to_string(),
id,
});
Self { inner }
}
} }


@@ -0,0 +1,53 @@
use store_api::logstore::{entry::Id, LogStore};
use crate::error::{Error, Result};
use crate::fs::{entry::EntryImpl, namespace::LocalNamespace, AppendResponseImpl};
/// A no-op log store, intended only for tests.
// TODO: Add a test feature
#[derive(Default)]
pub struct NoopLogStore {}
#[async_trait::async_trait]
impl LogStore for NoopLogStore {
type Error = Error;
type Namespace = LocalNamespace;
type Entry = EntryImpl;
type AppendResponse = AppendResponseImpl;
async fn append(
&self,
_ns: Self::Namespace,
mut _e: Self::Entry,
) -> Result<Self::AppendResponse> {
Ok(AppendResponseImpl {
entry_id: 0,
offset: 0,
})
}
async fn append_batch(&self, _ns: Self::Namespace, _e: Vec<Self::Entry>) -> Result<Id> {
todo!()
}
async fn read(
&self,
_ns: Self::Namespace,
_id: Id,
) -> Result<store_api::logstore::entry_stream::SendableEntryStream<'_, Self::Entry, Self::Error>>
{
todo!()
}
async fn create_namespace(&mut self, _ns: Self::Namespace) -> Result<()> {
todo!()
}
async fn delete_namespace(&mut self, _ns: Self::Namespace) -> Result<()> {
todo!()
}
async fn list_namespaces(&self) -> Result<Vec<Self::Namespace>> {
todo!()
}
}
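A minimal usage sketch (not part of this diff) of the no-op store in a test that could sit next to it; it assumes `LocalNamespace` provides a `Default` impl, as the commented-out `create_stream` test above suggests:

```rust
#[cfg(test)]
mod noop_tests {
    use super::*;

    #[tokio::test]
    async fn test_append_to_noop_store() {
        let store = NoopLogStore::default();
        // Entries are discarded; the response always reports entry_id 0 / offset 0.
        let resp = store
            .append(LocalNamespace::default(), EntryImpl::new("ignored".as_bytes()))
            .await
            .unwrap();
        assert_eq!(0, resp.entry_id);
        assert_eq!(0, resp.offset);
    }
}
```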


@@ -1,2 +1,4 @@
mod error; pub mod error;
pub mod fs; pub mod fs;
pub mod test_util;


@@ -0,0 +1 @@
pub mod log_store_util;


@@ -0,0 +1,16 @@
use tempdir::TempDir;
use crate::fs::{config::LogConfig, log::LocalFileLogStore};
/// Create a temporary directory for writing logs; intended for tests.
// TODO: Add a test feature
pub async fn create_tmp_local_file_log_store(dir: &str) -> (LocalFileLogStore, TempDir) {
let dir = TempDir::new(dir).unwrap();
let cfg = LogConfig {
append_buffer_size: 128,
max_log_file_size: 128,
log_file_dir: dir.path().to_str().unwrap().to_string(),
};
(LocalFileLogStore::open(&cfg).await.unwrap(), dir)
}
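A hedged usage sketch (not part of this diff): the helper returns the `TempDir` guard together with the store, and callers must keep that guard alive for as long as the store is used, since dropping it deletes the directory backing the log files.

```rust
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_open_tmp_log_store() {
        // `_guard` owns the temporary directory; it is removed when the guard drops,
        // so it must outlive the store.
        let (log_store, _guard) = create_tmp_local_file_log_store("log-store-util-test").await;
        drop(log_store);
    }
}
```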


@@ -7,7 +7,7 @@ edition = "2021"
[dependencies] [dependencies]
futures = { version = "0.3"} futures = { version = "0.3"}
opendal = "0.6" opendal = "0.9"
tokio = { version = "1.0", features = ["full"] } tokio = { version = "1.0", features = ["full"] }
[dev-dependencies] [dev-dependencies]


@@ -1,5 +1,6 @@
pub use opendal::{ pub use opendal::{
Accessor, Layer, Metadata, Object, ObjectMode, ObjectStreamer, Operator as ObjectStore, Accessor, DirEntry, DirStreamer, Layer, Metadata, Object, ObjectMetadata, ObjectMode,
Operator as ObjectStore,
}; };
pub mod backend; pub mod backend;
pub mod util; pub mod util;


@@ -1,7 +1,29 @@
use futures::TryStreamExt; use futures::TryStreamExt;
use crate::{Object, ObjectStreamer}; use crate::{DirEntry, DirStreamer};
pub async fn collect(stream: ObjectStreamer) -> Result<Vec<Object>, std::io::Error> { pub async fn collect(stream: DirStreamer) -> Result<Vec<DirEntry>, std::io::Error> {
stream.try_collect::<Vec<_>>().await stream.try_collect::<Vec<_>>().await
} }
/// Normalize a directory path, ensuring it ends with '/'
pub fn normalize_dir(dir: &str) -> String {
let mut dir = dir.to_string();
if !dir.ends_with('/') {
dir.push('/')
}
dir
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_normalize_dir() {
assert_eq!("/", normalize_dir("/"));
assert_eq!("/", normalize_dir(""));
assert_eq!("/test/", normalize_dir("/test"));
}
}


@@ -4,7 +4,7 @@ use anyhow::Result;
use common_telemetry::logging; use common_telemetry::logging;
use object_store::{ use object_store::{
backend::{fs, s3}, backend::{fs, s3},
util, Object, ObjectMode, ObjectStore, ObjectStreamer, util, DirStreamer, Object, ObjectMode, ObjectStore,
}; };
use tempdir::TempDir; use tempdir::TempDir;
@@ -25,8 +25,7 @@ async fn test_object_crud(store: &ObjectStore) -> Result<()> {
// Get object's Metadata // Get object's Metadata
let meta = object.metadata().await?; let meta = object.metadata().await?;
assert!(meta.complete()); assert_eq!("test_file", object.path());
assert_eq!("test_file", meta.path());
assert_eq!(ObjectMode::FILE, meta.mode()); assert_eq!(ObjectMode::FILE, meta.mode());
assert_eq!(13, meta.content_length()); assert_eq!(13, meta.content_length());
@@ -50,7 +49,7 @@ async fn test_object_list(store: &ObjectStore) -> Result<()> {
// List objects // List objects
let o: Object = store.object("/"); let o: Object = store.object("/");
let obs: ObjectStreamer = o.list().await?; let obs: DirStreamer = o.list().await?;
let objects = util::collect(obs).await?; let objects = util::collect(obs).await?;
assert_eq!(3, objects.len()); assert_eq!(3, objects.len());
@@ -63,7 +62,7 @@ async fn test_object_list(store: &ObjectStore) -> Result<()> {
assert_eq!(1, objects.len()); assert_eq!(1, objects.len());
// Only o2 exists // Only o2 exists
let o2 = &objects[0]; let o2 = &objects[0].clone().into_object();
let bs = o2.read().await?; let bs = o2.read().await?;
assert_eq!("Hello, object2!", String::from_utf8(bs)?); assert_eq!("Hello, object2!", String::from_utf8(bs)?);
// Delete o2 // Delete o2


@@ -7,18 +7,39 @@ edition = "2021"
[dependencies] [dependencies]
arc-swap = "1.0" arc-swap = "1.0"
arrow-format = { version = "0.4", features = ["ipc"] }
async-trait = "0.1" async-trait = "0.1"
bit-vec = "0.6"
bytes = "1.1"
common-error = { path = "../common/error" } common-error = { path = "../common/error" }
common-runtime = { path = "../common/runtime" }
common-telemetry = { path = "../common/telemetry" } common-telemetry = { path = "../common/telemetry" }
common-time = { path = "../common/time" }
datatypes = { path = "../datatypes" } datatypes = { path = "../datatypes" }
futures = "0.3"
futures-util = "0.3"
lazy_static = "1.4"
log-store = { path = "../log-store" }
object-store = { path = "../object-store" }
planus = "0.2"
prost = "0.10"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
snafu = { version = "0.7", features = ["backtraces"] } snafu = { version = "0.7", features = ["backtraces"] }
store-api = { path = "../store-api" } store-api = { path = "../store-api" }
regex = "1.5"
tokio = { version = "1.18", features = ["full"] } tokio = { version = "1.18", features = ["full"] }
tonic = "0.7"
uuid = { version = "1.1" , features=["v4"]}
[dev-dependencies] [dev-dependencies]
atomic_float="0.1"
criterion = "0.3" criterion = "0.3"
rand = "0.8" rand = "0.8"
atomic_float="0.1" tempdir = "0.3"
[build-dependencies]
tonic-build = "0.7"
[[bench]] [[bench]]
name = "bench_main" name = "bench_main"


@@ -27,9 +27,11 @@ impl BenchContext {
let iter_ctx = IterContext { let iter_ctx = IterContext {
batch_size, batch_size,
visible_sequence: SequenceNumber::MAX, visible_sequence: SequenceNumber::MAX,
for_flush: false,
}; };
let mut iter = self.memtable.iter(iter_ctx).unwrap(); let iter = self.memtable.iter(iter_ctx).unwrap();
while let Ok(Some(_)) = iter.next() { for batch in iter {
batch.unwrap();
read_count += batch_size; read_count += batch_size;
} }
read_count read_count


@@ -22,5 +22,5 @@ pub fn schema_for_test() -> MemtableSchema {
} }
pub fn new_memtable() -> MemtableRef { pub fn new_memtable() -> MemtableRef {
DefaultMemtableBuilder {}.build(schema_for_test()) DefaultMemtableBuilder {}.build(1, schema_for_test())
} }

src/storage/build.rs

@@ -0,0 +1,5 @@
fn main() {
tonic_build::configure()
.compile(&["proto/wal.proto"], &["."])
.expect("compile wal proto");
}


@@ -0,0 +1,25 @@
syntax = "proto3";
package greptime.storage.wal.v1;
message WalHeader {
PayloadType payload_type = 1;
uint64 last_manifest_version = 2;
repeated MutationExtra mutation_extras = 3;
}
enum PayloadType {
NONE = 0;
WRITE_BATCH_ARROW = 1;
WRITE_BATCH_PROTO = 2;
}
message MutationExtra {
MutationType mutation_type = 1;
bytes column_null_mask = 2;
}
enum MutationType {
PUT = 0;
DELETE = 1;
}
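A hedged sketch (not part of this diff) of how the generated types might be used from Rust. It assumes `build.rs` above leaves the prost/tonic output in the default `OUT_DIR` and that the storage crate exposes it through a `proto::wal` module; that module path and the field values are assumptions for illustration only.

```rust
// Hypothetical wiring (the real `mod proto` layout is not shown in this diff):
// pub mod wal { tonic::include_proto!("greptime.storage.wal.v1"); }

use prost::Message;

use crate::proto::wal::{MutationExtra, MutationType, PayloadType, WalHeader};

fn encode_header() -> Vec<u8> {
    let header = WalHeader {
        payload_type: PayloadType::WriteBatchArrow as i32,
        last_manifest_version: 0,
        mutation_extras: vec![MutationExtra {
            mutation_type: MutationType::Put as i32,
            // One bit per column; an empty mask means no column is entirely null.
            column_null_mask: vec![],
        }],
    };
    // Framing (length prefix, checksum, ...) is left to the WAL writer.
    header.encode_to_vec()
}
```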


@@ -0,0 +1,225 @@
//! Forked from [arrow2](https://github.com/jorgecarleitao/arrow2/blob/v0.10.1/src/io/ipc/read/stream.rs),
//! with a slight change: arrow2 can only use the same schema to read every data chunk, which cannot
//! handle chunks with missing (all-null) columns, so a `column_null_mask` parameter was added to the
//! `StreamReader#maybe_next` method to filter those columns out of the schema before reading.
use std::io::Read;
use arrow_format::{self, ipc::planus::ReadAsRoot};
use datatypes::arrow::{
datatypes::Schema,
error::{ArrowError, Result},
io::ipc::{
read::{read_dictionary, read_record_batch, Dictionaries, StreamMetadata, StreamState},
IpcSchema,
},
};
const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
pub struct ArrowStreamReader<R: Read> {
reader: R,
metadata: StreamMetadata,
dictionaries: Dictionaries,
finished: bool,
data_buffer: Vec<u8>,
message_buffer: Vec<u8>,
}
impl<R: Read> ArrowStreamReader<R> {
pub fn new(reader: R, metadata: StreamMetadata) -> Self {
Self {
reader,
metadata,
dictionaries: Default::default(),
finished: false,
data_buffer: vec![],
message_buffer: vec![],
}
}
/// Return the schema of the stream
pub fn metadata(&self) -> &StreamMetadata {
&self.metadata
}
/// Check if the stream is finished
pub fn is_finished(&self) -> bool {
self.finished
}
/// Check if the stream is exactly finished
pub fn check_exactly_finished(&mut self) -> Result<bool> {
if self.is_finished() {
return Ok(false);
}
let _ = self.maybe_next(&[])?;
Ok(self.is_finished())
}
pub fn maybe_next(&mut self, column_null_mask: &[u8]) -> Result<Option<StreamState>> {
if self.finished {
return Ok(None);
}
let batch = if column_null_mask.is_empty() {
read_next(
&mut self.reader,
&self.metadata,
&mut self.dictionaries,
&mut self.message_buffer,
&mut self.data_buffer,
)?
} else {
read_next(
&mut self.reader,
&valid_metadata(&self.metadata, column_null_mask),
&mut self.dictionaries,
&mut self.message_buffer,
&mut self.data_buffer,
)?
};
if batch.is_none() {
self.finished = true;
}
Ok(batch)
}
}
fn valid_metadata(metadata: &StreamMetadata, column_null_mask: &[u8]) -> StreamMetadata {
let column_null_mask = bit_vec::BitVec::from_bytes(column_null_mask);
let schema = Schema::from(
metadata
.schema
.fields
.iter()
.zip(&column_null_mask)
.filter(|(_, mask)| !*mask)
.map(|(field, _)| field.clone())
.collect::<Vec<_>>(),
)
.with_metadata(metadata.schema.metadata.clone());
let ipc_schema = IpcSchema {
fields: metadata
.ipc_schema
.fields
.iter()
.zip(&column_null_mask)
.filter(|(_, mask)| !*mask)
.map(|(ipc_field, _)| ipc_field.clone())
.collect::<Vec<_>>(),
is_little_endian: metadata.ipc_schema.is_little_endian,
};
StreamMetadata {
schema,
version: metadata.version,
ipc_schema,
}
}
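To make the mask convention concrete, a small illustration (not part of this diff): `bit_vec::BitVec::from_bytes` reads bits most-significant-bit first, so a mask byte of `0b0100_0000` flags the second column as null, and the same `zip`/`filter` pattern as above drops it from the field list. The column names are placeholders.

```rust
fn mask_example() {
    let column_null_mask = bit_vec::BitVec::from_bytes(&[0b0100_0000]);
    let fields = ["ts", "memory", "cpu"];
    let kept: Vec<_> = fields
        .iter()
        .zip(&column_null_mask)
        .filter(|(_, is_null)| !*is_null)
        .map(|(field, _)| *field)
        .collect();
    // "memory" was flagged as null, so only the other columns remain.
    assert_eq!(vec!["ts", "cpu"], kept);
}
```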
fn read_next<R: Read>(
reader: &mut R,
metadata: &StreamMetadata,
dictionaries: &mut Dictionaries,
message_buffer: &mut Vec<u8>,
data_buffer: &mut Vec<u8>,
) -> Result<Option<StreamState>> {
// determine metadata length
let mut meta_length: [u8; 4] = [0; 4];
match reader.read_exact(&mut meta_length) {
Ok(()) => (),
Err(e) => {
return if e.kind() == std::io::ErrorKind::UnexpectedEof {
// Handle EOF without the "0xFFFFFFFF 0x00000000"
// valid according to:
// https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
Ok(Some(StreamState::Waiting))
} else {
Err(ArrowError::from(e))
};
}
}
let meta_length = {
// If a continuation marker is encountered, skip over it and read
// the size from the next four bytes.
if meta_length == CONTINUATION_MARKER {
reader.read_exact(&mut meta_length)?;
}
i32::from_le_bytes(meta_length) as usize
};
if meta_length == 0 {
// the stream has ended, mark the reader as finished
return Ok(None);
}
message_buffer.clear();
message_buffer.resize(meta_length, 0);
reader.read_exact(message_buffer)?;
let message = arrow_format::ipc::MessageRef::read_as_root(message_buffer).map_err(|err| {
ArrowError::OutOfSpec(format!("Unable to get root as message: {:?}", err))
})?;
let header = message.header()?.ok_or_else(|| {
ArrowError::OutOfSpec(
"IPC: unable to fetch the message header. The file or stream is corrupted.".to_string(),
)
})?;
match header {
arrow_format::ipc::MessageHeaderRef::Schema(_) => {
Err(ArrowError::OutOfSpec("A stream ".to_string()))
}
arrow_format::ipc::MessageHeaderRef::RecordBatch(batch) => {
// read the block that makes up the record batch into a buffer
data_buffer.clear();
data_buffer.resize(message.body_length()? as usize, 0);
reader.read_exact(data_buffer)?;
let mut reader = std::io::Cursor::new(data_buffer);
read_record_batch(
batch,
&metadata.schema.fields,
&metadata.ipc_schema,
None,
dictionaries,
metadata.version,
&mut reader,
0,
)
.map(|x| Some(StreamState::Some(x)))
}
arrow_format::ipc::MessageHeaderRef::DictionaryBatch(batch) => {
// read the block that makes up the dictionary batch into a buffer
let mut buf = vec![0; message.body_length()? as usize];
reader.read_exact(&mut buf)?;
let mut dict_reader = std::io::Cursor::new(buf);
read_dictionary(
batch,
&metadata.schema.fields,
&metadata.ipc_schema,
dictionaries,
&mut dict_reader,
0,
)?;
// read the next message until we encounter a RecordBatch message
read_next(reader, metadata, dictionaries, message_buffer, data_buffer)
}
t => Err(ArrowError::OutOfSpec(format!(
"Reading types other than record batches not yet supported, unable to read {:?} ",
t
))),
}
}


@@ -0,0 +1,104 @@
//! Background job management.
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use common_runtime::{self, JoinHandle};
use snafu::ResultExt;
use crate::error::{self, Result};
/// Background job context.
#[derive(Clone, Default)]
pub struct Context {
inner: Arc<ContextInner>,
}
impl Context {
fn new() -> Context {
Context::default()
}
/// Marks this context as cancelled.
///
/// Jobs accessing this context should check `is_cancelled()` and exit if it
/// returns true.
pub fn cancel(&self) {
self.inner.cancelled.store(true, Ordering::Relaxed);
}
/// Returns true if this context is cancelled.
pub fn is_cancelled(&self) -> bool {
self.inner.cancelled.load(Ordering::Relaxed)
}
}
#[derive(Default)]
struct ContextInner {
cancelled: AtomicBool,
}
/// Handle to the background job.
pub struct JobHandle {
ctx: Context,
handle: JoinHandle<Result<()>>,
}
impl JobHandle {
/// Waits until this background job is finished.
pub async fn join(self) -> Result<()> {
self.handle.await.context(error::JoinTaskSnafu)?
}
/// Cancels this background job gracefully and waits until it exits.
#[allow(unused)]
pub async fn cancel(self) -> Result<()> {
// Tokio also provides an [`abort()`](https://docs.rs/tokio/latest/tokio/task/struct.JoinHandle.html#method.abort)
// method to abort current task, consider using it if we need to abort a background job.
self.ctx.cancel();
self.join().await
}
}
#[async_trait]
pub trait Job: Send {
async fn run(&mut self, ctx: &Context) -> Result<()>;
}
type BoxedJob = Box<dyn Job>;
/// Thread pool that runs all background jobs.
#[async_trait]
pub trait JobPool: Send + Sync {
/// Submit a job to run in background.
///
/// Returns the [JobHandle] to the job.
async fn submit(&self, job: BoxedJob) -> Result<JobHandle>;
/// Shutdown the manager, pending background jobs may be discarded.
async fn shutdown(&self) -> Result<()>;
}
pub type JobPoolRef = Arc<dyn JobPool>;
pub struct JobPoolImpl {}
#[async_trait]
impl JobPool for JobPoolImpl {
async fn submit(&self, mut job: BoxedJob) -> Result<JobHandle> {
// TODO(yingwen): [flush] Schedule background jobs to background workers, controlling parallelism.
let ctx = Context::new();
let job_ctx = ctx.clone();
let handle = common_runtime::spawn_bg(async move { job.run(&job_ctx).await });
Ok(JobHandle { ctx, handle })
}
async fn shutdown(&self) -> Result<()> {
// TODO(yingwen): [flush] Stop background workers.
unimplemented!()
}
}
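A hedged usage sketch (not part of this diff): `CountdownJob` and `run_countdown` are made-up names used only to show how a job is submitted to the pool and how it cooperates with cancellation via `Context::is_cancelled()`.

```rust
struct CountdownJob {
    remaining: usize,
}

#[async_trait]
impl Job for CountdownJob {
    async fn run(&mut self, ctx: &Context) -> Result<()> {
        while self.remaining > 0 {
            // Cooperative cancellation: bail out once the handle has been cancelled.
            if ctx.is_cancelled() {
                return error::CancelledSnafu {}.fail();
            }
            self.remaining -= 1;
            tokio::task::yield_now().await;
        }
        Ok(())
    }
}

async fn run_countdown(pool: JobPoolRef) -> Result<()> {
    let handle = pool.submit(Box::new(CountdownJob { remaining: 100 })).await?;
    handle.join().await
}
```

Note that `JobPoolImpl` currently spawns every submitted job directly onto the shared background runtime, so the pool itself does not yet limit parallelism (see the TODO above).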


@@ -2,12 +2,14 @@ use async_trait::async_trait;
use store_api::storage::{Chunk, ChunkReader, SchemaRef}; use store_api::storage::{Chunk, ChunkReader, SchemaRef};
use crate::error::{Error, Result}; use crate::error::{Error, Result};
use crate::memtable::BatchIteratorPtr; use crate::memtable::Batch;
type IteratorPtr = Box<dyn Iterator<Item = Result<Batch>> + Send>;
pub struct ChunkReaderImpl { pub struct ChunkReaderImpl {
schema: SchemaRef, schema: SchemaRef,
// Now we only read data from one memtable, so we just hold the memtable iterator here. // Now we only read data from memtables, so we just hold the iterator here.
iter: BatchIteratorPtr, iter: IteratorPtr,
} }
#[async_trait] #[async_trait]
@@ -19,8 +21,8 @@ impl ChunkReader for ChunkReaderImpl {
} }
async fn next_chunk(&mut self) -> Result<Option<Chunk>> { async fn next_chunk(&mut self) -> Result<Option<Chunk>> {
let mut batch = match self.iter.next()? { let mut batch = match self.iter.next() {
Some(b) => b, Some(b) => b?,
None => return Ok(None), None => return Ok(None),
}; };
@@ -35,7 +37,7 @@ impl ChunkReader for ChunkReaderImpl {
} }
impl ChunkReaderImpl { impl ChunkReaderImpl {
pub fn new(schema: SchemaRef, iter: BatchIteratorPtr) -> ChunkReaderImpl { pub fn new(schema: SchemaRef, iter: IteratorPtr) -> ChunkReaderImpl {
ChunkReaderImpl { schema, iter } ChunkReaderImpl { schema, iter }
} }
} }

src/storage/src/codec.rs

@@ -0,0 +1,19 @@
use common_error::prelude::ErrorExt;
pub trait Encoder {
/// The type that is encoded.
type Item;
type Error: ErrorExt;
/// Encodes a message into the bytes buffer.
fn encode(&self, item: &Self::Item, dst: &mut Vec<u8>) -> Result<(), Self::Error>;
}
pub trait Decoder {
/// The type that is decoded.
type Item;
type Error: ErrorExt;
/// Decodes a message from the bytes buffer.
fn decode(&self, src: &[u8]) -> Result<Option<Self::Item>, Self::Error>;
}
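A minimal sketch (not part of this diff) of a codec implementing both traits; `Utf8Codec` is hypothetical and reuses the storage crate's `Error` (which implements `ErrorExt`) so the associated types line up.

```rust
use snafu::ResultExt;

use crate::error::{self, Error};

struct Utf8Codec;

impl Encoder for Utf8Codec {
    type Item = String;
    type Error = Error;

    fn encode(&self, item: &String, dst: &mut Vec<u8>) -> Result<(), Error> {
        // Strings are stored as their raw UTF-8 bytes.
        dst.extend_from_slice(item.as_bytes());
        Ok(())
    }
}

impl Decoder for Utf8Codec {
    type Item = String;
    type Error = Error;

    fn decode(&self, src: &[u8]) -> Result<Option<String>, Error> {
        // Invalid UTF-8 maps onto the crate's existing Utf8 error variant.
        let s = std::str::from_utf8(src).context(error::Utf8Snafu)?;
        Ok(Some(s.to_string()))
    }
}
```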

src/storage/src/config.rs

@@ -0,0 +1,56 @@
//! Engine config
#[derive(Debug, Clone)]
pub struct FileStoreConfig {
/// Storage path
pub store_dir: String,
}
impl Default for FileStoreConfig {
fn default() -> Self {
Self {
store_dir: "/tmp/greptimedb/".to_string(),
}
}
}
#[derive(Debug, Clone)]
pub enum ObjectStoreConfig {
File(FileStoreConfig),
}
impl Default for ObjectStoreConfig {
fn default() -> Self {
ObjectStoreConfig::File(FileStoreConfig::default())
}
}
#[derive(Debug, Clone, Default)]
pub struct EngineConfig {
pub store_config: ObjectStoreConfig,
}
impl EngineConfig {
pub fn with_store_dir(store_dir: &str) -> Self {
Self {
store_config: ObjectStoreConfig::File(FileStoreConfig {
store_dir: store_dir.to_string(),
}),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_default_engine_config() {
let engine_config = EngineConfig::default();
let store_dir = match &engine_config.store_config {
ObjectStoreConfig::File(file) => &file.store_dir,
};
assert_eq!("/tmp/greptimedb/", store_dir);
}
}


@@ -3,28 +3,46 @@ use std::sync::{Arc, RwLock};
use async_trait::async_trait; use async_trait::async_trait;
use common_telemetry::logging::info; use common_telemetry::logging::info;
use object_store::{backend::fs::Backend, util, ObjectStore};
use snafu::ResultExt; use snafu::ResultExt;
use store_api::storage::{EngineContext, RegionDescriptor, StorageEngine}; use store_api::{
logstore::LogStore,
manifest::Manifest,
storage::{EngineContext, RegionDescriptor, StorageEngine},
};
use crate::config::{EngineConfig, ObjectStoreConfig};
use crate::error::{self, Error, Result}; use crate::error::{self, Error, Result};
use crate::manifest::action::*;
use crate::manifest::region::RegionManifest;
use crate::metadata::RegionMetadata;
use crate::region::RegionImpl; use crate::region::RegionImpl;
use crate::sst::FsAccessLayer;
use crate::wal::Wal;
/// [StorageEngine] implementation. /// [StorageEngine] implementation.
#[derive(Clone)] pub struct EngineImpl<S: LogStore> {
pub struct EngineImpl { inner: Arc<EngineInner<S>>,
inner: Arc<EngineInner>, }
impl<S: LogStore> Clone for EngineImpl<S> {
fn clone(&self) -> Self {
Self {
inner: self.inner.clone(),
}
}
} }
#[async_trait] #[async_trait]
impl StorageEngine for EngineImpl { impl<S: LogStore> StorageEngine for EngineImpl<S> {
type Error = Error; type Error = Error;
type Region = RegionImpl; type Region = RegionImpl<S>;
async fn open_region(&self, _ctx: &EngineContext, _name: &str) -> Result<RegionImpl> { async fn open_region(&self, _ctx: &EngineContext, _name: &str) -> Result<Self::Region> {
unimplemented!() unimplemented!()
} }
async fn close_region(&self, _ctx: &EngineContext, _region: RegionImpl) -> Result<()> { async fn close_region(&self, _ctx: &EngineContext, _region: Self::Region) -> Result<()> {
unimplemented!() unimplemented!()
} }
@@ -32,42 +50,85 @@ impl StorageEngine for EngineImpl {
&self, &self,
_ctx: &EngineContext, _ctx: &EngineContext,
descriptor: RegionDescriptor, descriptor: RegionDescriptor,
) -> Result<RegionImpl> { ) -> Result<Self::Region> {
self.inner.create_region(descriptor).await self.inner.create_region(descriptor).await
} }
async fn drop_region(&self, _ctx: &EngineContext, _region: RegionImpl) -> Result<()> { async fn drop_region(&self, _ctx: &EngineContext, _region: Self::Region) -> Result<()> {
unimplemented!() unimplemented!()
} }
fn get_region(&self, _ctx: &EngineContext, name: &str) -> Result<Option<RegionImpl>> { fn get_region(&self, _ctx: &EngineContext, name: &str) -> Result<Option<Self::Region>> {
Ok(self.inner.get_region(name)) Ok(self.inner.get_region(name))
} }
} }
impl EngineImpl { impl<S: LogStore> EngineImpl<S> {
pub fn new() -> EngineImpl { pub async fn new(config: EngineConfig, log_store: Arc<S>) -> Result<Self> {
EngineImpl { Ok(Self {
inner: Arc::new(EngineInner::default()), inner: Arc::new(EngineInner::new(config, log_store).await?),
} })
} }
} }
impl Default for EngineImpl { /// Engine shared data
fn default() -> Self { /// TODO(dennis): merge to EngineInner?
Self::new() #[derive(Clone, Debug)]
struct SharedData {
pub _config: EngineConfig,
pub object_store: ObjectStore,
}
impl SharedData {
async fn new(config: EngineConfig) -> Result<Self> {
// TODO(dennis): support other backends
let store_dir = util::normalize_dir(match &config.store_config {
ObjectStoreConfig::File(file) => &file.store_dir,
});
let accessor = Backend::build()
.root(&store_dir)
.finish()
.await
.context(error::InitBackendSnafu { dir: &store_dir })?;
let object_store = ObjectStore::new(accessor);
Ok(Self {
_config: config,
object_store,
})
}
#[inline]
fn region_sst_dir(&self, region_name: &str) -> String {
format!("{}/", region_name)
}
#[inline]
fn region_manifest_dir(&self, region_name: &str) -> String {
format!("{}/manifest/", region_name)
} }
} }
type RegionMap = HashMap<String, RegionImpl>; type RegionMap<S> = HashMap<String, RegionImpl<S>>;
#[derive(Default)] struct EngineInner<S: LogStore> {
struct EngineInner { log_store: Arc<S>,
regions: RwLock<RegionMap>, regions: RwLock<RegionMap<S>>,
shared: SharedData,
} }
impl EngineInner { impl<S: LogStore> EngineInner<S> {
async fn create_region(&self, descriptor: RegionDescriptor) -> Result<RegionImpl> { pub async fn new(config: EngineConfig, log_store: Arc<S>) -> Result<Self> {
Ok(Self {
log_store,
regions: RwLock::new(Default::default()),
shared: SharedData::new(config).await?,
})
}
async fn create_region(&self, descriptor: RegionDescriptor) -> Result<RegionImpl<S>> {
{ {
let regions = self.regions.read().unwrap(); let regions = self.regions.read().unwrap();
if let Some(region) = regions.get(&descriptor.name) { if let Some(region) = regions.get(&descriptor.name) {
@@ -75,13 +136,38 @@ impl EngineInner {
} }
} }
let region_id = descriptor.id;
let region_name = descriptor.name.clone(); let region_name = descriptor.name.clone();
let metadata = descriptor let metadata: RegionMetadata =
.try_into() descriptor
.context(error::InvalidRegionDescSnafu { .try_into()
region: &region_name, .context(error::InvalidRegionDescSnafu {
})?; region: &region_name,
let region = RegionImpl::new(region_name.clone(), metadata); })?;
let wal = Wal::new(region_id, region_name.clone(), self.log_store.clone());
let sst_dir = &self.shared.region_sst_dir(&region_name);
let sst_layer = Arc::new(FsAccessLayer::new(
sst_dir,
self.shared.object_store.clone(),
));
let manifest_dir = self.shared.region_manifest_dir(&region_name);
let manifest =
RegionManifest::new(region_id, &manifest_dir, self.shared.object_store.clone());
let region = RegionImpl::new(
region_id,
region_name.clone(),
metadata.clone(),
wal,
sst_layer,
manifest.clone(),
);
// Persist region metadata
manifest
.update(RegionMetaAction::Change(RegionChange {
metadata: Arc::new(metadata),
}))
.await?;
{ {
let mut regions = self.regions.write().unwrap(); let mut regions = self.regions.write().unwrap();
@@ -91,7 +177,6 @@ impl EngineInner {
regions.insert(region_name.clone(), region.clone()); regions.insert(region_name.clone(), region.clone());
} }
// TODO(yingwen): Persist region metadata to log.
// TODO(yingwen): Impl Debug format for region and print region info briefly in log. // TODO(yingwen): Impl Debug format for region and print region info briefly in log.
info!("Storage engine create region {}", region_name); info!("Storage engine create region {}", region_name);
@@ -99,7 +184,7 @@ impl EngineInner {
Ok(region) Ok(region)
} }
fn get_region(&self, name: &str) -> Option<RegionImpl> { fn get_region(&self, name: &str) -> Option<RegionImpl<S>> {
self.regions.read().unwrap().get(name).cloned() self.regions.read().unwrap().get(name).cloned()
} }
} }
@@ -107,14 +192,22 @@ impl EngineInner {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use datatypes::type_id::LogicalTypeId; use datatypes::type_id::LogicalTypeId;
use log_store::test_util::log_store_util;
use store_api::storage::Region; use store_api::storage::Region;
use tempdir::TempDir;
use super::*; use super::*;
use crate::test_util::descriptor_util::RegionDescBuilder; use crate::test_util::descriptor_util::RegionDescBuilder;
#[tokio::test] #[tokio::test]
async fn test_create_new_region() { async fn test_create_new_region() {
let engine = EngineImpl::new(); let (log_store, _tmp) =
log_store_util::create_tmp_local_file_log_store("test_engine_wal").await;
let dir = TempDir::new("test_create_new_region").unwrap();
let store_dir = dir.path().to_string_lossy();
let config = EngineConfig::with_store_dir(&store_dir);
let engine = EngineImpl::new(config, Arc::new(log_store)).await.unwrap();
let region_name = "region-0"; let region_name = "region-0";
let desc = RegionDescBuilder::new(region_name) let desc = RegionDescBuilder::new(region_name)


@@ -1,6 +1,11 @@
use std::any::Any; use std::any::Any;
use std::io::Error as IoError;
use std::str::Utf8Error;
use common_error::prelude::*; use common_error::prelude::*;
use datatypes::arrow;
use serde_json::error::Error as JsonError;
use store_api::manifest::ManifestVersion;
use crate::metadata::Error as MetadataError; use crate::metadata::Error as MetadataError;
@@ -25,6 +30,118 @@ pub enum Error {
column: String, column: String,
backtrace: Backtrace, backtrace: Backtrace,
}, },
#[snafu(display("Missing timestamp in write batch"))]
BatchMissingTimestamp { backtrace: Backtrace },
#[snafu(display("Failed to write columns, source: {}", source))]
FlushIo {
source: std::io::Error,
backtrace: Backtrace,
},
#[snafu(display("Failed to init backend, source: {}", source))]
InitBackend {
dir: String,
source: std::io::Error,
backtrace: Backtrace,
},
#[snafu(display("Failed to write parquet file, source: {}", source))]
WriteParquet {
source: arrow::error::ArrowError,
backtrace: Backtrace,
},
#[snafu(display("Fail to read object from path: {}, source: {}", path, source))]
ReadObject {
path: String,
backtrace: Backtrace,
source: IoError,
},
#[snafu(display("Fail to write object into path: {}, source: {}", path, source))]
WriteObject {
path: String,
backtrace: Backtrace,
source: IoError,
},
#[snafu(display("Fail to delete object from path: {}, source: {}", path, source))]
DeleteObject {
path: String,
backtrace: Backtrace,
source: IoError,
},
#[snafu(display("Fail to list objects in path: {}, source: {}", path, source))]
ListObjects {
path: String,
backtrace: Backtrace,
source: IoError,
},
#[snafu(display("Fail to create str from bytes, source: {}", source))]
Utf8 {
backtrace: Backtrace,
source: Utf8Error,
},
#[snafu(display("Fail to encode object into json , source: {}", source))]
EncodeJson {
backtrace: Backtrace,
source: JsonError,
},
#[snafu(display("Fail to decode object from json , source: {}", source))]
DecodeJson {
backtrace: Backtrace,
source: JsonError,
},
#[snafu(display("Invalid scan index, start: {}, end: {}", start, end))]
InvalidScanIndex {
start: ManifestVersion,
end: ManifestVersion,
backtrace: Backtrace,
},
#[snafu(display(
"Failed to write WAL, region id: {}, WAL name: {}, source: {}",
region_id,
name,
source
))]
WriteWal {
region_id: u32,
name: String,
#[snafu(backtrace)]
source: BoxedError,
},
#[snafu(display("Failed to encode WAL header, source {}", source))]
EncodeWalHeader {
backtrace: Backtrace,
source: std::io::Error,
},
#[snafu(display("Failed to decode WAL header, source {}", source))]
DecodeWalHeader {
backtrace: Backtrace,
source: std::io::Error,
},
#[snafu(display("Failed to join task, source: {}", source))]
JoinTask {
source: common_runtime::JoinError,
backtrace: Backtrace,
},
#[snafu(display("Invalid timestamp in write batch, source: {}", source))]
InvalidTimestamp { source: crate::write_batch::Error },
#[snafu(display("Task already cancelled"))]
Cancelled { backtrace: Backtrace },
} }
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
@@ -34,9 +151,29 @@ impl ErrorExt for Error {
use Error::*; use Error::*;
match self { match self {
InvalidRegionDesc { .. } | InvalidInputSchema { .. } | BatchMissingColumn { .. } => { InvalidScanIndex { .. }
StatusCode::InvalidArguments | InvalidRegionDesc { .. }
} | InvalidInputSchema { .. }
| BatchMissingColumn { .. }
| BatchMissingTimestamp { .. }
| InvalidTimestamp { .. } => StatusCode::InvalidArguments,
Utf8 { .. }
| EncodeJson { .. }
| DecodeJson { .. }
| JoinTask { .. }
| Cancelled { .. } => StatusCode::Unexpected,
FlushIo { .. }
| InitBackend { .. }
| WriteParquet { .. }
| ReadObject { .. }
| WriteObject { .. }
| ListObjects { .. }
| DeleteObject { .. }
| WriteWal { .. }
| DecodeWalHeader { .. }
| EncodeWalHeader { .. } => StatusCode::StorageUnavailable,
} }
} }
@@ -51,6 +188,9 @@ impl ErrorExt for Error {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use common_error::prelude::StatusCode::*;
use datatypes::arrow::error::ArrowError;
use snafu::GenerateImplicitData; use snafu::GenerateImplicitData;
use super::*; use super::*;
@@ -72,4 +212,32 @@ mod tests {
assert_eq!(StatusCode::InvalidArguments, err.status_code()); assert_eq!(StatusCode::InvalidArguments, err.status_code());
assert!(err.backtrace_opt().is_some()); assert!(err.backtrace_opt().is_some());
} }
#[test]
pub fn test_flush_error() {
fn throw_io_error() -> std::result::Result<(), std::io::Error> {
Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
"writer is closed",
))
}
let error = throw_io_error().context(FlushIoSnafu).err().unwrap();
assert_eq!(StatusCode::StorageUnavailable, error.status_code());
assert!(error.backtrace_opt().is_some());
}
#[test]
pub fn test_arrow_error() {
fn throw_arrow_error() -> std::result::Result<(), ArrowError> {
Err(ArrowError::ExternalFormat("Lorem ipsum".to_string()))
}
let error = throw_arrow_error()
.context(WriteParquetSnafu)
.err()
.unwrap();
assert_eq!(StorageUnavailable, error.status_code());
assert!(error.backtrace_opt().is_some());
}
} }

src/storage/src/flush.rs

@@ -0,0 +1,264 @@
use std::sync::Arc;
use async_trait::async_trait;
use common_telemetry::logging;
use common_time::RangeMillis;
use store_api::logstore::LogStore;
use store_api::manifest::Manifest;
use store_api::manifest::ManifestVersion;
use store_api::storage::SequenceNumber;
use uuid::Uuid;
use crate::background::{Context, Job, JobHandle, JobPoolRef};
use crate::error::{CancelledSnafu, Result};
use crate::manifest::action::*;
use crate::manifest::region::RegionManifest;
use crate::memtable::{IterContext, MemtableId, MemtableRef};
use crate::region::RegionWriterRef;
use crate::region::SharedDataRef;
use crate::sst::{AccessLayerRef, FileMeta, WriteOptions};
use crate::version::VersionEdit;
use crate::wal::Wal;
/// Default write buffer size (32M).
const DEFAULT_WRITE_BUFFER_SIZE: usize = 32 * 1024 * 1024;
pub trait FlushStrategy: Send + Sync {
fn should_flush(
&self,
shared: &SharedDataRef,
bytes_mutable: usize,
bytes_total: usize,
) -> bool;
}
pub type FlushStrategyRef = Arc<dyn FlushStrategy>;
#[derive(Debug)]
pub struct SizeBasedStrategy {
/// Write buffer size of memtable.
max_write_buffer_size: usize,
/// Mutable memtable memory size limitation
mutable_limitation: usize,
}
#[inline]
fn get_mutable_limitation(max_write_buffer_size: usize) -> usize {
// Inspired by RocksDB
// https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L86
max_write_buffer_size * 7 / 8
}
impl Default for SizeBasedStrategy {
fn default() -> Self {
let max_write_buffer_size = DEFAULT_WRITE_BUFFER_SIZE;
Self {
max_write_buffer_size,
mutable_limitation: get_mutable_limitation(max_write_buffer_size),
}
}
}
impl FlushStrategy for SizeBasedStrategy {
fn should_flush(
&self,
shared: &SharedDataRef,
bytes_mutable: usize,
bytes_total: usize,
) -> bool {
// Inspired by RocksDB's flush strategy
// https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94
if bytes_mutable > self.mutable_limitation {
logging::info!(
"Region should flush, region: {}, bytes_mutable: {}, mutable_limitation: {}, \
bytes_total: {}, max_write_buffer_size: {} .",
shared.name,
bytes_mutable,
self.mutable_limitation,
bytes_total,
self.max_write_buffer_size
);
return true;
}
let buffer_size = self.max_write_buffer_size;
// If total memory exceeds the buffer size, we trigger a more aggressive
// flush. But if more than half of the memory is already being flushed,
// triggering more flushes may not help, so we hold off instead.
let should_flush = bytes_total >= buffer_size && bytes_mutable >= buffer_size / 2;
if should_flush {
logging::info!(
"Region should flush, region: {}, bytes_mutable: {}, mutable_limitation: {}, \
bytes_total: {}, max_write_buffer_size: {} .",
shared.name,
bytes_mutable,
self.mutable_limitation,
bytes_total,
buffer_size
);
}
should_flush
}
}
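A worked example (illustration only) of the two thresholds for the default 32 MiB write buffer:

```rust
#[test]
fn test_default_thresholds() {
    let max_write_buffer_size = 32 * 1024 * 1024;
    // 7/8 of the buffer: once mutable memtables exceed this, a flush is forced.
    assert_eq!(28 * 1024 * 1024, get_mutable_limitation(max_write_buffer_size));
    // Otherwise a flush triggers only when total memtable memory reaches the full
    // buffer size while at least half of it (16 MiB here) is still mutable.
}
```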
#[derive(Debug)]
pub struct MemtableWithMeta {
pub memtable: MemtableRef,
pub bucket: RangeMillis,
}
#[async_trait]
pub trait FlushScheduler: Send + Sync {
async fn schedule_flush(&self, flush_job: Box<dyn Job>) -> Result<JobHandle>;
}
pub struct FlushSchedulerImpl {
job_pool: JobPoolRef,
}
impl FlushSchedulerImpl {
pub fn new(job_pool: JobPoolRef) -> FlushSchedulerImpl {
FlushSchedulerImpl { job_pool }
}
}
#[async_trait]
impl FlushScheduler for FlushSchedulerImpl {
async fn schedule_flush(&self, flush_job: Box<dyn Job>) -> Result<JobHandle> {
// TODO(yingwen): [flush] Implements flush schedule strategy, controls max background flushes.
self.job_pool.submit(flush_job).await
}
}
pub type FlushSchedulerRef = Arc<dyn FlushScheduler>;
pub struct FlushJob<S: LogStore> {
/// Max memtable id in these memtables,
/// used to remove immutable memtables in current version.
pub max_memtable_id: MemtableId,
/// Memtables to be flushed.
pub memtables: Vec<MemtableWithMeta>,
/// Last sequence of data to be flushed.
pub flush_sequence: SequenceNumber,
/// Shared data of region to be flushed.
pub shared: SharedDataRef,
/// Sst access layer of the region.
pub sst_layer: AccessLayerRef,
/// Region writer, used to persist log entry that points to the latest manifest file.
pub writer: RegionWriterRef,
/// Region write-ahead logging, used to write data/meta to the log file.
pub wal: Wal<S>,
/// Region manifest service, used to persist metadata.
pub manifest: RegionManifest,
}
impl<S: LogStore> FlushJob<S> {
async fn write_memtables_to_layer(&self, ctx: &Context) -> Result<Vec<FileMeta>> {
if ctx.is_cancelled() {
return CancelledSnafu {}.fail();
}
let mut futures = Vec::with_capacity(self.memtables.len());
for m in &self.memtables {
let file_name = Self::generate_sst_file_name();
// TODO(hl): Check if random file name already exists in meta.
let iter_ctx = IterContext {
for_flush: true,
..Default::default()
};
let iter = m.memtable.iter(iter_ctx)?;
futures.push(async move {
self.sst_layer
.write_sst(&file_name, iter, WriteOptions::default())
.await
});
}
let metas = futures_util::future::join_all(futures)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?
.into_iter()
.map(|f| FileMeta {
file_path: f,
level: 0,
})
.collect();
logging::info!("Successfully flush memtables to files: {:?}", metas);
Ok(metas)
}
async fn write_to_manifest(&self, file_metas: &[FileMeta]) -> Result<ManifestVersion> {
let edit = RegionEdit {
region_id: self.shared.id,
region_version: self.shared.version_control.metadata().version,
flush_sequence: self.flush_sequence,
files_to_add: file_metas.to_vec(),
files_to_remove: Vec::default(),
};
logging::debug!("Write region edit: {:?} to manifest.", edit);
self.manifest.update(RegionMetaAction::Edit(edit)).await
}
/// Generates random SST file name in format: `^[a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}.parquet$`
fn generate_sst_file_name() -> String {
format!("{}.parquet", Uuid::new_v4().hyphenated())
}
}
#[async_trait]
impl<S: LogStore> Job for FlushJob<S> {
// TODO(yingwen): [flush] Support in-job parallelism (Flush memtables concurrently)
async fn run(&mut self, ctx: &Context) -> Result<()> {
let file_metas = self.write_memtables_to_layer(ctx).await?;
let manifest_version = self.write_to_manifest(&file_metas).await?;
let edit = VersionEdit {
files_to_add: file_metas,
flushed_sequence: Some(self.flush_sequence),
manifest_version,
max_memtable_id: Some(self.max_memtable_id),
};
self.writer
.apply_version_edit(&self.wal, edit, &self.shared)
.await?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use log_store::fs::noop::NoopLogStore;
use regex::Regex;
use super::*;
#[test]
fn test_get_mutable_limitation() {
assert_eq!(7, get_mutable_limitation(8));
assert_eq!(8, get_mutable_limitation(10));
assert_eq!(56, get_mutable_limitation(64));
}
#[test]
pub fn test_uuid_generate() {
let file_name = FlushJob::<NoopLogStore>::generate_sst_file_name();
let regex = Regex::new(r"^[a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}.parquet$").unwrap();
assert!(
regex.is_match(&file_name),
"illegal sst file name: {}",
file_name
);
}
}


@@ -1,17 +1,24 @@
//! Storage engine implementation. //! Storage engine implementation.
mod arrow_stream;
mod background;
mod chunk; mod chunk;
mod codec;
pub mod config;
mod engine; mod engine;
mod error; pub mod error;
mod flush;
pub mod manifest;
pub mod memtable; pub mod memtable;
pub mod metadata; pub mod metadata;
mod proto;
mod region; mod region;
mod snapshot; mod snapshot;
mod sst;
pub mod sync; pub mod sync;
mod version;
mod write_batch;
#[cfg(test)] #[cfg(test)]
mod test_util; mod test_util;
mod version;
mod wal;
mod write_batch;
pub use engine::EngineImpl; pub use engine::EngineImpl;


@@ -0,0 +1,5 @@
//! manifest storage
pub(crate) mod action;
pub(crate) mod checkpoint;
pub mod region;
pub(crate) mod storage;


@@ -0,0 +1,67 @@
use serde::{Deserialize, Serialize};
use serde_json as json;
use snafu::ResultExt;
use store_api::manifest::MetaAction;
use store_api::manifest::Metadata;
use store_api::storage::RegionId;
use store_api::storage::SequenceNumber;
use crate::error::{DecodeJsonSnafu, EncodeJsonSnafu, Result, Utf8Snafu};
use crate::metadata::{RegionMetadataRef, VersionNumber};
use crate::sst::FileMeta;
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct RegionChange {
pub metadata: RegionMetadataRef,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct RegionRemove {
pub region_id: RegionId,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct RegionEdit {
pub region_id: RegionId,
pub region_version: VersionNumber,
pub flush_sequence: SequenceNumber,
pub files_to_add: Vec<FileMeta>,
pub files_to_remove: Vec<FileMeta>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct RegionManifestData {
pub region_meta: RegionMetadataRef,
// TODO(dennis): version metadata
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub enum RegionMetaAction {
Change(RegionChange),
Remove(RegionRemove),
Edit(RegionEdit),
}
impl RegionMetaAction {
pub(crate) fn encode(&self) -> Result<Vec<u8>> {
Ok(json::to_string(self).context(EncodeJsonSnafu)?.into_bytes())
}
pub(crate) fn decode(bs: &[u8]) -> Result<Self> {
json::from_str(std::str::from_utf8(bs).context(Utf8Snafu)?).context(DecodeJsonSnafu)
}
}
impl Metadata for RegionManifestData {}
impl MetaAction for RegionMetaAction {
type MetadataId = RegionId;
fn metadata_id(&self) -> RegionId {
match self {
RegionMetaAction::Change(c) => c.metadata.id,
RegionMetaAction::Remove(r) => r.region_id,
RegionMetaAction::Edit(e) => e.region_id,
}
}
}
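A round-trip sketch (not part of this diff) of the JSON encoding; the field values are placeholders for illustration only.

```rust
fn round_trip_edit() -> Result<()> {
    let action = RegionMetaAction::Edit(RegionEdit {
        region_id: 0,
        region_version: 0,
        flush_sequence: 42,
        files_to_add: vec![],
        files_to_remove: vec![],
    });
    // `encode` serializes the action as a JSON string in UTF-8 bytes.
    let bytes = action.encode()?;
    let decoded = RegionMetaAction::decode(&bytes)?;
    // The decoded edit still identifies the region it belongs to.
    assert_eq!(action.metadata_id(), decoded.metadata_id());
    Ok(())
}
```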


@@ -0,0 +1 @@


@@ -0,0 +1,205 @@
//! Region manifest impl
use std::sync::{
atomic::{AtomicU64, Ordering},
Arc,
};
use async_trait::async_trait;
use common_telemetry::logging;
use object_store::ObjectStore;
use store_api::manifest::*;
use store_api::storage::RegionId;
use crate::error::{Error, Result};
use crate::manifest::action::*;
use crate::manifest::storage::ManifestObjectStore;
use crate::manifest::storage::ObjectStoreLogIterator;
#[derive(Clone)]
pub struct RegionManifest {
inner: Arc<RegionManifestInner>,
}
#[async_trait]
impl Manifest for RegionManifest {
type Error = Error;
type MetaAction = RegionMetaAction;
type MetadataId = RegionId;
type Metadata = RegionManifestData;
fn new(id: Self::MetadataId, manifest_dir: &str, object_store: ObjectStore) -> Self {
RegionManifest {
inner: Arc::new(RegionManifestInner::new(id, manifest_dir, object_store)),
}
}
async fn update(&self, action: RegionMetaAction) -> Result<ManifestVersion> {
self.inner.save(&action).await
}
async fn load(&self) -> Result<Option<RegionManifestData>> {
let last_version = self.inner.last_version();
let start_bound = if last_version == MIN_VERSION {
// No actions have ever been saved
MIN_VERSION
} else {
last_version - 1
};
let mut iter = self.inner.scan(start_bound, MAX_VERSION).await?;
match iter.next_action().await? {
Some((_v, RegionMetaAction::Change(c))) => Ok(Some(RegionManifestData {
region_meta: c.metadata,
})),
Some(_) => todo!(),
None => Ok(None),
}
}
async fn checkpoint(&self) -> Result<ManifestVersion> {
unimplemented!();
}
fn metadata_id(&self) -> RegionId {
self.inner.region_id
}
}
struct RegionManifestInner {
region_id: RegionId,
store: Arc<ManifestObjectStore>,
version: AtomicU64,
}
struct RegionMetaActionIterator {
log_iter: ObjectStoreLogIterator,
}
impl RegionMetaActionIterator {
async fn next_action(&mut self) -> Result<Option<(ManifestVersion, RegionMetaAction)>> {
match self.log_iter.next_log().await? {
Some((v, bytes)) => {
let action: RegionMetaAction = RegionMetaAction::decode(&bytes)?;
Ok(Some((v, action)))
}
None => Ok(None),
}
}
}
impl RegionManifestInner {
fn new(region_id: RegionId, manifest_dir: &str, object_store: ObjectStore) -> Self {
Self {
region_id,
store: Arc::new(ManifestObjectStore::new(manifest_dir, object_store)),
// TODO(dennis): recover the last version from history
version: AtomicU64::new(0),
}
}
#[inline]
fn inc_version(&self) -> ManifestVersion {
self.version.fetch_add(1, Ordering::Relaxed)
}
#[inline]
fn last_version(&self) -> ManifestVersion {
self.version.load(Ordering::Relaxed)
}
async fn save(&self, action: &RegionMetaAction) -> Result<ManifestVersion> {
let version = self.inc_version();
logging::debug!(
"Save region metadata action: {:?}, version: {}",
action,
version
);
self.store.save(version, &action.encode()?).await?;
Ok(version)
}
async fn scan(
&self,
start: ManifestVersion,
end: ManifestVersion,
) -> Result<RegionMetaActionIterator> {
Ok(RegionMetaActionIterator {
log_iter: self.store.scan(start, end).await?,
})
}
}
#[cfg(test)]
mod tests {
use datatypes::type_id::LogicalTypeId;
use object_store::{backend::fs, ObjectStore};
use tempdir::TempDir;
use super::*;
use crate::metadata::RegionMetadata;
use crate::test_util::descriptor_util::RegionDescBuilder;
#[tokio::test]
async fn test_region_manifest() {
common_telemetry::init_default_ut_logging();
let tmp_dir = TempDir::new("test_region_manifest").unwrap();
let object_store = ObjectStore::new(
fs::Backend::build()
.root(&tmp_dir.path().to_string_lossy())
.finish()
.await
.unwrap(),
);
let region_id = 0;
let manifest = RegionManifest::new(region_id, "/manifest/", object_store);
assert_eq!(region_id, manifest.metadata_id());
let region_name = "region-0";
let desc = RegionDescBuilder::new(region_name)
.id(region_id)
.push_key_column(("k1", LogicalTypeId::Int32, false))
.push_value_column(("v1", LogicalTypeId::Float32, true))
.build();
let metadata: RegionMetadata = desc.try_into().unwrap();
let region_meta = Arc::new(metadata);
assert!(manifest.load().await.unwrap().is_none());
manifest
.update(RegionMetaAction::Change(RegionChange {
metadata: region_meta.clone(),
}))
.await
.unwrap();
let manifest_data = manifest.load().await.unwrap().unwrap();
assert_eq!(manifest_data.region_meta, region_meta);
// save another metadata
let region_name = "region-0";
let desc = RegionDescBuilder::new(region_name)
.id(region_id)
.push_key_column(("k1", LogicalTypeId::Int32, false))
.push_key_column(("k2", LogicalTypeId::Int64, false))
.push_value_column(("v1", LogicalTypeId::Float32, true))
.push_value_column(("bool", LogicalTypeId::Boolean, true))
.build();
let metadata: RegionMetadata = desc.try_into().unwrap();
let region_meta = Arc::new(metadata);
manifest
.update(RegionMetaAction::Change(RegionChange {
metadata: region_meta.clone(),
}))
.await
.unwrap();
let manifest_data = manifest.load().await.unwrap().unwrap();
assert_eq!(manifest_data.region_meta, region_meta);
}
}
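A note on the version bookkeeping above: `inc_version` uses `fetch_add`, which returns the counter value before the increment, so `save` writes an action at the old value and `last_version` ends up pointing one past the latest saved action. That is why `load` starts scanning from `last_version - 1`. A minimal standalone sketch of the same counter pattern (the type and function names here are illustrative, not part of the crate):

use std::sync::atomic::{AtomicU64, Ordering};

// Illustrative stand-in for the counter logic in RegionManifestInner.
struct VersionCounter {
    version: AtomicU64,
}

impl VersionCounter {
    fn new() -> Self {
        Self { version: AtomicU64::new(0) }
    }

    // Mirrors `inc_version`: returns the version to write at, i.e. the value before the increment.
    fn inc(&self) -> u64 {
        self.version.fetch_add(1, Ordering::Relaxed)
    }

    // Mirrors `last_version`: one past the last written version.
    fn last(&self) -> u64 {
        self.version.load(Ordering::Relaxed)
    }
}

fn main() {
    let counter = VersionCounter::new();
    assert_eq!(0, counter.inc()); // first action is written at version 0
    assert_eq!(1, counter.last()); // counter now points past it
    assert_eq!(1, counter.inc()); // second action is written at version 1
    // The latest saved action lives at `last() - 1`, the start bound used by `load`.
    assert_eq!(1, counter.last() - 1);
}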

View File

@@ -0,0 +1,330 @@
use std::collections::HashMap;
use std::iter::Iterator;
use async_trait::async_trait;
use common_telemetry::logging;
use futures::TryStreamExt;
use lazy_static::lazy_static;
use object_store::{util, DirEntry, ObjectStore};
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use store_api::manifest::{LogIterator, ManifestLogStorage, ManifestVersion};
use crate::error::{
DecodeJsonSnafu, DeleteObjectSnafu, EncodeJsonSnafu, Error, InvalidScanIndexSnafu,
ListObjectsSnafu, ReadObjectSnafu, Result, Utf8Snafu, WriteObjectSnafu,
};
lazy_static! {
static ref RE: Regex = Regex::new("^\\d+\\.json$").unwrap();
}
const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
#[inline]
pub fn delta_file(version: ManifestVersion) -> String {
format!("{:020}.json", version)
}
#[inline]
pub fn checkpoint_file(version: ManifestVersion) -> String {
format!("{:020}.checkpoint", version)
}
/// Returns the delta file version from the path.
///
/// # Panics
/// Panics if the file path is not a valid delta file.
#[inline]
pub fn delta_version(path: &str) -> ManifestVersion {
let s = path.split('.').next().unwrap();
s.parse()
.unwrap_or_else(|_| panic!("Invalid delta file: {}", path))
}
#[inline]
pub fn is_delta_file(file_name: &str) -> bool {
RE.is_match(file_name)
}
pub struct ObjectStoreLogIterator {
iter: Box<dyn Iterator<Item = (ManifestVersion, DirEntry)> + Send + Sync>,
}
#[async_trait]
impl LogIterator for ObjectStoreLogIterator {
type Error = Error;
async fn next_log(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
match self.iter.next() {
Some((v, e)) => {
let object = e.into_object();
let bytes = object.read().await.context(ReadObjectSnafu {
path: object.path(),
})?;
Ok(Some((v, bytes)))
}
None => Ok(None),
}
}
}
#[derive(Clone, Debug)]
pub struct ManifestObjectStore {
object_store: ObjectStore,
path: String,
}
impl ManifestObjectStore {
pub fn new(path: &str, object_store: ObjectStore) -> Self {
Self {
object_store,
path: util::normalize_dir(path),
}
}
fn delta_file_path(&self, version: ManifestVersion) -> String {
format!("{}{}", self.path, delta_file(version))
}
fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
format!("{}{}", self.path, checkpoint_file(version))
}
}
#[derive(Serialize, Deserialize, Debug)]
struct CheckpointMetadata {
pub size: usize,
pub version: ManifestVersion,
pub checksum: Option<String>,
pub extend_metadata: Option<HashMap<String, String>>,
}
impl CheckpointMetadata {
fn encode(&self) -> Result<impl AsRef<[u8]>> {
serde_json::to_string(self).context(EncodeJsonSnafu)
}
fn decode(bs: &[u8]) -> Result<Self> {
let data = std::str::from_utf8(bs).context(Utf8Snafu)?;
serde_json::from_str(data).context(DecodeJsonSnafu)
}
}
#[async_trait]
impl ManifestLogStorage for ManifestObjectStore {
type Error = Error;
type Iter = ObjectStoreLogIterator;
async fn scan(
&self,
start: ManifestVersion,
end: ManifestVersion,
) -> Result<ObjectStoreLogIterator> {
ensure!(start <= end, InvalidScanIndexSnafu { start, end });
let dir = self.object_store.object(&self.path);
let dir_exists = dir
.is_exist()
.await
.context(ReadObjectSnafu { path: &self.path })?;
if !dir_exists {
return Ok(ObjectStoreLogIterator {
iter: Box::new(Vec::default().into_iter()),
});
}
let streamer = dir
.list()
.await
.context(ListObjectsSnafu { path: &self.path })?;
let mut entries: Vec<(ManifestVersion, DirEntry)> = streamer
.try_filter_map(|e| async move {
let file_name = e.name();
if is_delta_file(file_name) {
let version = delta_version(file_name);
if version >= start && version < end {
Ok(Some((version, e)))
} else {
Ok(None)
}
} else {
Ok(None)
}
})
.try_collect::<Vec<_>>()
.await
.context(ListObjectsSnafu { path: &self.path })?;
entries.sort_unstable_by(|(v1, _), (v2, _)| v1.cmp(v2));
Ok(ObjectStoreLogIterator {
iter: Box::new(entries.into_iter()),
})
}
async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
let object = self.object_store.object(&self.delta_file_path(version));
object.write(bytes).await.context(WriteObjectSnafu {
path: object.path(),
})?;
Ok(())
}
async fn delete(&self, start: ManifestVersion, end: ManifestVersion) -> Result<()> {
// TODO(dennis): delete in batch or concurrently?
for v in start..end {
let object = self.object_store.object(&self.delta_file_path(v));
object.delete().await.context(DeleteObjectSnafu {
path: object.path(),
})?;
}
Ok(())
}
async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
let object = self
.object_store
.object(&self.checkpoint_file_path(version));
object.write(bytes).await.context(WriteObjectSnafu {
path: object.path(),
})?;
let last_checkpoint = self
.object_store
.object(&format!("{}{}", self.path, LAST_CHECKPOINT_FILE));
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: None,
extend_metadata: None,
};
logging::debug!(
"Save checkpoint in path: {}, metadata: {:?}",
last_checkpoint.path(),
checkpoint_metadata
);
let bs = checkpoint_metadata.encode()?;
last_checkpoint.write(bs).await.context(WriteObjectSnafu {
path: last_checkpoint.path(),
})?;
Ok(())
}
async fn load_checkpoint(&self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let last_checkpoint = self
.object_store
.object(&format!("{}{}", self.path, LAST_CHECKPOINT_FILE));
let checkpoint_exists = last_checkpoint.is_exist().await.context(ReadObjectSnafu {
path: last_checkpoint.path(),
})?;
if checkpoint_exists {
let bytes = last_checkpoint.read().await.context(ReadObjectSnafu {
path: last_checkpoint.path(),
})?;
let checkpoint_metadata = CheckpointMetadata::decode(&bytes)?;
logging::debug!(
"Load checkpoint in path: {}, metadata: {:?}",
last_checkpoint.path(),
checkpoint_metadata
);
let checkpoint = self
.object_store
.object(&self.checkpoint_file_path(checkpoint_metadata.version));
Ok(Some((
checkpoint_metadata.version,
checkpoint.read().await.context(ReadObjectSnafu {
path: checkpoint.path(),
})?,
)))
} else {
Ok(None)
}
}
}
#[cfg(test)]
mod tests {
use object_store::{backend::fs, ObjectStore};
use tempdir::TempDir;
use super::*;
#[tokio::test]
async fn test_manifest_log_store() {
common_telemetry::init_default_ut_logging();
let tmp_dir = TempDir::new("test_manifest_log_store").unwrap();
let object_store = ObjectStore::new(
fs::Backend::build()
.root(&tmp_dir.path().to_string_lossy())
.finish()
.await
.unwrap(),
);
let log_store = ManifestObjectStore::new("/", object_store);
for v in 0..5 {
log_store
.save(v, format!("hello, {}", v).as_bytes())
.await
.unwrap();
}
let mut it = log_store.scan(1, 4).await.unwrap();
for v in 1..4 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {}", v).as_bytes(), bytes);
}
assert!(it.next_log().await.unwrap().is_none());
let mut it = log_store.scan(0, 11).await.unwrap();
for v in 0..5 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {}", v).as_bytes(), bytes);
}
assert!(it.next_log().await.unwrap().is_none());
// Delete [0, 3)
log_store.delete(0, 3).await.unwrap();
// [3, 5) remains
let mut it = log_store.scan(0, 11).await.unwrap();
for v in 3..5 {
let (version, bytes) = it.next_log().await.unwrap().unwrap();
assert_eq!(v, version);
assert_eq!(format!("hello, {}", v).as_bytes(), bytes);
}
assert!(it.next_log().await.unwrap().is_none());
// test checkpoint
assert!(log_store.load_checkpoint().await.unwrap().is_none());
log_store
.save_checkpoint(3, "checkpoint".as_bytes())
.await
.unwrap();
let (v, checkpoint) = log_store.load_checkpoint().await.unwrap().unwrap();
assert_eq!(checkpoint, "checkpoint".as_bytes());
assert_eq!(3, v);
}
}
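One reason the delta file names above are zero-padded to a fixed 20 digits: lexicographic order of the object names then matches numeric order of the versions, so the files group and sort naturally in the object store and the explicit sort in `scan` only has to cope with whatever order the listing returns. A small round-trip sketch using standalone copies of the naming helpers (simplified, for illustration only):

fn delta_file(version: u64) -> String {
    format!("{:020}.json", version)
}

fn delta_version(path: &str) -> u64 {
    // Same parsing as the helper above: take everything before the first '.'.
    path.split('.').next().unwrap().parse().unwrap()
}

fn main() {
    let name = delta_file(7);
    assert_eq!("00000000000000000007.json", name);
    assert_eq!(7, delta_version(&name));
    // Fixed width keeps lexicographic and numeric order in sync.
    assert!(delta_file(9) < delta_file(10));
}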

View File

@@ -2,22 +2,27 @@ mod btree;
mod inserter; mod inserter;
mod schema; mod schema;
#[cfg(test)] #[cfg(test)]
mod tests; pub mod tests;
mod version;
use std::mem;
use std::sync::Arc; use std::sync::Arc;
use datatypes::vectors::{UInt64Vector, UInt8Vector, VectorRef}; use datatypes::vectors::{UInt64Vector, UInt8Vector, VectorRef};
use snafu::Snafu;
use store_api::storage::{consts, SequenceNumber, ValueType}; use store_api::storage::{consts, SequenceNumber, ValueType};
use crate::error::Result; use crate::error::Result;
use crate::memtable::btree::BTreeMemtable; use crate::memtable::btree::BTreeMemtable;
pub use crate::memtable::inserter::Inserter; pub use crate::memtable::inserter::Inserter;
pub use crate::memtable::schema::MemtableSchema; pub use crate::memtable::schema::MemtableSchema;
pub use crate::memtable::version::{MemtableSet, MemtableVersion};
/// Unique id for memtables under the same region.
pub type MemtableId = u32;
/// In memory storage. /// In memory storage.
pub trait Memtable: Send + Sync { pub trait Memtable: Send + Sync + std::fmt::Debug {
fn id(&self) -> MemtableId;
fn schema(&self) -> &MemtableSchema; fn schema(&self) -> &MemtableSchema;
/// Write key/values to the memtable. /// Write key/values to the memtable.
@@ -27,7 +32,7 @@ pub trait Memtable: Send + Sync {
fn write(&self, kvs: &KeyValues) -> Result<()>; fn write(&self, kvs: &KeyValues) -> Result<()>;
/// Iterates the memtable. /// Iterates the memtable.
// TODO(yingwen): Consider passing a projector (does column projection). // TODO(yingwen): 1. Use reference of IterContext? 2. Consider passing a projector (does column projection).
fn iter(&self, ctx: IterContext) -> Result<BatchIteratorPtr>; fn iter(&self, ctx: IterContext) -> Result<BatchIteratorPtr>;
/// Returns the estimated bytes allocated by this memtable from heap. /// Returns the estimated bytes allocated by this memtable from heap.
@@ -43,6 +48,11 @@ pub struct IterContext {
pub batch_size: usize, pub batch_size: usize,
/// Max visible sequence (inclusive). /// Max visible sequence (inclusive).
pub visible_sequence: SequenceNumber, pub visible_sequence: SequenceNumber,
// TODO(yingwen): [flush] Maybe delay deduping and visibility handling, just return all rows
// in memtable.
/// Returns all rows, ignores sequence visibility and key duplication.
pub for_flush: bool,
} }
impl Default for IterContext { impl Default for IterContext {
@@ -51,6 +61,7 @@ impl Default for IterContext {
batch_size: consts::READ_BATCH_SIZE, batch_size: consts::READ_BATCH_SIZE,
// All data in memory is visible by default. // All data in memory is visible by default.
visible_sequence: SequenceNumber::MAX, visible_sequence: SequenceNumber::MAX,
for_flush: false,
} }
} }
} }
@@ -65,6 +76,7 @@ pub enum RowOrdering {
Key, Key,
} }
// TODO(yingwen): Maybe pack value_type with sequence (reserve 8bits in u64 for value type) like RocksDB.
pub struct Batch { pub struct Batch {
pub keys: Vec<VectorRef>, pub keys: Vec<VectorRef>,
pub sequences: UInt64Vector, pub sequences: UInt64Vector,
@@ -73,24 +85,18 @@ pub struct Batch {
} }
/// Iterator of memtable. /// Iterator of memtable.
pub trait BatchIterator: Send { pub trait BatchIterator: Iterator<Item = Result<Batch>> + Send + Sync {
/// Returns the schema of this iterator. /// Returns the schema of this iterator.
fn schema(&self) -> &MemtableSchema; fn schema(&self) -> &MemtableSchema;
/// Returns the ordering of the output rows from this iterator. /// Returns the ordering of the output rows from this iterator.
fn ordering(&self) -> RowOrdering; fn ordering(&self) -> RowOrdering;
/// Fetch next batch from the memtable.
///
/// # Panics
/// Panics if the iterator has already been exhausted.
fn next(&mut self) -> Result<Option<Batch>>;
} }
pub type BatchIteratorPtr = Box<dyn BatchIterator>; pub type BatchIteratorPtr = Box<dyn BatchIterator>;
pub trait MemtableBuilder: Send + Sync { pub trait MemtableBuilder: Send + Sync {
fn build(&self, schema: MemtableSchema) -> MemtableRef; fn build(&self, id: MemtableId, schema: MemtableSchema) -> MemtableRef;
} }
pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>; pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
@@ -100,7 +106,8 @@ pub type MemtableBuilderRef = Arc<dyn MemtableBuilder>;
pub struct KeyValues { pub struct KeyValues {
pub sequence: SequenceNumber, pub sequence: SequenceNumber,
pub value_type: ValueType, pub value_type: ValueType,
/// Start index of these key-value pairs in batch. /// Start index of these key-value pairs in batch. Each row in the same batch has
/// a unique index to identify it.
pub start_index_in_batch: usize, pub start_index_in_batch: usize,
pub keys: Vec<VectorRef>, pub keys: Vec<VectorRef>,
pub values: Vec<VectorRef>, pub values: Vec<VectorRef>,
@@ -132,42 +139,7 @@ impl KeyValues {
pub struct DefaultMemtableBuilder {} pub struct DefaultMemtableBuilder {}
impl MemtableBuilder for DefaultMemtableBuilder { impl MemtableBuilder for DefaultMemtableBuilder {
fn build(&self, schema: MemtableSchema) -> MemtableRef { fn build(&self, id: MemtableId, schema: MemtableSchema) -> MemtableRef {
Arc::new(BTreeMemtable::new(schema)) Arc::new(BTreeMemtable::new(id, schema))
}
}
#[derive(Debug, Snafu)]
#[snafu(display("Fail to switch memtable"))]
pub struct SwitchError;
pub struct MemtableSet {
mem: MemtableRef,
// TODO(yingwen): Support multiple immutable memtables.
_immem: Option<MemtableRef>,
}
impl MemtableSet {
pub fn new(mem: MemtableRef) -> MemtableSet {
MemtableSet { mem, _immem: None }
}
pub fn mutable_memtable(&self) -> &MemtableRef {
&self.mem
}
/// Switch mutable memtable to immutable memtable, returns the old mutable memtable if success.
pub fn _switch_memtable(
&mut self,
mem: &MemtableRef,
) -> std::result::Result<MemtableRef, SwitchError> {
match &self._immem {
Some(_) => SwitchSnafu {}.fail(),
None => {
let old_mem = mem::replace(&mut self.mem, mem.clone());
self._immem = Some(old_mem.clone());
Ok(old_mem)
}
}
} }
} }
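The `BatchIterator` change above (dropping the custom `fn next(&mut self) -> Result<Option<Batch>>` in favour of `Iterator<Item = Result<Batch>>`) mainly affects call sites: consumers can now use `for` loops and iterator adapters, handling the error per batch. A hedged sketch of that consumption pattern with a toy stand-in type (not the crate's real `Batch`):

// Toy stand-in: pretend each batch only carries a row count.
struct Batch {
    rows: usize,
}

type Result<T> = std::result::Result<T, String>;

// Any iterator of `Result<Batch>` can be consumed the way the new trait is:
// unwrap the error first, then use the batch.
fn count_rows<I: Iterator<Item = Result<Batch>>>(iter: I) -> Result<usize> {
    let mut total = 0;
    for batch in iter {
        let batch = batch?; // old style was: while let Some(batch) = iter.next()? { .. }
        total += batch.rows;
    }
    Ok(total)
}

fn main() {
    let batches = vec![Ok(Batch { rows: 3 }), Ok(Batch { rows: 2 })];
    assert_eq!(Ok(5), count_rows(batches.into_iter()));
}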

View File

@@ -8,13 +8,15 @@ use std::sync::{
use datatypes::prelude::*; use datatypes::prelude::*;
use datatypes::value::Value; use datatypes::value::Value;
use datatypes::vectors::{UInt64VectorBuilder, UInt8VectorBuilder, VectorBuilder}; use datatypes::vectors::{
UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, VectorBuilder,
};
use store_api::storage::{SequenceNumber, ValueType}; use store_api::storage::{SequenceNumber, ValueType};
use crate::error::Result; use crate::error::Result;
use crate::memtable::{ use crate::memtable::{
Batch, BatchIterator, BatchIteratorPtr, IterContext, KeyValues, Memtable, MemtableSchema, Batch, BatchIterator, BatchIteratorPtr, IterContext, KeyValues, Memtable, MemtableId,
RowOrdering, MemtableSchema, RowOrdering,
}; };
type RwLockMap = RwLock<BTreeMap<InnerKey, RowValue>>; type RwLockMap = RwLock<BTreeMap<InnerKey, RowValue>>;
@@ -22,15 +24,18 @@ type RwLockMap = RwLock<BTreeMap<InnerKey, RowValue>>;
/// A simple memtable implementation based on std's [`BTreeMap`]. /// A simple memtable implementation based on std's [`BTreeMap`].
/// ///
/// Mainly for test purpose, don't use in production. /// Mainly for test purpose, don't use in production.
#[derive(Debug)]
pub struct BTreeMemtable { pub struct BTreeMemtable {
id: MemtableId,
schema: MemtableSchema, schema: MemtableSchema,
map: Arc<RwLockMap>, map: Arc<RwLockMap>,
estimated_bytes: AtomicUsize, estimated_bytes: AtomicUsize,
} }
impl BTreeMemtable { impl BTreeMemtable {
pub fn new(schema: MemtableSchema) -> BTreeMemtable { pub fn new(id: MemtableId, schema: MemtableSchema) -> BTreeMemtable {
BTreeMemtable { BTreeMemtable {
id,
schema, schema,
map: Arc::new(RwLock::new(BTreeMap::new())), map: Arc::new(RwLock::new(BTreeMap::new())),
estimated_bytes: AtomicUsize::new(0), estimated_bytes: AtomicUsize::new(0),
@@ -39,6 +44,10 @@ impl BTreeMemtable {
} }
impl Memtable for BTreeMemtable { impl Memtable for BTreeMemtable {
fn id(&self) -> MemtableId {
self.id
}
fn schema(&self) -> &MemtableSchema { fn schema(&self) -> &MemtableSchema {
&self.schema &self.schema
} }
@@ -84,9 +93,13 @@ impl BatchIterator for BTreeIterator {
fn ordering(&self) -> RowOrdering { fn ordering(&self) -> RowOrdering {
RowOrdering::Key RowOrdering::Key
} }
}
fn next(&mut self) -> Result<Option<Batch>> { impl Iterator for BTreeIterator {
Ok(self.next_batch()) type Item = Result<Batch>;
fn next(&mut self) -> Option<Result<Batch>> {
self.next_batch().map(Ok)
} }
} }
@@ -107,18 +120,13 @@ impl BTreeIterator {
} else { } else {
map.range(..) map.range(..)
}; };
let iter = MapIterWrapper::new(iter, self.ctx.visible_sequence);
let mut keys = Vec::with_capacity(self.ctx.batch_size); let (keys, sequences, value_types, values) = if self.ctx.for_flush {
let mut sequences = UInt64VectorBuilder::with_capacity(self.ctx.batch_size); collect_iter(iter, self.ctx.batch_size)
let mut value_types = UInt8VectorBuilder::with_capacity(self.ctx.batch_size); } else {
let mut values = Vec::with_capacity(self.ctx.batch_size); let iter = MapIterWrapper::new(iter, self.ctx.visible_sequence);
for (inner_key, row_value) in iter.take(self.ctx.batch_size) { collect_iter(iter, self.ctx.batch_size)
keys.push(inner_key); };
sequences.push(Some(inner_key.sequence));
value_types.push(Some(inner_key.value_type.as_u8()));
values.push(row_value);
}
if keys.is_empty() { if keys.is_empty() {
return None; return None;
@@ -140,14 +148,37 @@ impl BTreeIterator {
Some(Batch { Some(Batch {
keys: rows_to_vectors(key_data_types, keys.as_slice()), keys: rows_to_vectors(key_data_types, keys.as_slice()),
sequences: sequences.finish(), sequences,
value_types: value_types.finish(), value_types,
values: rows_to_vectors(value_data_types, values.as_slice()), values: rows_to_vectors(value_data_types, values.as_slice()),
}) })
} }
} }
/// `MapIterWrapper` removes same user key with elder sequence. fn collect_iter<'a, I: Iterator<Item = (&'a InnerKey, &'a RowValue)>>(
iter: I,
batch_size: usize,
) -> (
Vec<&'a InnerKey>,
UInt64Vector,
UInt8Vector,
Vec<&'a RowValue>,
) {
let mut keys = Vec::with_capacity(batch_size);
let mut sequences = UInt64VectorBuilder::with_capacity(batch_size);
let mut value_types = UInt8VectorBuilder::with_capacity(batch_size);
let mut values = Vec::with_capacity(batch_size);
for (inner_key, row_value) in iter.take(batch_size) {
keys.push(inner_key);
sequences.push(Some(inner_key.sequence));
value_types.push(Some(inner_key.value_type.as_u8()));
values.push(row_value);
}
(keys, sequences.finish(), value_types.finish(), values)
}
/// `MapIterWrapper` removes duplicated user keys and rows with invisible sequences.
struct MapIterWrapper<'a, InnerKey, RowValue> { struct MapIterWrapper<'a, InnerKey, RowValue> {
iter: btree_map::Range<'a, InnerKey, RowValue>, iter: btree_map::Range<'a, InnerKey, RowValue>,
prev_key: Option<InnerKey>, prev_key: Option<InnerKey>,
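For context on the `for_flush` switch above: the normal read path wraps the map range in `MapIterWrapper`, which drops rows that are not visible at the requested sequence and keeps a single row per user key, while the flush path deliberately skips that filtering and emits every stored row. A much-simplified standalone sketch of the deduplication half of that idea (sequence visibility omitted; this is not the crate's implementation):

use std::collections::BTreeMap;

// Walk a sorted map and keep only the first entry seen for each user key,
// skipping the later duplicates, similar in spirit to MapIterWrapper.
fn dedup_first(map: &BTreeMap<(String, u64), i64>) -> Vec<(&str, i64)> {
    let mut out = Vec::new();
    let mut prev_key: Option<&str> = None;
    for ((key, _), value) in map.iter() {
        if prev_key == Some(key.as_str()) {
            continue; // duplicate user key, the preferred row was already emitted
        }
        prev_key = Some(key.as_str());
        out.push((key.as_str(), *value));
    }
    out
}

fn main() {
    let mut map = BTreeMap::new();
    map.insert(("a".to_string(), 0), 10);
    map.insert(("a".to_string(), 1), 11);
    map.insert(("b".to_string(), 0), 20);
    assert_eq!(vec![("a", 10), ("b", 20)], dedup_first(&map));
}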

View File

@@ -1,51 +1,80 @@
use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration;
use datatypes::vectors::{NullVector, VectorRef}; use common_time::{RangeMillis, TimestampMillis};
use snafu::ensure; use datatypes::prelude::ScalarVector;
use datatypes::schema::SchemaRef;
use datatypes::vectors::{Int64Vector, NullVector, VectorRef};
use snafu::{ensure, OptionExt};
use store_api::storage::{ColumnDescriptor, SequenceNumber, ValueType}; use store_api::storage::{ColumnDescriptor, SequenceNumber, ValueType};
use crate::error::{self, Result}; use crate::error::{self, Result};
use crate::memtable::{KeyValues, Memtable}; use crate::memtable::{KeyValues, Memtable, MemtableSet};
use crate::write_batch::{Mutation, PutData, WriteBatch}; use crate::write_batch::{Mutation, PutData, WriteBatch};
type RangeIndexMap = HashMap<TimestampMillis, usize>;
/// Wraps logic of inserting key/values in [WriteBatch] to [Memtable]. /// Wraps logic of inserting key/values in [WriteBatch] to [Memtable].
pub struct Inserter { pub struct Inserter {
/// Sequence of the batch to be inserted. /// Sequence of the batch to be inserted.
sequence: SequenceNumber, sequence: SequenceNumber,
/// Time ranges of all input data.
time_ranges: Vec<RangeMillis>,
/// Map time range's start time to its index in time ranges.
time_range_indexes: RangeIndexMap,
/// Bucket duration of memtables.
bucket_duration: Duration,
/// Used to calculate the start index in batch for `KeyValues`.
index_in_batch: usize, index_in_batch: usize,
} }
impl Inserter { impl Inserter {
pub fn new(sequence: SequenceNumber) -> Inserter { pub fn new(
sequence: SequenceNumber,
time_ranges: Vec<RangeMillis>,
bucket_duration: Duration,
) -> Inserter {
let time_range_indexes = new_range_index_map(&time_ranges);
Inserter { Inserter {
sequence, sequence,
time_ranges,
time_range_indexes,
bucket_duration,
index_in_batch: 0, index_in_batch: 0,
} }
} }
// TODO(yingwen): Can we take the WriteBatch? // TODO(yingwen): Can we take the WriteBatch?
/// Insert write batch into memtable. /// Insert write batch into memtables if both `batch` and `memtables` are not empty.
/// ///
/// Won't do schema validation. /// Won't do schema validation, caller (mostly the [`RegionWriter`]) should ensure the
pub fn insert_memtable(&mut self, batch: &WriteBatch, memtable: &dyn Memtable) -> Result<()> { /// schemas of `memtables` are consistent with `batch`'s, and the time ranges of `memtables`
if batch.is_empty() { /// are consistent with `self`'s time ranges.
///
/// # Panics
/// Panics if there is time range in `self.time_ranges` but not in `memtables`.
pub fn insert_memtables(&mut self, batch: &WriteBatch, memtables: &MemtableSet) -> Result<()> {
if batch.is_empty() || memtables.is_empty() {
return Ok(()); return Ok(());
} }
let schema = memtable.schema(); // Enough to hold all key or value columns.
let total_column_num = batch.schema().num_columns();
// Reusable KeyValues buffer. // Reusable KeyValues buffer.
let mut kvs = KeyValues { let mut kvs = KeyValues {
sequence: self.sequence, sequence: self.sequence,
value_type: ValueType::Put, value_type: ValueType::Put,
start_index_in_batch: self.index_in_batch, start_index_in_batch: self.index_in_batch,
keys: Vec::with_capacity(schema.num_row_key_columns()), keys: Vec::with_capacity(total_column_num),
values: Vec::with_capacity(schema.num_value_columns()), values: Vec::with_capacity(total_column_num),
}; };
for mutation in batch { for mutation in batch {
match mutation { match mutation {
Mutation::Put(put_data) => { Mutation::Put(put_data) => {
self.put_impl(put_data, memtable, &mut kvs)?; self.put_memtables(batch.schema(), put_data, memtables, &mut kvs)?;
} }
} }
} }
@@ -53,7 +82,24 @@ impl Inserter {
Ok(()) Ok(())
} }
fn put_impl( fn put_memtables(
&mut self,
schema: &SchemaRef,
put_data: &PutData,
memtables: &MemtableSet,
kvs: &mut KeyValues,
) -> Result<()> {
if memtables.len() == 1 {
// Fast path, only one memtable to put.
let (_range, memtable) = memtables.iter().next().unwrap();
return self.put_one_memtable(put_data, &**memtable, kvs);
}
// Split data by time range and put them into memtables.
self.put_multiple_memtables(schema, put_data, memtables, kvs)
}
fn put_one_memtable(
&mut self, &mut self,
put_data: &PutData, put_data: &PutData,
memtable: &dyn Memtable, memtable: &dyn Memtable,
@@ -78,6 +124,52 @@ impl Inserter {
Ok(()) Ok(())
} }
/// Put data to multiple memtables.
fn put_multiple_memtables(
&mut self,
schema: &SchemaRef,
put_data: &PutData,
memtables: &MemtableSet,
kvs: &mut KeyValues,
) -> Result<()> {
let timestamp_schema = schema
.timestamp_column()
.context(error::BatchMissingTimestampSnafu)?;
let timestamps = put_data.column_by_name(&timestamp_schema.name).context(
error::BatchMissingColumnSnafu {
column: &timestamp_schema.name,
},
)?;
let timestamps = timestamps
.as_any()
.downcast_ref()
.context(error::BatchMissingTimestampSnafu)?;
let slice_indexes =
compute_slice_indexes(timestamps, self.bucket_duration, &self.time_range_indexes);
for slice_index in slice_indexes {
let sliced_data = put_data.slice(slice_index.start, slice_index.end);
let range = &self.time_ranges[slice_index.range_index];
// The caller should ensure the memtable for the given time range exists.
let memtable = memtables
.get_by_range(range)
.expect("Memtable not found for range");
self.put_one_memtable(&sliced_data, &**memtable, kvs)?;
}
Ok(())
}
}
fn new_range_index_map(time_ranges: &[RangeMillis]) -> RangeIndexMap {
time_ranges
.iter()
.enumerate()
.map(|(i, range)| (*range.start(), i))
.collect()
} }
fn clone_put_data_column_to( fn clone_put_data_column_to(
@@ -100,3 +192,519 @@ fn clone_put_data_column_to(
Ok(()) Ok(())
} }
/// Holds `start` and `end` indexes to get a slice `[start, end)` from the vector whose
/// timestamps belong to same time range at `range_index`.
#[derive(Debug, PartialEq)]
struct SliceIndex {
start: usize,
end: usize,
/// Index in time ranges.
range_index: usize,
}
/// Computes the indexes used to split `timestamps` into time ranges aligned by `duration`,
/// returning them as [`SliceIndex`]es.
///
/// # Panics
/// Panics if the duration is too large to be represented by i64. Timestamps that are not
/// covered by `time_range_indexes` are skipped rather than causing a panic.
fn compute_slice_indexes(
timestamps: &Int64Vector,
duration: Duration,
time_range_indexes: &RangeIndexMap,
) -> Vec<SliceIndex> {
let duration_ms = duration
.as_millis()
.try_into()
.unwrap_or_else(|e| panic!("Duration {:?} too large, {}", duration, e));
let mut slice_indexes = Vec::with_capacity(time_range_indexes.len());
// Current start and end of a valid `SliceIndex`.
let (mut start, mut end) = (0, 0);
// Time range index of the valid but unpushed `SliceIndex`.
let mut last_range_index = None;
// Iterate all timestamps, split timestamps by its time range.
for (i, ts) in timestamps.iter_data().enumerate() {
// Find index for time range of the timestamp.
let current_range_index = ts
.and_then(|v| TimestampMillis::new(v).align_by_bucket(duration_ms))
.and_then(|aligned| time_range_indexes.get(&aligned).copied());
match current_range_index {
Some(current_range_index) => {
end = i;
match last_range_index {
Some(last_index) => {
if last_index != current_range_index {
// Found a new range, we need to push a SliceIndex for last range.
slice_indexes.push(SliceIndex {
start,
end,
range_index: last_index,
});
// Update last range index.
last_range_index = Some(current_range_index);
// Advance start.
start = i;
}
}
// No previous range index.
None => last_range_index = Some(current_range_index),
}
}
None => {
// Rows without a timestamp or outside all time ranges are skipped. This usually should not happen.
if let Some(last_index) = last_range_index {
// Need to store SliceIndex for last range.
slice_indexes.push(SliceIndex {
start,
end: i,
range_index: last_index,
});
// Clear last range index.
last_range_index = None;
}
// Advances start and end, skips current row.
start = i + 1;
end = start;
}
}
}
// Process last slice index.
if let Some(last_index) = last_range_index {
slice_indexes.push(SliceIndex {
start,
// We need to use `end + 1` to include the last element.
end: end + 1,
range_index: last_index,
});
}
slice_indexes
}
#[cfg(test)]
mod tests {
use datatypes::{type_id::LogicalTypeId, value::Value};
use store_api::storage::{PutOperation, WriteRequest};
use super::*;
use crate::memtable::{
DefaultMemtableBuilder, IterContext, MemtableBuilder, MemtableId, MemtableSchema,
};
use crate::metadata::RegionMetadata;
use crate::test_util::descriptor_util::RegionDescBuilder;
use crate::test_util::write_batch_util;
fn new_time_ranges(starts: &[i64], duration: i64) -> Vec<RangeMillis> {
let mut ranges = Vec::with_capacity(starts.len());
for start in starts {
assert_eq!(*start, start / duration * duration);
ranges.push(RangeMillis::new(*start, start + duration).unwrap());
}
ranges
}
fn check_compute_slice_indexes(
timestamps: &[Option<i64>],
range_starts: &[i64],
duration: i64,
expect: &[SliceIndex],
) {
assert!(duration > 0);
let timestamps = Int64Vector::from_iter(timestamps.iter());
let time_ranges = new_time_ranges(range_starts, duration);
let time_range_indexes = new_range_index_map(&time_ranges);
let slice_indexes = compute_slice_indexes(
&timestamps,
Duration::from_millis(duration as u64),
&time_range_indexes,
);
assert_eq!(expect, slice_indexes);
}
#[test]
fn test_compute_slice_indexes_valid() {
// Test empty input.
check_compute_slice_indexes(&[], &[], 100, &[]);
// One valid input.
check_compute_slice_indexes(
&[Some(99)],
&[0],
100,
&[SliceIndex {
start: 0,
end: 1,
range_index: 0,
}],
);
// 2 ranges.
check_compute_slice_indexes(
&[Some(99), Some(234)],
&[0, 200],
100,
&[
SliceIndex {
start: 0,
end: 1,
range_index: 0,
},
SliceIndex {
start: 1,
end: 2,
range_index: 1,
},
],
);
// Multiple elements in first range.
check_compute_slice_indexes(
&[Some(99), Some(13), Some(18), Some(234)],
&[0, 200],
100,
&[
SliceIndex {
start: 0,
end: 3,
range_index: 0,
},
SliceIndex {
start: 3,
end: 4,
range_index: 1,
},
],
);
// Multiple elements in last range.
check_compute_slice_indexes(
&[Some(99), Some(234), Some(271)],
&[0, 200],
100,
&[
SliceIndex {
start: 0,
end: 1,
range_index: 0,
},
SliceIndex {
start: 1,
end: 3,
range_index: 1,
},
],
);
// Multiple ranges.
check_compute_slice_indexes(
&[Some(99), Some(13), Some(234), Some(456)],
&[0, 200, 400],
100,
&[
SliceIndex {
start: 0,
end: 2,
range_index: 0,
},
SliceIndex {
start: 2,
end: 3,
range_index: 1,
},
SliceIndex {
start: 3,
end: 4,
range_index: 2,
},
],
);
// Different slices with same range.
check_compute_slice_indexes(
&[Some(99), Some(234), Some(15)],
&[0, 200],
100,
&[
SliceIndex {
start: 0,
end: 1,
range_index: 0,
},
SliceIndex {
start: 1,
end: 2,
range_index: 1,
},
SliceIndex {
start: 2,
end: 3,
range_index: 0,
},
],
);
}
#[test]
fn test_compute_slice_indexes_null_timestamp() {
check_compute_slice_indexes(&[None], &[0], 100, &[]);
check_compute_slice_indexes(
&[None, None, Some(53)],
&[0],
100,
&[SliceIndex {
start: 2,
end: 3,
range_index: 0,
}],
);
check_compute_slice_indexes(
&[Some(53), None, None],
&[0],
100,
&[SliceIndex {
start: 0,
end: 1,
range_index: 0,
}],
);
check_compute_slice_indexes(
&[None, Some(53), None, Some(240), Some(13), None],
&[0, 200],
100,
&[
SliceIndex {
start: 1,
end: 2,
range_index: 0,
},
SliceIndex {
start: 3,
end: 4,
range_index: 1,
},
SliceIndex {
start: 4,
end: 5,
range_index: 0,
},
],
);
}
#[test]
fn test_compute_slice_indexes_no_range() {
check_compute_slice_indexes(
&[Some(99), Some(234), Some(15)],
&[0],
100,
&[
SliceIndex {
start: 0,
end: 1,
range_index: 0,
},
SliceIndex {
start: 2,
end: 3,
range_index: 0,
},
],
);
check_compute_slice_indexes(
&[Some(99), Some(15), Some(234)],
&[0],
100,
&[SliceIndex {
start: 0,
end: 2,
range_index: 0,
}],
);
check_compute_slice_indexes(
&[Some(i64::MIN), Some(99), Some(15)],
&[0],
100,
&[SliceIndex {
start: 1,
end: 3,
range_index: 0,
}],
);
}
fn new_test_write_batch() -> WriteBatch {
write_batch_util::new_write_batch(
&[
("ts", LogicalTypeId::Int64, false),
("value", LogicalTypeId::Int64, true),
],
Some(0),
)
}
fn new_memtable_schema() -> MemtableSchema {
let desc = RegionDescBuilder::new("test")
.timestamp(("ts", LogicalTypeId::Int64, false))
.push_value_column(("value", LogicalTypeId::Int64, true))
.enable_version_column(false)
.build();
let metadata: RegionMetadata = desc.try_into().unwrap();
MemtableSchema::new(metadata.columns_row_key)
}
fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option<i64>)]) {
let mut put_data = PutData::with_num_columns(2);
let ts = Int64Vector::from_values(data.iter().map(|v| v.0));
put_data.add_key_column("ts", Arc::new(ts)).unwrap();
let value = Int64Vector::from_iter(data.iter().map(|v| v.1));
put_data.add_value_column("value", Arc::new(value)).unwrap();
batch.put(put_data).unwrap();
}
fn new_memtable_set(time_ranges: &[RangeMillis], schema: &MemtableSchema) -> MemtableSet {
let mut set = MemtableSet::new();
for (id, range) in time_ranges.iter().enumerate() {
let mem = DefaultMemtableBuilder {}.build(id as MemtableId, schema.clone());
set.insert(*range, mem)
}
set
}
fn check_memtable_content(
mem: &dyn Memtable,
sequence: SequenceNumber,
data: &[(i64, Option<i64>)],
) {
let iter = mem.iter(IterContext::default()).unwrap();
let mut index = 0;
for batch in iter {
let batch = batch.unwrap();
let row_num = batch.keys[0].len();
for i in 0..row_num {
let ts = batch.keys[0].get(i);
let v = batch.values[0].get(i);
assert_eq!(Value::from(data[index].0), ts);
assert_eq!(Value::from(data[index].1), v);
assert_eq!(sequence, batch.sequences.get_data(i).unwrap());
index += 1;
}
}
assert_eq!(data.len(), index);
}
#[test]
fn test_inserter_put_one_memtable() {
let sequence = 11111;
let bucket_duration = 100;
let time_ranges = new_time_ranges(&[0], bucket_duration);
let memtable_schema = new_memtable_schema();
let memtables = new_memtable_set(&time_ranges, &memtable_schema);
let mut inserter = Inserter::new(
sequence,
time_ranges,
Duration::from_millis(bucket_duration as u64),
);
let mut batch = new_test_write_batch();
put_batch(&mut batch, &[(1, Some(1)), (2, None)]);
// Also test multiple put data in one batch.
put_batch(
&mut batch,
&[
(3, None),
// Duplicate entries in same put data.
(2, None),
(2, Some(2)),
(4, Some(4)),
],
);
inserter.insert_memtables(&batch, &memtables).unwrap();
let mem = memtables
.get_by_range(&RangeMillis::new(0, 100).unwrap())
.unwrap();
check_memtable_content(
&**mem,
sequence,
&[(1, Some(1)), (2, Some(2)), (3, None), (4, Some(4))],
);
}
#[test]
fn test_inserter_put_multiple() {
let sequence = 11111;
let bucket_duration = 100;
let time_ranges = new_time_ranges(&[0, 100, 200], bucket_duration);
let memtable_schema = new_memtable_schema();
let memtables = new_memtable_set(&time_ranges, &memtable_schema);
let mut inserter = Inserter::new(
sequence,
time_ranges,
Duration::from_millis(bucket_duration as u64),
);
let mut batch = new_test_write_batch();
put_batch(
&mut batch,
&[
(1, Some(1)),
(2, None),
(201, Some(201)),
(102, None),
(101, Some(101)),
],
);
put_batch(
&mut batch,
&[
(180, Some(1)),
(3, Some(3)),
(1, None),
(211, Some(211)),
(180, Some(180)),
],
);
inserter.insert_memtables(&batch, &memtables).unwrap();
let mem = memtables
.get_by_range(&RangeMillis::new(0, 100).unwrap())
.unwrap();
check_memtable_content(&**mem, sequence, &[(1, None), (2, None), (3, Some(3))]);
let mem = memtables
.get_by_range(&RangeMillis::new(100, 200).unwrap())
.unwrap();
check_memtable_content(
&**mem,
sequence,
&[(101, Some(101)), (102, None), (180, Some(180))],
);
let mem = memtables
.get_by_range(&RangeMillis::new(200, 300).unwrap())
.unwrap();
check_memtable_content(&**mem, sequence, &[(201, Some(201)), (211, Some(211))]);
}
}
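The slicing logic above relies on `TimestampMillis::align_by_bucket` to map each timestamp to the start of its bucket; rows whose alignment fails or whose bucket has no prepared range are simply skipped. As a purely illustrative assumption of how such an alignment could behave (floor toward negative infinity, `None` on overflow) — not the crate's actual implementation:

// Assumed behaviour of bucket alignment: floor the timestamp to a multiple of
// `bucket_ms`, returning None when the aligned value cannot be represented.
fn align_by_bucket(ts_ms: i64, bucket_ms: i64) -> Option<i64> {
    debug_assert!(bucket_ms > 0);
    ts_ms
        .checked_div_euclid(bucket_ms)
        .and_then(|bucket| bucket.checked_mul(bucket_ms))
}

fn main() {
    assert_eq!(Some(0), align_by_bucket(99, 100));
    assert_eq!(Some(200), align_by_bucket(234, 100));
    // Negative timestamps floor toward negative infinity.
    assert_eq!(Some(-100), align_by_bucket(-1, 100));
    // Under this assumption i64::MIN cannot be aligned without overflow; either way,
    // that row finds no matching range in the tests above and is skipped.
    assert_eq!(None, align_by_bucket(i64::MIN, 100));
}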

View File

@@ -6,12 +6,16 @@ use super::*;
use crate::metadata::RegionMetadata; use crate::metadata::RegionMetadata;
use crate::test_util::descriptor_util::RegionDescBuilder; use crate::test_util::descriptor_util::RegionDescBuilder;
// For simplicity, all memtables in tests share the same memtable id.
const MEMTABLE_ID: MemtableId = 1;
// Schema for testing memtable: // Schema for testing memtable:
// - key: Int64(timestamp), UInt64(version), // - key: Int64(timestamp), UInt64(version),
// - value: UInt64 // - value: UInt64
fn schema_for_test() -> MemtableSchema { pub fn schema_for_test() -> MemtableSchema {
// Just build a region desc and use its columns_row_key metadata. // Just build a region desc and use its columns_row_key metadata.
let desc = RegionDescBuilder::new("test") let desc = RegionDescBuilder::new("test")
.enable_version_column(true)
.push_value_column(("v1", LogicalTypeId::UInt64, true)) .push_value_column(("v1", LogicalTypeId::UInt64, true))
.build(); .build();
let metadata: RegionMetadata = desc.try_into().unwrap(); let metadata: RegionMetadata = desc.try_into().unwrap();
@@ -70,7 +74,7 @@ fn kvs_for_test(
kvs_for_test_with_index(sequence, value_type, 0, keys, values) kvs_for_test_with_index(sequence, value_type, 0, keys, values)
} }
fn write_kvs( pub fn write_kvs(
memtable: &dyn Memtable, memtable: &dyn Memtable,
sequence: SequenceNumber, sequence: SequenceNumber,
value_type: ValueType, value_type: ValueType,
@@ -100,7 +104,8 @@ fn check_iter_content(
values: &[Option<u64>], values: &[Option<u64>],
) { ) {
let mut index = 0; let mut index = 0;
while let Some(batch) = iter.next().unwrap() { for batch in iter {
let batch = batch.unwrap();
check_batch_valid(&batch); check_batch_valid(&batch);
let row_num = batch.keys[0].len(); let row_num = batch.keys[0].len();
@@ -147,7 +152,7 @@ impl MemtableTester {
fn new_memtables(&self) -> Vec<MemtableRef> { fn new_memtables(&self) -> Vec<MemtableRef> {
self.builders self.builders
.iter() .iter()
.map(|b| b.build(self.schema.clone())) .map(|b| b.build(MEMTABLE_ID, self.schema.clone()))
.collect() .collect()
} }
@@ -174,7 +179,9 @@ struct TestContext {
fn write_iter_memtable_case(ctx: &TestContext) { fn write_iter_memtable_case(ctx: &TestContext) {
// Test iterating an empty memtable. // Test iterating an empty memtable.
let mut iter = ctx.memtable.iter(IterContext::default()).unwrap(); let mut iter = ctx.memtable.iter(IterContext::default()).unwrap();
assert!(iter.next().unwrap().is_none()); assert!(iter.next().is_none());
// Poll the empty iterator again.
assert!(iter.next().is_none());
assert_eq!(0, ctx.memtable.bytes_allocated()); assert_eq!(0, ctx.memtable.bytes_allocated());
// Init test data. // Init test data.
@@ -262,7 +269,8 @@ fn test_write_iter_memtable() {
fn check_iter_batch_size(iter: &mut dyn BatchIterator, total: usize, batch_size: usize) { fn check_iter_batch_size(iter: &mut dyn BatchIterator, total: usize, batch_size: usize) {
let mut remains = total; let mut remains = total;
while let Some(batch) = iter.next().unwrap() { for batch in iter {
let batch = batch.unwrap();
check_batch_valid(&batch); check_batch_valid(&batch);
let row_num = batch.keys[0].len(); let row_num = batch.keys[0].len();
@@ -419,6 +427,7 @@ fn test_sequence_visibility() {
let iter_ctx = IterContext { let iter_ctx = IterContext {
batch_size: 1, batch_size: 1,
visible_sequence: 9, visible_sequence: 9,
for_flush: false,
}; };
let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
@@ -435,6 +444,7 @@ fn test_sequence_visibility() {
let iter_ctx = IterContext { let iter_ctx = IterContext {
batch_size: 1, batch_size: 1,
visible_sequence: 10, visible_sequence: 10,
for_flush: false,
}; };
let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
@@ -451,6 +461,7 @@ fn test_sequence_visibility() {
let iter_ctx = IterContext { let iter_ctx = IterContext {
batch_size: 1, batch_size: 1,
visible_sequence: 11, visible_sequence: 11,
for_flush: false,
}; };
let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
@@ -465,4 +476,26 @@ fn test_sequence_visibility() {
}); });
} }
// TODO(yingwen): Test key overwrite in same batch. #[test]
fn test_iter_after_none() {
let tester = MemtableTester::default();
tester.run_testcase(|ctx| {
write_kvs(
&*ctx.memtable,
10, // sequence
ValueType::Put,
&[(1000, 0), (1001, 1), (1002, 2)], // keys
&[Some(0), Some(1), Some(2)], // values
);
let iter_ctx = IterContext {
batch_size: 4,
..Default::default()
};
let mut iter = ctx.memtable.iter(iter_ctx).unwrap();
assert!(iter.next().is_some());
assert!(iter.next().is_none());
assert!(iter.next().is_none());
});
}

View File

@@ -0,0 +1,415 @@
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::sync::Arc;
use common_time::RangeMillis;
use crate::flush::MemtableWithMeta;
use crate::memtable::{MemtableId, MemtableRef};
/// A version of all memtables.
///
/// This structure is immutable; operations that modify it return a new `MemtableVersion`.
#[derive(Default, Debug, PartialEq, Eq)]
pub struct MemtableVersion {
mutable: MemtableSet,
/// Immutable memtables.
immutables: Vec<MemtableSetRef>,
}
impl MemtableVersion {
pub fn new() -> MemtableVersion {
MemtableVersion::default()
}
#[inline]
pub fn mutable_memtables(&self) -> &MemtableSet {
&self.mutable
}
#[inline]
pub fn immutable_memtables(&self) -> &[MemtableSetRef] {
&self.immutables
}
pub fn num_memtables(&self) -> usize {
self.mutable.len() + self.immutables.iter().map(|set| set.len()).sum::<usize>()
}
/// Clones the current memtable version and freezes its mutable memtables, moving
/// all mutable memtables into the immutable memtable list.
pub fn freeze_mutable(&self) -> MemtableVersion {
let mut immutables = self.immutables.clone();
immutables.push(Arc::new(self.mutable.clone()));
MemtableVersion {
mutable: MemtableSet::new(),
immutables,
}
}
pub fn mutable_bytes_allocated(&self) -> usize {
self.mutable.bytes_allocated()
}
pub fn total_bytes_allocated(&self) -> usize {
self.immutables
.iter()
.map(|m| m.bytes_allocated())
.sum::<usize>()
+ self.mutable.bytes_allocated()
}
/// Creates a new `MemtableVersion` that contains the memtables in both `self` and `other`.
///
/// # Panics
/// Panics if two memtables share the same time range.
pub fn add_mutable(&self, other: MemtableSet) -> MemtableVersion {
let mutable = self.mutable.add(other);
Self {
mutable,
immutables: self.immutables.clone(),
}
}
/// Creates a new `MemtableVersion` with all immutable memtable sets whose max memtable
/// id is less than or equal to `max_memtable_id` removed.
pub fn remove_immutables(&self, max_memtable_id: MemtableId) -> MemtableVersion {
let immutables = self
.immutables
.iter()
.filter(|immem| immem.max_memtable_id() > max_memtable_id)
.cloned()
.collect();
MemtableVersion {
mutable: self.mutable.clone(),
immutables,
}
}
pub fn memtables_to_flush(&self) -> (Option<MemtableId>, Vec<MemtableWithMeta>) {
let max_memtable_id = self
.immutables
.iter()
.map(|immem| immem.max_memtable_id())
.max();
let memtables = self
.immutables
.iter()
.flat_map(|immem| immem.to_memtable_with_metas())
.collect();
(max_memtable_id, memtables)
}
}
// A newtype used to order time ranges by (end, start).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct RangeKey(RangeMillis);
impl Ord for RangeKey {
fn cmp(&self, other: &RangeKey) -> Ordering {
self.0
.end()
.cmp(other.0.end())
.then_with(|| self.0.start().cmp(other.0.start()))
}
}
impl PartialOrd for RangeKey {
fn partial_cmp(&self, other: &RangeKey) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// Collection of mutable memtables.
///
/// Memtables are partitioned by their time range. The caller should ensure
/// there are no overlapping ranges and that all ranges are aligned to the same
/// bucket duration.
#[derive(Default, Clone, Debug)]
pub struct MemtableSet {
memtables: BTreeMap<RangeKey, MemtableRef>,
max_memtable_id: MemtableId,
}
pub type MemtableSetRef = Arc<MemtableSet>;
impl PartialEq for MemtableSet {
fn eq(&self, other: &MemtableSet) -> bool {
self.max_memtable_id == other.max_memtable_id
&& self.memtables.len() == other.memtables.len()
&& self
.memtables
.iter()
.zip(&other.memtables)
.all(|(a, b)| a.0 == b.0 && a.1.id() == b.1.id() && a.1.schema() == b.1.schema())
}
}
impl Eq for MemtableSet {}
impl MemtableSet {
pub fn new() -> MemtableSet {
MemtableSet::default()
}
/// Get memtable by time range.
///
/// The range must exactly equal to the range of the memtable, otherwise `None`
/// is returned.
pub fn get_by_range(&self, range: &RangeMillis) -> Option<&MemtableRef> {
let range_key = RangeKey(*range);
self.memtables.get(&range_key)
}
/// Insert a new memtable.
///
/// # Panics
/// Panics if memtable with same range already exists.
pub fn insert(&mut self, range: RangeMillis, mem: MemtableRef) {
self.max_memtable_id = MemtableId::max(self.max_memtable_id, mem.id());
let old = self.memtables.insert(RangeKey(range), mem);
assert!(old.is_none());
}
/// Returns number of memtables in the set.
#[inline]
pub fn len(&self) -> usize {
self.memtables.len()
}
/// Returns true if there is no memtable in the set.
#[inline]
pub fn is_empty(&self) -> bool {
self.memtables.is_empty()
}
pub fn bytes_allocated(&self) -> usize {
self.memtables.values().map(|m| m.bytes_allocated()).sum()
}
pub fn max_memtable_id(&self) -> MemtableId {
self.max_memtable_id
}
/// Creates a new `MemtableSet` that contains the memtables in both `self` and
/// `other`, leaving `self` unchanged.
pub fn add(&self, mut other: MemtableSet) -> MemtableSet {
// We extend `other.memtables` with the entries of `self.memtables`, since
// `other` is usually empty, so reusing its map is okay.
other
.memtables
.extend(self.memtables.iter().map(|(k, v)| (*k, v.clone())));
MemtableSet {
memtables: other.memtables,
max_memtable_id: MemtableId::max(self.max_memtable_id, other.max_memtable_id),
}
}
pub fn to_memtable_with_metas(&self) -> Vec<MemtableWithMeta> {
self.memtables
.iter()
.map(|(range_key, memtable)| MemtableWithMeta {
memtable: memtable.clone(),
bucket: range_key.0,
})
.collect()
}
pub fn iter(&self) -> impl Iterator<Item = (&RangeMillis, &MemtableRef)> {
self.memtables.iter().map(|(k, v)| (&k.0, v))
}
}
#[cfg(test)]
mod tests {
use store_api::storage::ValueType;
use super::*;
use crate::memtable::tests;
use crate::memtable::BTreeMemtable;
use crate::memtable::Memtable;
#[test]
fn test_memtableset_misc() {
let mut set = MemtableSet::new();
assert!(set.is_empty());
assert_eq!(0, set.max_memtable_id());
assert_eq!(0, set.bytes_allocated());
assert!(set
.get_by_range(&RangeMillis::new(0, 10).unwrap())
.is_none());
set.insert(
RangeMillis::new(0, 10).unwrap(),
Arc::new(BTreeMemtable::new(0, tests::schema_for_test())),
);
set.insert(
RangeMillis::new(10, 20).unwrap(),
Arc::new(BTreeMemtable::new(1, tests::schema_for_test())),
);
let memtable = Arc::new(BTreeMemtable::new(2, tests::schema_for_test()));
// Write some test data
tests::write_kvs(
&*memtable,
10, // sequence
ValueType::Put,
&[
(1000, 1),
(1000, 2),
(2002, 1),
(2003, 1),
(2003, 5),
(1001, 1),
], // keys
&[Some(1), Some(2), Some(7), Some(8), Some(9), Some(3)], // values
);
set.insert(RangeMillis::new(20, 30).unwrap(), memtable.clone());
for (i, (range, _)) in set.iter().enumerate() {
assert_eq!(
*range,
RangeMillis::new(i as i64 * 10, i as i64 * 10 + 10).unwrap()
);
}
assert!(!set.is_empty());
assert_eq!(2, set.max_memtable_id());
assert_eq!(memtable.bytes_allocated(), set.bytes_allocated());
assert!(set
.get_by_range(&RangeMillis::new(0, 10).unwrap())
.is_some());
assert!(set
.get_by_range(&RangeMillis::new(10, 20).unwrap())
.is_some());
assert!(set
.get_by_range(&RangeMillis::new(20, 30).unwrap())
.is_some());
assert!(set
.get_by_range(&RangeMillis::new(0, 100).unwrap())
.is_none());
}
fn create_test_memtableset(ids: &[MemtableId]) -> MemtableSet {
let mut set = MemtableSet::new();
for id in ids {
let i = *id as i64;
set.insert(
RangeMillis::new(i * 10, (i + 1) * 10).unwrap(),
Arc::new(BTreeMemtable::new(*id, tests::schema_for_test())),
);
}
set
}
#[test]
fn test_add_memtableset() {
let s1 = create_test_memtableset(&[0, 1, 2]);
let s2 = create_test_memtableset(&[3, 4, 5, 6]);
let mut s1_memtables = s1.to_memtable_with_metas();
let s2_memtables = s2.to_memtable_with_metas();
s1_memtables.extend(s2_memtables);
let empty = create_test_memtableset(&[]);
assert_eq!(s1, s1.add(empty));
let s3 = s1.add(s2);
assert_ne!(s1, s3);
assert_eq!(7, s3.memtables.len());
let s3_memtables = s3.to_memtable_with_metas();
assert_eq!(7, s3_memtables.len());
for i in 0..7 {
assert_eq!(s1_memtables[i].bucket, s3_memtables[i].bucket);
assert_eq!(s1_memtables[i].memtable.id(), s3_memtables[i].memtable.id());
}
assert_eq!(6, s3.max_memtable_id());
}
#[test]
fn test_memtableversion() {
let s1 = create_test_memtableset(&[0, 1, 2]);
let s2 = create_test_memtableset(&[3, 4, 5, 6]);
let s3 = s1.add(s2.clone());
let v1 = MemtableVersion::new();
assert!(v1.mutable_memtables().is_empty());
assert_eq!(0, v1.num_memtables());
// Add one mutable
let v2 = v1.add_mutable(s1.clone());
assert_ne!(v1, v2);
let mutables = v2.mutable_memtables();
assert_eq!(s1, *mutables);
assert_eq!(3, v2.num_memtables());
// Add another mutable
let v3 = v2.add_mutable(s2);
assert_ne!(v1, v3);
assert_ne!(v2, v3);
let mutables = v3.mutable_memtables();
assert_eq!(s3, *mutables);
assert!(v3.memtables_to_flush().1.is_empty());
assert_eq!(7, v3.num_memtables());
// Try to freeze s1, s2
let v4 = v3.freeze_mutable();
assert_ne!(v1, v4);
assert_ne!(v2, v4);
assert_ne!(v3, v4);
assert!(v4.mutable_memtables().is_empty());
assert_eq!(v4.immutables.len(), 1);
assert_eq!(v4.immutables[0], Arc::new(s3.clone()));
let (max_id, tables) = v4.memtables_to_flush();
assert_eq!(6, max_id.unwrap());
assert_eq!(7, tables.len());
assert_eq!(7, v4.num_memtables());
// Add another mutable
let s4 = create_test_memtableset(&[7, 8]);
let v5 = v4.add_mutable(s4.clone());
let mutables = v5.mutable_memtables();
assert_eq!(s4, *mutables);
assert_eq!(v4.immutables, v5.immutables);
// Try to freeze s4
let v6 = v5.freeze_mutable();
assert_eq!(v6.immutables.len(), 2);
assert_eq!(v6.immutables[0], Arc::new(s3));
assert_eq!(v6.immutables[1], Arc::new(s4.clone()));
let (max_id, tables) = v6.memtables_to_flush();
assert_eq!(8, max_id.unwrap());
assert_eq!(9, tables.len());
assert_eq!(9, v6.num_memtables());
// verify tables
for (i, table) in tables.iter().enumerate() {
assert_eq!(i as u32, table.memtable.id());
let i = i as i64;
assert_eq!(
table.bucket,
RangeMillis::new(i * 10, (i + 1) * 10).unwrap()
);
}
// Remove tables
let v7 = v6.remove_immutables(6);
assert_eq!(v7.immutables.len(), 1);
assert_eq!(v7.immutables[0], Arc::new(s4));
let v8 = v7.remove_immutables(8);
assert_eq!(v8.immutables.len(), 0);
assert_eq!(0, v8.num_memtables());
}
}
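`RangeKey` above is the usual newtype trick for giving `BTreeMap` a custom key order without changing `RangeMillis` itself; ordering by `(end, start)` keeps the non-overlapping, equally sized buckets sorted by time. A standalone sketch of the same pattern with an illustrative range type (not the crate's):

use std::cmp::Ordering;
use std::collections::BTreeMap;

// Illustrative stand-in for RangeMillis: a half-open [start, end) range.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Range {
    start: i64,
    end: i64,
}

// Newtype that orders ranges by (end, start), mirroring RangeKey above.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct RangeKey(Range);

impl Ord for RangeKey {
    fn cmp(&self, other: &Self) -> Ordering {
        self.0
            .end
            .cmp(&other.0.end)
            .then_with(|| self.0.start.cmp(&other.0.start))
    }
}

impl PartialOrd for RangeKey {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut map = BTreeMap::new();
    map.insert(RangeKey(Range { start: 20, end: 30 }), "c");
    map.insert(RangeKey(Range { start: 0, end: 10 }), "a");
    map.insert(RangeKey(Range { start: 10, end: 20 }), "b");
    // Iteration follows the custom order, i.e. ascending by bucket end.
    let values: Vec<_> = map.values().copied().collect();
    assert_eq!(vec!["a", "b", "c"], values);
}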

View File

@@ -3,10 +3,12 @@ use std::sync::Arc;
use common_error::prelude::*; use common_error::prelude::*;
use datatypes::data_type::ConcreteDataType; use datatypes::data_type::ConcreteDataType;
use serde::{Deserialize, Serialize};
use snafu::ensure; use snafu::ensure;
use store_api::storage::{ use store_api::storage::{
consts, ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptor, ColumnFamilyId, consts, ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptor, ColumnFamilyId,
ColumnId, ColumnSchema, RegionDescriptor, RegionMeta, RowKeyDescriptor, Schema, SchemaRef, ColumnId, ColumnSchema, RegionDescriptor, RegionId, RegionMeta, RowKeyDescriptor, Schema,
SchemaRef,
}; };
/// Error for handling metadata. /// Error for handling metadata.
@@ -20,6 +22,12 @@ pub enum Error {
#[snafu(display("Column family id already exists, id: {}", id))] #[snafu(display("Column family id already exists, id: {}", id))]
CfIdExists { id: ColumnId, backtrace: Backtrace }, CfIdExists { id: ColumnId, backtrace: Backtrace },
#[snafu(display("Failed to build schema, source: {}", source))]
InvalidSchema {
source: datatypes::error::Error,
backtrace: Backtrace,
},
} }
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
@@ -27,6 +35,7 @@ pub type Result<T> = std::result::Result<T, Error>;
/// Implementation of [RegionMeta]. /// Implementation of [RegionMeta].
/// ///
/// Holds a snapshot of region metadata. /// Holds a snapshot of region metadata.
#[derive(Clone, Debug)]
pub struct RegionMetaImpl { pub struct RegionMetaImpl {
metadata: RegionMetadataRef, metadata: RegionMetadataRef,
} }
@@ -48,8 +57,9 @@ pub type VersionNumber = u32;
// TODO(yingwen): Make some fields of metadata private. // TODO(yingwen): Make some fields of metadata private.
/// In memory metadata of region. /// In memory metadata of region.
#[derive(Clone, Debug)] #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct RegionMetadata { pub struct RegionMetadata {
pub id: RegionId,
/// Schema of the region. /// Schema of the region.
/// ///
/// Holding a [SchemaRef] to allow converting into `SchemaRef`/`arrow::SchemaRef` /// Holding a [SchemaRef] to allow converting into `SchemaRef`/`arrow::SchemaRef`
@@ -66,13 +76,13 @@ pub struct RegionMetadata {
pub type RegionMetadataRef = Arc<RegionMetadata>; pub type RegionMetadataRef = Arc<RegionMetadata>;
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ColumnMetadata { pub struct ColumnMetadata {
pub cf_id: ColumnFamilyId, pub cf_id: ColumnFamilyId,
pub desc: ColumnDescriptor, pub desc: ColumnDescriptor,
} }
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ColumnsMetadata {
/// All columns, in `(key columns, timestamp, [version,] value columns)` order.
///
@@ -82,7 +92,7 @@ pub struct ColumnsMetadata {
pub name_to_col_index: HashMap<String, usize>,
}
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct RowKeyMetadata {
/// Exclusive end index of row key columns.
row_key_end: usize,
@@ -93,7 +103,7 @@ pub struct RowKeyMetadata {
pub enable_version_column: bool,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct ColumnsRowKeyMetadata {
columns: ColumnsMetadata,
row_key: RowKeyMetadata,
@@ -121,7 +131,7 @@ impl ColumnsRowKeyMetadata {
pub type ColumnsRowKeyMetadataRef = Arc<ColumnsRowKeyMetadata>;
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnFamiliesMetadata {
/// Map column family id to column family metadata.
id_to_cfs: HashMap<ColumnFamilyId, ColumnFamilyMetadata>,
@@ -133,7 +143,7 @@ impl ColumnFamiliesMetadata {
}
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnFamilyMetadata {
/// Column family name.
pub name: String,
@@ -151,18 +161,20 @@ impl TryFrom<RegionDescriptor> for RegionMetadata {
// Doesn't set version explicitly here, because this is a new region meta
// created from descriptor, using initial version is reasonable.
let mut builder = RegionMetadataBuilder::new()
.id(desc.id)
.row_key(desc.row_key)?
.add_column_family(desc.default_cf)?;
for cf in desc.extra_cfs {
builder = builder.add_column_family(cf)?;
}
builder.build()
}
}
#[derive(Default)]
struct RegionMetadataBuilder {
id: RegionId,
columns: Vec<ColumnMetadata>,
column_schemas: Vec<ColumnSchema>,
name_to_col_index: HashMap<String, usize>,
@@ -178,6 +190,11 @@ impl RegionMetadataBuilder {
RegionMetadataBuilder::default()
}
fn id(mut self, id: RegionId) -> Self {
self.id = id;
self
}
fn row_key(mut self, key: RowKeyDescriptor) -> Result<Self> {
for col in key.columns {
self.push_row_key_column(col)?;
@@ -234,8 +251,15 @@ impl RegionMetadataBuilder {
Ok(self)
}
fn build(self) -> Result<RegionMetadata> {
let schema = if self.column_schemas.is_empty() {
Arc::new(Schema::new(self.column_schemas))
} else {
Arc::new(
Schema::with_timestamp_index(self.column_schemas, self.row_key.timestamp_key_index)
.context(InvalidSchemaSnafu)?,
)
};
let columns = ColumnsMetadata {
columns: self.columns,
name_to_col_index: self.name_to_col_index,
@@ -245,14 +269,15 @@ impl RegionMetadataBuilder {
row_key: self.row_key,
});
Ok(RegionMetadata {
id: self.id,
schema,
columns_row_key,
column_families: ColumnFamiliesMetadata {
id_to_cfs: self.id_to_cfs,
},
version: 0,
})
}
// Helper methods:
@@ -308,17 +333,20 @@ mod tests {
#[test]
fn test_descriptor_to_region_metadata() {
let desc = RegionDescBuilder::new("region-0")
.timestamp(("ts", LogicalTypeId::Int64, false))
.enable_version_column(false)
.push_key_column(("k1", LogicalTypeId::Int32, false))
.push_value_column(("v1", LogicalTypeId::Float32, true))
.build();
let expect_schema = schema_util::new_schema_ref(
&[
("k1", LogicalTypeId::Int32, false),
("ts", LogicalTypeId::Int64, false),
("v1", LogicalTypeId::Float32, true),
],
Some(1),
);
let metadata = RegionMetadata::try_from(desc).unwrap();
assert_eq!(expect_schema, metadata.schema);
@@ -328,7 +356,7 @@ mod tests {
#[test]
fn test_build_empty_region_metadata() {
let metadata = RegionMetadataBuilder::default().build().unwrap();
assert!(metadata.schema.column_schemas().is_empty());
assert!(metadata.columns_row_key.columns.columns.is_empty());
@@ -373,17 +401,21 @@ mod tests {
.add_column_family(cf)
.unwrap()
.build()
.unwrap()
}
#[test]
fn test_build_metedata_disable_version() {
let metadata = new_metadata(false);
let expect_schema = schema_util::new_schema_ref(
&[
("k1", LogicalTypeId::Int64, false),
("ts", LogicalTypeId::Int64, false),
("v1", LogicalTypeId::Int64, true),
],
Some(1),
);
assert_eq!(expect_schema, metadata.schema);
@@ -422,12 +454,15 @@ mod tests {
fn test_build_metedata_enable_version() {
let metadata = new_metadata(true);
let expect_schema = schema_util::new_schema_ref(
&[
("k1", LogicalTypeId::Int64, false),
("ts", LogicalTypeId::Int64, false),
(consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
("v1", LogicalTypeId::Int64, true),
],
Some(1),
);
assert_eq!(expect_schema, metadata.schema);

src/storage/src/proto.rs (new file)

@@ -0,0 +1,43 @@
#![allow(clippy::all)]
tonic::include_proto!("greptime.storage.wal.v1");
use crate::write_batch::{Mutation, WriteBatch};
pub fn gen_mutation_extras(write_batch: &WriteBatch) -> Vec<MutationExtra> {
let column_schemas = write_batch.schema().column_schemas();
write_batch
.iter()
.map(|m| match m {
Mutation::Put(put) => {
if put.num_columns() == column_schemas.len() {
MutationExtra {
mutation_type: MutationType::Put.into(),
column_null_mask: Default::default(),
}
} else {
let mut column_null_mask =
bit_vec::BitVec::from_elem(column_schemas.len(), false);
for (i, cs) in column_schemas.iter().enumerate() {
if put.column_by_name(&cs.name).is_none() {
column_null_mask.set(i, true);
}
}
MutationExtra {
mutation_type: MutationType::Put.into(),
column_null_mask: column_null_mask.to_bytes(),
}
}
}
})
.collect::<Vec<_>>()
}
impl WalHeader {
pub fn with_last_manifest_version(last_manifest_version: u64) -> Self {
Self {
last_manifest_version,
..Default::default()
}
}
}
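For orientation, a minimal sketch (not part of the patch) of what the column_null_mask above encodes: one bit per column of the region schema, set when a Put mutation omits that column. The schema and column names here are made up; only the bit-vec calls mirror gen_mutation_extras.
fn example_null_mask() -> Vec<u8> {
    // Hypothetical region schema and a put that only wrote "ts" and "v1".
    let schema = ["ts", "k1", "v1"];
    let written = ["ts", "v1"];
    let mut column_null_mask = bit_vec::BitVec::from_elem(schema.len(), false);
    for (i, name) in schema.iter().enumerate() {
        if !written.contains(name) {
            // Bit i is set because column i is missing from this mutation.
            column_null_mask.set(i, true);
        }
    }
    // This is the byte form stored in MutationExtra::column_null_mask.
    column_null_mask.to_bytes()
}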


@@ -6,32 +6,44 @@ use std::sync::Arc;
use async_trait::async_trait;
use snafu::ensure;
use store_api::logstore::LogStore;
use store_api::storage::{ReadContext, Region, RegionId, RegionMeta, WriteContext, WriteResponse};
use crate::background::JobPoolImpl;
use crate::error::{self, Error, Result};
use crate::flush::{FlushSchedulerImpl, FlushSchedulerRef, FlushStrategyRef, SizeBasedStrategy};
use crate::manifest::region::RegionManifest;
use crate::memtable::{DefaultMemtableBuilder, MemtableVersion};
use crate::metadata::{RegionMetaImpl, RegionMetadata};
pub use crate::region::writer::{RegionWriter, RegionWriterRef, WriterContext};
use crate::snapshot::SnapshotImpl;
use crate::sst::AccessLayerRef;
use crate::version::{VersionControl, VersionControlRef};
use crate::wal::Wal;
use crate::write_batch::WriteBatch;
/// [Region] implementation.
pub struct RegionImpl<S: LogStore> {
inner: Arc<RegionInner<S>>,
}
impl<S: LogStore> Clone for RegionImpl<S> {
fn clone(&self) -> Self {
Self {
inner: self.inner.clone(),
}
}
}
#[async_trait]
impl<S: LogStore> Region for RegionImpl<S> {
type Error = Error;
type Meta = RegionMetaImpl;
type WriteRequest = WriteBatch;
type Snapshot = SnapshotImpl;
fn name(&self) -> &str {
&self.inner.shared.name
}
fn in_memory_metadata(&self) -> RegionMetaImpl {
@@ -47,61 +59,109 @@ impl Region for RegionImpl {
}
}
impl<S: LogStore> RegionImpl<S> {
pub fn new(
id: RegionId,
name: String,
metadata: RegionMetadata,
wal: Wal<S>,
sst_layer: AccessLayerRef,
manifest: RegionManifest,
) -> RegionImpl<S> {
let memtable_builder = Arc::new(DefaultMemtableBuilder {});
let memtable_version = MemtableVersion::new();
// TODO(yingwen): Pass flush scheduler to `RegionImpl::new`.
let job_pool = Arc::new(JobPoolImpl {});
let flush_scheduler = Arc::new(FlushSchedulerImpl::new(job_pool));
let version_control = VersionControl::new(metadata, memtable_version);
let inner = Arc::new(RegionInner {
shared: Arc::new(SharedData {
id,
name,
version_control: Arc::new(version_control),
}),
writer: Arc::new(RegionWriter::new(memtable_builder)),
wal,
flush_strategy: Arc::new(SizeBasedStrategy::default()),
flush_scheduler,
sst_layer,
manifest,
});
RegionImpl { inner }
}
}
// Private methods for tests.
#[cfg(test)]
impl<S: LogStore> RegionImpl<S> {
#[inline]
fn committed_sequence(&self) -> store_api::storage::SequenceNumber {
self.inner.version_control().committed_sequence()
}
}
/// Shared data of region.
pub struct SharedData {
pub id: RegionId,
pub name: String,
// TODO(yingwen): Maybe no need to use Arc for version control.
pub version_control: VersionControlRef,
}
pub type SharedDataRef = Arc<SharedData>;
struct RegionInner<S: LogStore> {
shared: SharedDataRef,
writer: RegionWriterRef,
wal: Wal<S>,
flush_strategy: FlushStrategyRef,
flush_scheduler: FlushSchedulerRef,
sst_layer: AccessLayerRef,
manifest: RegionManifest,
}
impl<S: LogStore> RegionInner<S> {
#[inline]
fn version_control(&self) -> &VersionControl {
&*self.shared.version_control
}
fn in_memory_metadata(&self) -> RegionMetaImpl {
let metadata = self.version_control().metadata();
RegionMetaImpl::new(metadata)
}
fn create_snapshot(&self) -> SnapshotImpl {
let version = self.version_control().current();
let sequence = self.version_control().committed_sequence();
SnapshotImpl::new(version, sequence)
}
async fn write(&self, ctx: &WriteContext, request: WriteBatch) -> Result<WriteResponse> {
let metadata = self.in_memory_metadata();
let schema = metadata.schema();
// Only compare column schemas.
ensure!(
schema.column_schemas() == request.schema().column_schemas(),
error::InvalidInputSchemaSnafu {
region: &self.shared.name
}
);
let writer_ctx = WriterContext {
shared: &self.shared,
flush_strategy: &self.flush_strategy,
flush_scheduler: &self.flush_scheduler,
sst_layer: &self.sst_layer,
wal: &self.wal,
writer: &self.writer,
manifest: &self.manifest,
};
// Now altering schema is not allowed, so it is safe to validate schema outside of the lock.
self.writer.write(ctx, request, writer_ctx).await
}
}


@@ -3,28 +3,58 @@
mod read_write;
use datatypes::type_id::LogicalTypeId;
use log_store::fs::noop::NoopLogStore;
use object_store::{backend::fs::Backend, ObjectStore};
use store_api::manifest::Manifest;
use store_api::storage::consts;
use tempdir::TempDir;
use super::*;
use crate::manifest::region::RegionManifest;
use crate::sst::FsAccessLayer;
use crate::test_util::{self, descriptor_util::RegionDescBuilder, schema_util};
#[tokio::test]
async fn test_new_region() {
let region_id = 0;
let region_name = "region-0";
let desc = RegionDescBuilder::new(region_name)
.enable_version_column(true)
.push_key_column(("k1", LogicalTypeId::Int32, false))
.push_value_column(("v1", LogicalTypeId::Float32, true))
.build();
let metadata = desc.try_into().unwrap();
let wal = Wal::new(region_id, region_name, Arc::new(NoopLogStore::default()));
let store_dir = TempDir::new("test_new_region")
.unwrap()
.path()
.to_string_lossy()
.to_string();
let accessor = Backend::build().root(&store_dir).finish().await.unwrap();
let object_store = ObjectStore::new(accessor);
let sst_layer = Arc::new(FsAccessLayer::new("/", object_store.clone()));
let manifest = RegionManifest::new(region_id, "/manifest/", object_store);
let region = RegionImpl::new(
region_id,
region_name.to_string(),
metadata,
wal,
sst_layer,
manifest,
);
let expect_schema = schema_util::new_schema_ref(
&[
("k1", LogicalTypeId::Int32, false),
(test_util::TIMESTAMP_NAME, LogicalTypeId::Int64, false),
(consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
("v1", LogicalTypeId::Float32, true),
],
Some(1),
);
assert_eq!(region_name, region.name());
assert_eq!(expect_schema, *region.in_memory_metadata().schema());


@@ -5,39 +5,71 @@ use std::sync::Arc;
use datatypes::prelude::*;
use datatypes::type_id::LogicalTypeId;
use datatypes::vectors::Int64Vector;
use log_store::fs::noop::NoopLogStore;
use object_store::{backend::fs::Backend, ObjectStore};
use store_api::manifest::Manifest;
use store_api::storage::{
consts, Chunk, ChunkReader, PutOperation, ReadContext, Region, RegionMeta, ScanRequest,
SequenceNumber, Snapshot, WriteContext, WriteRequest, WriteResponse,
};
use tempdir::TempDir;
use crate::manifest::region::RegionManifest;
use crate::region::RegionImpl;
use crate::sst::FsAccessLayer;
use crate::test_util::{self, descriptor_util::RegionDescBuilder, write_batch_util};
use crate::wal::Wal;
use crate::write_batch::{PutData, WriteBatch};
/// Create a new region for read/write test
async fn new_region_for_rw(
store_dir: &str,
enable_version_column: bool,
) -> RegionImpl<NoopLogStore> {
let region_id = 0;
let region_name = "region-rw-0";
let sst_dir = format!("{}/{}/", store_dir, region_name);
let manifest_dir = format!("{}/{}/maniffest/", store_dir, region_name);
let desc = RegionDescBuilder::new(region_name)
.enable_version_column(enable_version_column)
.push_value_column(("v1", LogicalTypeId::Int64, true))
.build();
let metadata = desc.try_into().unwrap();
let wal = Wal::new(region_id, region_name, Arc::new(NoopLogStore::default()));
let accessor = Backend::build().root(store_dir).finish().await.unwrap();
let object_store = ObjectStore::new(accessor);
let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, object_store.clone()));
let manifest = RegionManifest::new(region_id, &manifest_dir, object_store);
RegionImpl::new(
region_id,
region_name.to_string(),
metadata,
wal,
sst_layer,
manifest,
)
}
fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch {
if enable_version_column {
write_batch_util::new_write_batch(
&[
(test_util::TIMESTAMP_NAME, LogicalTypeId::Int64, false),
(consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
("v1", LogicalTypeId::Int64, true),
],
Some(0),
)
} else {
write_batch_util::new_write_batch(
&[
(test_util::TIMESTAMP_NAME, LogicalTypeId::Int64, false),
("v1", LogicalTypeId::Int64, true),
],
Some(0),
)
}
}
@@ -73,20 +105,14 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option<i64>)>) {
/// Test region without considering version column.
struct Tester {
region: RegionImpl<NoopLogStore>,
write_ctx: WriteContext,
read_ctx: ReadContext,
}
impl Tester {
async fn new(store_dir: &str) -> Tester {
let region = new_region_for_rw(store_dir, false).await;
Tester {
region,
@@ -134,7 +160,9 @@ impl Tester {
#[tokio::test]
async fn test_simple_put_scan() {
let dir = TempDir::new("write_parquet").unwrap();
let store_dir = dir.path().to_str().unwrap();
let tester = Tester::new(store_dir).await;
let data = vec![
(1000, Some(100)),
@@ -151,7 +179,9 @@ async fn test_simple_put_scan() {
}
#[tokio::test]
async fn test_sequence_increase() {
let dir = TempDir::new("write_parquet").unwrap();
let store_dir = dir.path().to_str().unwrap();
let tester = Tester::new(store_dir).await;
let mut committed_sequence = tester.committed_sequence();
for i in 0..100 {


@@ -1,46 +1,291 @@
use std::sync::Arc;
use common_telemetry::logging;
use common_time::RangeMillis;
use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::storage::{SequenceNumber, WriteContext, WriteRequest, WriteResponse};
use tokio::sync::Mutex;
use crate::background::JobHandle;
use crate::error::{InvalidTimestampSnafu, Result};
use crate::flush::{FlushJob, FlushSchedulerRef, FlushStrategyRef};
use crate::memtable::{Inserter, MemtableBuilderRef, MemtableId, MemtableSet};
use crate::proto::WalHeader;
use crate::region::RegionManifest;
use crate::region::SharedDataRef;
use crate::sst::AccessLayerRef;
use crate::version::{VersionControlRef, VersionEdit};
use crate::wal::{Payload, Wal};
use crate::write_batch::WriteBatch;
pub type RegionWriterRef = Arc<RegionWriter>;
pub struct RegionWriter {
inner: Mutex<WriterInner>,
}
impl RegionWriter {
pub fn new(memtable_builder: MemtableBuilderRef) -> RegionWriter {
RegionWriter {
inner: Mutex::new(WriterInner::new(memtable_builder)),
}
}
pub async fn write<S: LogStore>(
&self,
ctx: &WriteContext,
request: WriteBatch,
writer_ctx: WriterContext<'_, S>,
) -> Result<WriteResponse> {
let mut inner = self.inner.lock().await;
inner.write(ctx, request, writer_ctx).await
}
pub async fn apply_version_edit<S: LogStore>(
&self,
wal: &Wal<S>,
edit: VersionEdit,
shared: &SharedDataRef,
) -> Result<()> {
let mut inner = self.inner.lock().await;
inner.apply_version_edit(wal, edit, shared).await
}
}
pub struct WriterContext<'a, S: LogStore> {
pub shared: &'a SharedDataRef,
pub flush_strategy: &'a FlushStrategyRef,
pub flush_scheduler: &'a FlushSchedulerRef,
pub sst_layer: &'a AccessLayerRef,
pub wal: &'a Wal<S>,
pub writer: &'a RegionWriterRef,
pub manifest: &'a RegionManifest,
}
impl<'a, S: LogStore> WriterContext<'a, S> {
#[inline]
fn version_control(&self) -> &VersionControlRef {
&self.shared.version_control
}
}
struct WriterInner {
memtable_builder: MemtableBuilderRef,
last_memtable_id: MemtableId,
flush_handle: Option<JobHandle>,
}
impl WriterInner {
fn new(memtable_builder: MemtableBuilderRef) -> WriterInner {
WriterInner {
memtable_builder,
last_memtable_id: 0,
flush_handle: None,
}
}
// TODO(yingwen): Support group commit so we can avoid taking mutable reference.
/// Write `WriteBatch` to region, now the schema of batch needs to be validated outside.
///
/// Mutable reference of writer ensures no other reference of this writer can modify the
/// version control (write is exclusive).
async fn write<S: LogStore>(
&mut self,
_ctx: &WriteContext,
request: WriteBatch,
writer_ctx: WriterContext<'_, S>,
) -> Result<WriteResponse> {
let time_ranges = self.preprocess_write(&request, &writer_ctx).await?;
// TODO(yingwen): Write wal and get sequence.
let version_control = writer_ctx.version_control();
let version = version_control.current();
let committed_sequence = version_control.committed_sequence();
// Sequence for current write batch.
let next_sequence = committed_sequence + 1;
let wal_header = WalHeader::with_last_manifest_version(version.manifest_version());
writer_ctx
.wal
.write_to_wal(
next_sequence,
wal_header,
Payload::WriteBatchArrow(&request),
)
.await?;
// Insert batch into memtable.
let mut inserter = Inserter::new(next_sequence, time_ranges, version.bucket_duration());
inserter.insert_memtables(&request, version.mutable_memtables())?;
// Update committed_sequence to make current batch visible. The `&mut self` of WriterInner
// guarantees the writer is exclusive.
version_control.set_committed_sequence(next_sequence);
Ok(WriteResponse {})
}
/// Preprocess before write.
///
/// Creates needed mutable memtables, ensures there is enough capacity in the memtables and triggers
/// a flush if necessary. Returns time ranges of the input write batch.
async fn preprocess_write<S: LogStore>(
&mut self,
request: &WriteBatch,
writer_ctx: &WriterContext<'_, S>,
) -> Result<Vec<RangeMillis>> {
let version_control = writer_ctx.version_control();
// Check whether memtable is full or flush should be triggered. We need to do this first since
// switching memtables will clear all mutable memtables.
if self.should_flush(
writer_ctx.shared,
version_control,
writer_ctx.flush_strategy,
) {
self.trigger_flush(
writer_ctx.shared,
writer_ctx.flush_scheduler,
writer_ctx.sst_layer,
writer_ctx.writer,
writer_ctx.wal,
writer_ctx.manifest,
)
.await?;
}
let current_version = version_control.current();
let duration = current_version.bucket_duration();
let time_ranges = request
.time_ranges(duration)
.context(InvalidTimestampSnafu)?;
let mutable = current_version.mutable_memtables();
let mut memtables_to_add = MemtableSet::default();
// Pre-create all needed mutable memtables.
for range in &time_ranges {
if mutable.get_by_range(range).is_none()
&& memtables_to_add.get_by_range(range).is_none()
{
// Memtable for this range is missing, need to create a new memtable.
let memtable_schema = current_version.memtable_schema();
let id = self.alloc_memtable_id();
let memtable = self.memtable_builder.build(id, memtable_schema);
memtables_to_add.insert(*range, memtable);
}
}
if !memtables_to_add.is_empty() {
version_control.add_mutable(memtables_to_add);
}
Ok(time_ranges)
}
fn should_flush(
&self,
shared: &SharedDataRef,
version_control: &VersionControlRef,
flush_strategy: &FlushStrategyRef,
) -> bool {
let current = version_control.current();
let memtables = current.memtables();
let mutable_bytes_allocated = memtables.mutable_bytes_allocated();
let total_bytes_allocated = memtables.total_bytes_allocated();
flush_strategy.should_flush(shared, mutable_bytes_allocated, total_bytes_allocated)
}
async fn trigger_flush<S: LogStore>(
&mut self,
shared: &SharedDataRef,
flush_scheduler: &FlushSchedulerRef,
sst_layer: &AccessLayerRef,
writer: &RegionWriterRef,
wal: &Wal<S>,
manifest: &RegionManifest,
) -> Result<()> {
let version_control = &shared.version_control;
// Freeze all mutable memtables so we can flush them later.
version_control.freeze_mutable();
if let Some(flush_handle) = self.flush_handle.take() {
// Previous flush job is incomplete, wait until it is finished (write stall).
// However the last flush job may fail, in which case, we just return error
// and abort current write request. The flush handle is left empty, so the next
// time we still have chance to trigger a new flush.
flush_handle.join().await.map_err(|e| {
logging::error!(
"Previous flush job failed, region: {}, err: {}",
shared.name,
e
);
e
})?;
}
let current_version = version_control.current();
let (max_memtable_id, mem_to_flush) = current_version.memtables().memtables_to_flush();
if max_memtable_id.is_none() {
logging::info!("No memtables to flush in region: {}", shared.name);
return Ok(());
}
let flush_req = FlushJob {
max_memtable_id: max_memtable_id.unwrap(),
memtables: mem_to_flush,
// In write thread, safe to use current committed sequence.
flush_sequence: version_control.committed_sequence(),
shared: shared.clone(),
sst_layer: sst_layer.clone(),
writer: writer.clone(),
wal: wal.clone(),
manifest: manifest.clone(),
};
let flush_handle = flush_scheduler.schedule_flush(Box::new(flush_req)).await?;
self.flush_handle = Some(flush_handle);
Ok(())
}
async fn apply_version_edit<S: LogStore>(
&mut self,
wal: &Wal<S>,
edit: VersionEdit,
shared: &SharedDataRef,
) -> Result<()> {
let version_control = &shared.version_control;
let next_sequence = version_control.committed_sequence() + 1;
self.persist_manifest_version(wal, next_sequence, &edit)
.await?;
version_control.apply_edit(edit);
version_control.set_committed_sequence(next_sequence);
Ok(())
}
async fn persist_manifest_version<S: LogStore>(
&self,
wal: &Wal<S>,
seq: SequenceNumber,
edit: &VersionEdit,
) -> Result<()> {
let header = WalHeader::with_last_manifest_version(edit.manifest_version);
wal.write_to_wal(seq, header, Payload::None).await?;
Ok(())
}
#[inline]
fn alloc_memtable_id(&mut self) -> MemtableId {
self.last_memtable_id += 1;
self.last_memtable_id
}
}


@@ -33,13 +33,34 @@ impl Snapshot for SnapshotImpl {
request: ScanRequest,
) -> Result<ScanResponse<ChunkReaderImpl>> {
let visible_sequence = self.sequence_to_read(request.sequence);
let memtable_version = self.version.memtables();
let mutables = memtable_version.mutable_memtables();
let immutables = memtable_version.immutable_memtables();
let mut batch_iters = Vec::with_capacity(memtable_version.num_memtables());
let iter_ctx = IterContext {
batch_size: ctx.batch_size,
visible_sequence,
..Default::default()
};
for (_range, mem) in mutables.iter() {
let iter = mem.iter(iter_ctx.clone())?;
batch_iters.push(iter);
}
for mem_set in immutables {
for (_range, mem) in mem_set.iter() {
let iter = mem.iter(iter_ctx.clone())?;
batch_iters.push(iter);
}
}
// Now we just chain all iterators together, ignoring duplications/ordering.
let iter = Box::new(batch_iters.into_iter().flatten());
let reader = ChunkReaderImpl::new(self.version.schema().clone(), iter);

src/storage/src/sst.rs (new file)

@@ -0,0 +1,172 @@
mod parquet;
use std::sync::Arc;
use async_trait::async_trait;
use object_store::{util, ObjectStore};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::memtable::BatchIteratorPtr;
use crate::sst::parquet::ParquetWriter;
/// Maximum level of ssts.
pub const MAX_LEVEL: usize = 1;
// We only have a fixed number of levels, so we use an array to hold the elements. This
// implementation detail of LevelMetaVec should not be exposed to the user of [LevelMetas].
type LevelMetaVec = [LevelMeta; MAX_LEVEL];
/// Metadata of all ssts under a region.
///
/// Files are organized into multiple levels, though there may be only one level.
#[derive(Debug, Clone)]
pub struct LevelMetas {
levels: LevelMetaVec,
}
impl LevelMetas {
/// Create a new LevelMetas and initialized each level.
pub fn new() -> LevelMetas {
LevelMetas {
levels: [LevelMeta::default(); MAX_LEVEL],
}
}
/// Merge `self` with files to add/remove to create a new [LevelMetas].
///
/// # Panics
/// Panics if level of [FileHandle] is greater than [MAX_LEVEL].
pub fn merge(&self, files_to_add: impl Iterator<Item = FileHandle>) -> LevelMetas {
let mut merged = self.clone();
for file in files_to_add {
let level = file.level_index();
merged.levels[level].add_file(file);
}
// TODO(yingwen): Support file removal.
merged
}
}
impl Default for LevelMetas {
fn default() -> LevelMetas {
LevelMetas::new()
}
}
/// Metadata of files in same sst level.
#[derive(Debug, Default, Clone)]
pub struct LevelMeta {
/// Handles to the files in this level.
// TODO(yingwen): Now for simplicity, files are unordered, maybe sort the files by time range
// or use another structure to hold them.
files: Vec<FileHandle>,
}
impl LevelMeta {
fn add_file(&mut self, file: FileHandle) {
self.files.push(file);
}
}
/// In-memory handle to a file.
#[derive(Debug, Clone)]
pub struct FileHandle {
inner: Arc<FileHandleInner>,
}
impl FileHandle {
pub fn new(meta: FileMeta) -> FileHandle {
FileHandle {
inner: Arc::new(FileHandleInner::new(meta)),
}
}
/// Returns level as usize so it can be used as index.
#[inline]
pub fn level_index(&self) -> usize {
self.inner.meta.level.into()
}
}
/// Actually data of [FileHandle].
///
/// Contains meta of the file, and other mutable info like metrics.
#[derive(Debug)]
struct FileHandleInner {
meta: FileMeta,
}
impl FileHandleInner {
fn new(meta: FileMeta) -> FileHandleInner {
FileHandleInner { meta }
}
}
/// Immutable metadata of a sst file.
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct FileMeta {
pub file_path: String,
/// SST level of the file.
pub level: u8,
}
#[derive(Debug, Default)]
pub struct WriteOptions {
// TODO(yingwen): [flush] row group size.
}
/// Sst access layer.
#[async_trait]
pub trait AccessLayer: Send + Sync {
// Writes SST file with given name and returns the full path.
async fn write_sst(
&self,
file_name: &str,
iter: BatchIteratorPtr,
opts: WriteOptions,
) -> Result<String>;
}
pub type AccessLayerRef = Arc<dyn AccessLayer>;
/// Sst access layer based on local file system.
pub struct FsAccessLayer {
sst_dir: String,
object_store: ObjectStore,
}
impl FsAccessLayer {
pub fn new(sst_dir: &str, object_store: ObjectStore) -> FsAccessLayer {
FsAccessLayer {
sst_dir: util::normalize_dir(sst_dir),
object_store,
}
}
#[inline]
fn sst_file_path(&self, file_name: &str) -> String {
format!("{}{}", self.sst_dir, file_name)
}
}
#[async_trait]
impl AccessLayer for FsAccessLayer {
async fn write_sst(
&self,
file_name: &str,
iter: BatchIteratorPtr,
opts: WriteOptions,
) -> Result<String> {
// Now we only support the parquet format. We may allow the caller to specify the sst format
// in WriteOptions in the future.
let file_path = self.sst_file_path(file_name);
let writer = ParquetWriter::new(&file_path, iter, self.object_store.clone());
writer.write_sst(opts).await?;
Ok(file_path)
}
}
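A short, hypothetical illustration of how the level metadata above is meant to be used: merge() folds freshly flushed file metas into a new LevelMetas snapshot without mutating the original, which is what lets versions share SST state cheaply. The file path here is invented.
fn example_merge_levels() -> LevelMetas {
    // Hypothetical file meta for an SST produced by a flush.
    let flushed = vec![FileMeta {
        file_path: "region-0/test-flush.parquet".to_string(),
        level: 0,
    }];
    let base = LevelMetas::new();
    // merge() returns a new LevelMetas; `base` itself is left untouched.
    base.merge(flushed.into_iter().map(FileHandle::new))
}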


@@ -0,0 +1,263 @@
//! Parquet sst format.
use std::collections::HashMap;
use datatypes::arrow::chunk::Chunk;
use datatypes::arrow::datatypes::{DataType, Field, Schema};
use datatypes::arrow::io::parquet::write::{
Compression, Encoding, FileSink, Version, WriteOptions,
};
use datatypes::prelude::{ConcreteDataType, Vector};
use datatypes::schema::ColumnSchema;
use futures_util::sink::SinkExt;
use object_store::ObjectStore;
use snafu::ResultExt;
use store_api::storage::consts;
use crate::error::{FlushIoSnafu, Result, WriteParquetSnafu};
use crate::memtable::{BatchIteratorPtr, MemtableSchema};
use crate::metadata::ColumnMetadata;
use crate::sst;
/// Parquet sst writer.
pub struct ParquetWriter<'a> {
file_name: &'a str,
iter: BatchIteratorPtr,
object_store: ObjectStore,
}
impl<'a> ParquetWriter<'a> {
pub fn new(
file_name: &'a str,
iter: BatchIteratorPtr,
object_store: ObjectStore,
) -> ParquetWriter {
ParquetWriter {
file_name,
iter,
object_store,
}
}
pub async fn write_sst(self, _opts: sst::WriteOptions) -> Result<()> {
self.write_rows(None).await
}
/// Iterates memtable and writes rows to Parquet file.
/// A chunk of records yielded from each iteration with a size given
/// in config will be written to a single row group.
async fn write_rows(self, extra_meta: Option<HashMap<String, String>>) -> Result<()> {
let schema = memtable_schema_to_arrow_schema(self.iter.schema());
let object = self.object_store.object(self.file_name);
// FIXME(hl): writer size is not used in fs backend so just leave it to 0,
// but in s3/azblob backend the Content-Length field of HTTP request is set
// to this value.
let writer = object.writer(0).await.context(FlushIoSnafu)?;
// Now all physical types use plain encoding; we may let the caller choose the encoding for each type later.
let encodings = get_encoding_for_schema(&schema, |_| Encoding::Plain);
let mut sink = FileSink::try_new(
writer,
schema,
encodings,
WriteOptions {
write_statistics: true,
compression: Compression::Gzip,
version: Version::V2,
},
)
.context(WriteParquetSnafu)?;
for batch in self.iter {
let batch = batch?;
sink.send(Chunk::new(
batch
.keys
.iter()
.map(|v| v.to_arrow_array())
.chain(std::iter::once(batch.sequences.to_arrow_array()))
.chain(std::iter::once(batch.value_types.to_arrow_array()))
.chain(batch.values.iter().map(|v| v.to_arrow_array()))
.collect(),
))
.await
.context(WriteParquetSnafu)?;
}
if let Some(meta) = extra_meta {
for (k, v) in meta {
sink.metadata.insert(k, Some(v));
}
}
sink.close().await.context(WriteParquetSnafu)
}
}
/// Assembles arrow schema from memtable schema info.
fn memtable_schema_to_arrow_schema(schema: &MemtableSchema) -> Schema {
let col_meta_to_field: fn(&ColumnMetadata) -> Field = |col_meta| {
Field::from(&ColumnSchema::new(
col_meta.desc.name.clone(),
col_meta.desc.data_type.clone(),
col_meta.desc.is_nullable,
))
};
let fields = schema
.row_key_columns()
.map(col_meta_to_field)
.chain(std::iter::once(Field::from(&ColumnSchema::new(
consts::SEQUENCE_COLUMN_NAME,
ConcreteDataType::uint64_datatype(),
false,
))))
.chain(std::iter::once(Field::from(&ColumnSchema::new(
consts::VALUE_TYPE_COLUMN_NAME,
ConcreteDataType::uint8_datatype(),
false,
))))
.chain(schema.value_columns().map(col_meta_to_field))
.collect::<Vec<_>>();
Schema::from(fields)
}
fn get_encoding_for_schema<F: Fn(&DataType) -> Encoding + Clone>(
schema: &Schema,
map: F,
) -> Vec<Encoding> {
schema
.fields
.iter()
.flat_map(|f| transverse(&f.data_type, map.clone()))
.collect()
}
// TODO(hl): backport from arrow2 v0.12 (https://github.com/jorgecarleitao/arrow2/blob/f57dbd5dbc61b940a71decd5f81d0fd4c93b158d/src/io/parquet/write/mod.rs#L454-L509)
// remove it when upgrading to a newer version
pub fn transverse<T, F: Fn(&DataType) -> T + Clone>(data_type: &DataType, map: F) -> Vec<T> {
let mut encodings = vec![];
transverse_recursive(data_type, map, &mut encodings);
encodings
}
fn transverse_recursive<T, F: Fn(&DataType) -> T + Clone>(
data_type: &DataType,
map: F,
encodings: &mut Vec<T>,
) {
use datatypes::arrow::datatypes::PhysicalType::*;
match data_type.to_physical_type() {
Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8
| Dictionary(_) | LargeUtf8 => encodings.push(map(data_type)),
List | FixedSizeList | LargeList => {
let a = data_type.to_logical_type();
if let DataType::List(inner) = a {
transverse_recursive(&inner.data_type, map, encodings)
} else if let DataType::LargeList(inner) = a {
transverse_recursive(&inner.data_type, map, encodings)
} else if let DataType::FixedSizeList(inner, _) = a {
transverse_recursive(&inner.data_type, map, encodings)
} else {
unreachable!()
}
}
Struct => {
if let DataType::Struct(fields) = data_type.to_logical_type() {
for field in fields {
transverse_recursive(&field.data_type, map.clone(), encodings)
}
} else {
unreachable!()
}
}
Union => todo!(),
Map => todo!(),
}
}
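A small illustration (not from the patch) of what transverse() returns for a nested column: it recurses to the leaf type, so a List<Int64> column still contributes exactly one encoding.
fn example_encodings_for_list_column() -> Vec<Encoding> {
    // Hypothetical nested column type: List<Int64>.
    let list = DataType::List(Box::new(Field::new("item", DataType::Int64, true)));
    // transverse() recurses into the inner field, yielding one encoding for the leaf.
    transverse(&list, |_| Encoding::Plain)
}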
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::arrow::array::{Array, Int64Array, UInt64Array, UInt8Array};
use datatypes::arrow::io::parquet::read::FileReader;
use object_store::backend::fs::Backend;
use store_api::storage::ValueType;
use tempdir::TempDir;
use super::*;
use crate::memtable::tests as memtable_tests;
use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder};
#[tokio::test]
async fn test_parquet_writer() {
let schema = memtable_tests::schema_for_test();
let memtable = DefaultMemtableBuilder {}.build(1, schema);
memtable_tests::write_kvs(
&*memtable,
10, // sequence
ValueType::Put,
&[
(1000, 1),
(1000, 2),
(2002, 1),
(2003, 1),
(2003, 5),
(1001, 1),
], // keys
&[Some(1), Some(2), Some(7), Some(8), Some(9), Some(3)], // values
);
let dir = TempDir::new("write_parquet").unwrap();
let path = dir.path().to_str().unwrap();
let backend = Backend::build().root(path).finish().await.unwrap();
let object_store = ObjectStore::new(backend);
let sst_file_name = "test-flush.parquet";
let iter = memtable.iter(IterContext::default()).unwrap();
let writer = ParquetWriter::new(sst_file_name, iter, object_store);
writer
.write_sst(sst::WriteOptions::default())
.await
.unwrap();
// verify parquet file
let reader = std::fs::File::open(dir.path().join(sst_file_name)).unwrap();
let mut file_reader = FileReader::try_new(reader, None, Some(128), None, None).unwrap();
// chunk schema: timestamp, __version, __sequence, __value_type, v1
let chunk = file_reader.next().unwrap().unwrap();
assert_eq!(5, chunk.arrays().len());
assert_eq!(
Arc::new(Int64Array::from_slice(&[
1000, 1000, 1001, 2002, 2003, 2003
])) as Arc<dyn Array>,
chunk.arrays()[0]
);
assert_eq!(
Arc::new(UInt64Array::from_slice(&[1, 2, 1, 1, 1, 5])) as Arc<dyn Array>,
chunk.arrays()[1]
);
assert_eq!(
Arc::new(UInt64Array::from_slice(&[10, 10, 10, 10, 10, 10])) as Arc<dyn Array>,
chunk.arrays()[2]
);
assert_eq!(
Arc::new(UInt8Array::from_slice(&[0, 0, 0, 0, 0, 0])) as Arc<dyn Array>,
chunk.arrays()[3]
);
assert_eq!(
Arc::new(UInt64Array::from_slice(&[1, 2, 3, 7, 8, 9])) as Arc<dyn Array>,
chunk.arrays()[4]
);
}
}


@@ -1,13 +1,14 @@
use datatypes::prelude::ConcreteDataType;
use store_api::storage::{
ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, ColumnId,
RegionDescriptor, RegionId, RowKeyDescriptorBuilder,
};
use crate::test_util::{self, schema_util::ColumnDef};
/// A RegionDescriptor builder for test.
pub struct RegionDescBuilder {
id: RegionId,
name: String,
last_column_id: ColumnId,
key_builder: RowKeyDescriptorBuilder,
@@ -27,6 +28,7 @@ impl RegionDescBuilder {
);
Self {
id: 0,
name: name.into(),
last_column_id: 2,
key_builder,
@@ -34,6 +36,11 @@ impl RegionDescBuilder {
}
}
pub fn id(mut self, id: RegionId) -> Self {
self.id = id;
self
}
// This will reset the row key builder, so should be called before `push_key_column()`
// and `enable_version_column()`, or just call after `new()`.
pub fn timestamp(mut self, column_def: ColumnDef) -> Self {
@@ -61,7 +68,7 @@ impl RegionDescBuilder {
pub fn build(self) -> RegionDescriptor {
RegionDescriptor {
id: self.id,
name: self.name,
row_key: self.key_builder.build(),
default_cf: self.default_cf_builder.build(),


@@ -6,7 +6,7 @@ use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
/// Column definition: (name, datatype, is_nullable)
pub type ColumnDef<'a> = (&'a str, LogicalTypeId, bool);
pub fn new_schema(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> Schema {
let column_schemas = column_defs
.iter()
.map(|column_def| {
@@ -15,9 +15,13 @@ pub fn new_schema(column_defs: &[ColumnDef]) -> Schema {
})
.collect();
if let Some(index) = timestamp_index {
Schema::with_timestamp_index(column_schemas, index).unwrap()
} else {
Schema::new(column_schemas)
}
}
pub fn new_schema_ref(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> SchemaRef {
Arc::new(new_schema(column_defs, timestamp_index))
}
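For reference, a hedged example of calling the updated helper with an explicit timestamp index, mirroring the call sites in the tests above; passing None falls back to Schema::new.
fn example_schema_with_timestamp() -> SchemaRef {
    // Index 1 marks "ts" as the timestamp column.
    new_schema_ref(
        &[
            ("k1", LogicalTypeId::Int64, false),
            ("ts", LogicalTypeId::Int64, false),
            ("v1", LogicalTypeId::Int64, true),
        ],
        Some(1),
    )
}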


@@ -3,8 +3,8 @@ use store_api::storage::WriteRequest;
use crate::test_util::schema_util::{self, ColumnDef};
use crate::write_batch::WriteBatch;
pub fn new_write_batch(column_defs: &[ColumnDef], timestamp_index: Option<usize>) -> WriteBatch {
let schema = schema_util::new_schema_ref(column_defs, timestamp_index);
WriteBatch::new(schema)
}


@@ -9,15 +9,26 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use store_api::manifest::ManifestVersion;
use store_api::storage::{SchemaRef, SequenceNumber};
use crate::memtable::{MemtableId, MemtableSchema, MemtableSet, MemtableVersion};
use crate::metadata::{RegionMetadata, RegionMetadataRef};
use crate::sst::LevelMetas;
use crate::sst::{FileHandle, FileMeta};
use crate::sync::CowCell;
/// Default bucket duration: 2 Hours.
const DEFAULT_BUCKET_DURATION: Duration = Duration::from_secs(3600 * 2);
/// Controls version of in memory state for a region.
pub struct VersionControl {
// TODO(yingwen): If all modification to version must acquire the region writer lock first,
// then we may just use ArcSwap to hold version. But some operations may only require the
// version lock, instead of the writer lock, since we can use the version lock to protect
// the read-modify-write of version.
version: CowCell<Version>,
/// Latest sequence that is committed and visible to user.
committed_sequence: AtomicU64,
@@ -25,7 +36,7 @@ pub struct VersionControl {
impl VersionControl {
/// Construct a new version control from `metadata`.
pub fn new(metadata: RegionMetadata, memtables: MemtableVersion) -> VersionControl {
VersionControl {
version: CowCell::new(Version::new(metadata, memtables)),
committed_sequence: AtomicU64::new(0),
@@ -58,34 +69,91 @@ impl VersionControl {
// Release ordering should be enough to guarantee sequence is updated at last.
self.committed_sequence.store(value, Ordering::Release);
}
/// Add mutable memtables and commit.
///
/// # Panics
/// See [MemtableVersion::add_mutable](MemtableVersion::add_mutable).
pub fn add_mutable(&self, memtables_to_add: MemtableSet) {
let mut version_to_update = self.version.lock();
let memtable_version = version_to_update.memtables();
let merged = memtable_version.add_mutable(memtables_to_add);
version_to_update.memtables = Arc::new(merged);
version_to_update.commit();
}
/// Freeze all mutable memtables.
pub fn freeze_mutable(&self) {
let mut version_to_update = self.version.lock();
let memtable_version = version_to_update.memtables();
let freezed = memtable_version.freeze_mutable();
version_to_update.memtables = Arc::new(freezed);
version_to_update.commit();
}
pub fn apply_edit(&self, edit: VersionEdit) {
let mut version_to_update = self.version.lock();
if let Some(max_memtable_id) = edit.max_memtable_id {
// Remove flushed memtables
let memtable_version = version_to_update.memtables();
let removed = memtable_version.remove_immutables(max_memtable_id);
version_to_update.memtables = Arc::new(removed);
}
version_to_update.apply_edit(edit);
version_to_update.commit();
}
}
#[derive(Debug)]
pub struct VersionEdit {
pub files_to_add: Vec<FileMeta>,
pub flushed_sequence: Option<SequenceNumber>,
pub manifest_version: ManifestVersion,
pub max_memtable_id: Option<MemtableId>,
}
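A hedged sketch (not in the patch) of how a finished flush could be folded back into the in-memory state through the VersionEdit/apply_edit pair defined above; the file path and numbers are invented.
fn example_apply_flush_result(version_control: &VersionControl) {
    // Hypothetical edit describing one flushed SST file.
    let edit = VersionEdit {
        files_to_add: vec![FileMeta {
            file_path: "region-0/flush-1.parquet".to_string(),
            level: 0,
        }],
        flushed_sequence: Some(42),
        manifest_version: 1,
        max_memtable_id: Some(1),
    };
    // Drops immutable memtables flushed up to max_memtable_id and merges the
    // new file into the SST levels of the current Version.
    version_control.apply_edit(edit);
}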
pub type VersionControlRef = Arc<VersionControl>;
pub type VersionRef = Arc<Version>;
type MemtableVersionRef = Arc<MemtableVersion>;
type LevelMetasRef = Arc<LevelMetas>;
/// Version contains metadata and state of region.
#[derive(Clone)]
pub struct Version {
/// Metadata of the region.
///
/// Altering metadata isn't frequent, storing metadata in Arc to allow sharing
/// metadata and reuse metadata when creating a new `Version`.
metadata: RegionMetadataRef,
/// Mutable and immutable memtables.
///
/// Wrapped in Arc to make clone of `Version` much cheaper.
memtables: MemtableVersionRef,
/// SSTs of the region.
ssts: LevelMetasRef,
/// Inclusive max sequence of flushed data.
flushed_sequence: SequenceNumber,
/// Current version of manifest.
manifest_version: ManifestVersion,
// TODO(yingwen): Maybe also store last sequence to this version when switching
// version, so we can know the newest data can read from this version.
}
impl Version {
pub fn new(metadata: RegionMetadata, memtables: MemtableVersion) -> Version {
Version {
metadata: Arc::new(metadata),
memtables: Arc::new(memtables),
ssts: Arc::new(LevelMetas::new()),
flushed_sequence: 0,
manifest_version: 0,
}
}
@@ -95,15 +163,47 @@ impl Version {
}
#[inline]
pub fn mutable_memtables(&self) -> &MemtableSet {
self.memtables.mutable_memtables()
}
pub fn memtables(&self) -> &MemtableVersionRef {
&self.memtables
}
/// Returns duration used to partition the memtables and ssts by time.
pub fn bucket_duration(&self) -> Duration {
DEFAULT_BUCKET_DURATION
}
#[inline]
pub fn memtable_schema(&self) -> MemtableSchema {
MemtableSchema::new(self.metadata.columns_row_key.clone())
}
pub fn apply_edit(&mut self, edit: VersionEdit) {
let flushed_sequence = edit.flushed_sequence.unwrap_or(self.flushed_sequence);
if self.flushed_sequence < flushed_sequence {
self.flushed_sequence = flushed_sequence;
}
if self.manifest_version < edit.manifest_version {
self.manifest_version = edit.manifest_version;
}
let handles_to_add = edit.files_to_add.into_iter().map(FileHandle::new);
let merged_ssts = self.ssts.merge(handles_to_add);
self.ssts = Arc::new(merged_ssts);
}
#[inline]
pub fn manifest_version(&self) -> ManifestVersion {
self.manifest_version
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::test_util::descriptor_util::RegionDescBuilder;
fn new_version_control() -> VersionControl {
@@ -112,11 +212,7 @@ mod tests {
.build();
let metadata: RegionMetadata = desc.try_into().unwrap();
VersionControl::new(metadata, MemtableVersion::new())
}
#[test]

src/storage/src/wal.rs (new file)

@@ -0,0 +1,225 @@
use std::sync::Arc;
use common_error::prelude::BoxedError;
use prost::Message;
use snafu::ResultExt;
use store_api::{
logstore::{entry::Entry, namespace::Namespace, AppendResponse, LogStore},
storage::SequenceNumber,
};
use crate::{
codec::{Decoder, Encoder},
error::{self, Error, Result},
proto::{self, PayloadType, WalHeader},
write_batch::{codec::WriteBatchArrowEncoder, WriteBatch},
};
pub struct Wal<S: LogStore> {
region_id: u32,
namespace: S::Namespace,
store: Arc<S>,
}
// wal should be cheap to clone
impl<S: LogStore> Clone for Wal<S> {
fn clone(&self) -> Self {
Self {
region_id: self.region_id,
namespace: self.namespace.clone(),
store: self.store.clone(),
}
}
}
impl<S: LogStore> Wal<S> {
pub fn new(region_id: u32, region_name: impl Into<String>, store: Arc<S>) -> Self {
let region_name = region_name.into();
let namespace = S::Namespace::new(&region_name, region_id as u64);
Self {
region_id,
namespace,
store,
}
}
#[inline]
pub fn region_id(&self) -> u32 {
self.region_id
}
#[inline]
pub fn name(&self) -> &str {
self.namespace.name()
}
}
impl<S: LogStore> Wal<S> {
/// Data format:
///
/// ```text
/// | |
/// |--------------------------> Header Len <-----------------------------| Arrow/Protobuf/... encoded
/// | |
/// v v
/// +---------------------+----------------------------------------------------+--------------+-------------+--------------+
/// | | Header | | | |
/// | Header Len(varint) | (last_manifest_version + mutation_extras + ...) | Data Chunk0 | Data Chunk1 | ... |
/// | | | | | |
/// +---------------------+----------------------------------------------------+--------------+-------------+--------------+
/// ```
///
pub async fn write_to_wal<'a>(
&self,
seq: SequenceNumber,
mut header: WalHeader,
payload: Payload<'a>,
) -> Result<(u64, usize)> {
header.payload_type = payload.payload_type();
if let Payload::WriteBatchArrow(batch) = payload {
header.mutation_extras = proto::gen_mutation_extras(batch);
}
let mut buf = vec![];
// header
let wal_header_encoder = WalHeaderEncoder {};
wal_header_encoder.encode(&header, &mut buf)?;
if let Payload::WriteBatchArrow(batch) = payload {
// entry
let encoder = WriteBatchArrowEncoder::new(header.mutation_extras);
// TODO(jiachun): provide some way to compute data size before encode, so we can preallocate an exactly sized buf.
encoder
.encode(batch, &mut buf)
.map_err(BoxedError::new)
.context(error::WriteWalSnafu {
region_id: self.region_id(),
name: self.name(),
})?;
}
// TODO(jiachun): encode protobuf payload
// write bytes to wal
self.write(seq, &buf).await
}
async fn write(&self, seq: SequenceNumber, bytes: &[u8]) -> Result<(u64, usize)> {
let ns = self.namespace.clone();
let mut e = S::Entry::new(bytes);
e.set_id(seq);
let res = self
.store
.append(ns, e)
.await
.map_err(BoxedError::new)
.context(error::WriteWalSnafu {
region_id: self.region_id(),
name: self.name(),
})?;
Ok((res.entry_id(), res.offset()))
}
}
pub enum Payload<'a> {
None, // only header
WriteBatchArrow(&'a WriteBatch),
WriteBatchProto(&'a WriteBatch),
}
impl<'a> Payload<'a> {
pub fn payload_type(&self) -> i32 {
match self {
Payload::None => PayloadType::None.into(),
Payload::WriteBatchArrow(_) => PayloadType::WriteBatchArrow.into(),
Payload::WriteBatchProto(_) => PayloadType::WriteBatchProto.into(),
}
}
}
pub struct WalHeaderEncoder {}
impl Encoder for WalHeaderEncoder {
type Item = WalHeader;
type Error = Error;
fn encode(&self, item: &WalHeader, dst: &mut Vec<u8>) -> Result<()> {
item.encode_length_delimited(dst)
.map_err(|err| err.into())
.context(error::EncodeWalHeaderSnafu)
}
}
pub struct WalHeaderDecoder {}
impl Decoder for WalHeaderDecoder {
type Item = (usize, WalHeader);
type Error = Error;
fn decode(&self, src: &[u8]) -> Result<Option<(usize, WalHeader)>> {
let mut data_pos = prost::decode_length_delimiter(src)
.map_err(|err| err.into())
.context(error::DecodeWalHeaderSnafu)?;
data_pos += prost::length_delimiter_len(data_pos);
let wal_header = WalHeader::decode_length_delimited(src)
.map_err(|err| err.into())
.context(error::DecodeWalHeaderSnafu)?;
Ok(Some((data_pos, wal_header)))
}
}
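A hedged sketch (not part of the patch) showing how a reader of the entry layout documented above could split the header from the payload using WalHeaderDecoder.
fn example_split_entry(entry_bytes: &[u8]) -> Result<(WalHeader, Vec<u8>)> {
    let decoder = WalHeaderDecoder {};
    // data_pos covers the varint length prefix plus the encoded header, so the
    // remaining bytes are the Arrow/Protobuf payload chunks.
    let (data_pos, header) = decoder
        .decode(entry_bytes)?
        .expect("decoder currently always returns Some");
    Ok((header, entry_bytes[data_pos..].to_vec()))
}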
#[cfg(test)]
mod tests {
use log_store::test_util;
use super::*;
#[tokio::test]
pub async fn test_write_wal() {
let (log_store, _tmp) =
test_util::log_store_util::create_tmp_local_file_log_store("wal_test").await;
let wal = Wal::new(0, "test_region", Arc::new(log_store));
let res = wal.write(0, b"test1").await.unwrap();
assert_eq!(0, res.0);
assert_eq!(0, res.1);
let res = wal.write(1, b"test2").await.unwrap();
assert_eq!(1, res.0);
assert_eq!(29, res.1);
}
#[test]
pub fn test_wal_header_codec() {
let wal_header = WalHeader {
payload_type: 1,
last_manifest_version: 99999999,
mutation_extras: vec![],
};
let mut buf: Vec<u8> = vec![];
let wal_encoder = WalHeaderEncoder {};
wal_encoder.encode(&wal_header, &mut buf).unwrap();
buf.push(1u8); // data
buf.push(2u8); // data
buf.push(3u8); // data
let decoder = WalHeaderDecoder {};
let res = decoder.decode(&buf).unwrap();
assert!(res.is_some());
let data_pos = res.unwrap().0;
assert_eq!(buf.len() - 3, data_pos);
}
}
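
For readers tracing the entry layout in the doc comment above: the header is written with prost's length-delimited encoding, so reading it back mirrors `WalHeaderDecoder`. A minimal sketch of that framing, assuming the prost-generated `WalHeader` message used in this file (illustration only, not a public API):

use prost::Message;

/// Splits a raw WAL entry into the decoded header and the payload bytes
/// that follow the length-delimited header.
fn split_entry(src: &[u8]) -> Result<(WalHeader, &[u8]), prost::DecodeError> {
    // `Header Len` is the varint length delimiter written by `encode_length_delimited`.
    let header_len = prost::decode_length_delimiter(src)?;
    let data_pos = prost::length_delimiter_len(header_len) + header_len;
    let header = WalHeader::decode_length_delimited(src)?;
    Ok((header, &src[data_pos..]))
}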

View File

@@ -1,11 +1,19 @@
-use std::any::Any;
-use std::collections::HashMap;
-use std::slice;
+use std::{
+    any::Any,
+    collections::{BTreeSet, HashMap},
+    slice,
+    time::Duration,
+};
use common_error::prelude::*;
-use datatypes::data_type::ConcreteDataType;
-use datatypes::schema::SchemaRef;
-use datatypes::vectors::VectorRef;
+use common_time::{RangeMillis, TimestampMillis};
+use datatypes::{
+    arrow::error::ArrowError,
+    data_type::ConcreteDataType,
+    prelude::ScalarVector,
+    schema::SchemaRef,
+    vectors::{Int64Vector, VectorRef},
+};
use snafu::ensure;
use store_api::storage::{consts, PutOperation, WriteRequest};
@@ -58,6 +66,42 @@ pub enum Error {
        num_rows: usize,
        backtrace: Backtrace,
    },
#[snafu(display("Cannot align timestamp: {}", ts))]
TimestampOverflow { ts: i64 },
#[snafu(display("Failed to encode, source: {}", source))]
EncodeArrow {
backtrace: Backtrace,
source: ArrowError,
},
#[snafu(display("Failed to decode, source: {}", source))]
DecodeArrow {
backtrace: Backtrace,
source: ArrowError,
},
#[snafu(display("Failed to parse schema, source: {}", source))]
ParseSchema {
backtrace: Backtrace,
source: datatypes::error::Error,
},
#[snafu(display("Failed to decode, in stream waiting state"))]
StreamWaiting,
#[snafu(display("Failed to decode, data corruption {}", message))]
DataCorruption {
message: String,
backtrace: Backtrace,
},
#[snafu(display("Failed to decode vector, source {}", source))]
DecodeVector {
backtrace: Backtrace,
source: datatypes::error::Error,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -110,6 +154,57 @@ impl WriteRequest for WriteBatch {
        Ok(())
    }
/// Aligns timestamps in write batch specified by schema to durations.
///
/// A negative timestamp means "before Unix epoch".
/// Valid timestamp range is `[i64::MIN + duration, i64::MAX-(i64::MAX%duration))`.
fn time_ranges(&self, duration: Duration) -> Result<Vec<RangeMillis>> {
let ts_col_name = match self.schema.timestamp_column() {
None => {
// write batch does not have a timestamp column
return Ok(Vec::new());
}
Some(ts_col) => &ts_col.name,
};
let durations_millis = duration.as_millis() as i64;
let mut aligned_timestamps: BTreeSet<i64> = BTreeSet::new();
for m in &self.mutations {
match m {
Mutation::Put(put_data) => {
let column = put_data
.column_by_name(ts_col_name)
.unwrap_or_else(|| panic!("Cannot find column by name: {}", ts_col_name));
let ts_vector = column.as_any().downcast_ref::<Int64Vector>().unwrap(); // not expected to fail
for ts in ts_vector.iter_data().flatten() {
let aligned = align_timestamp(ts, durations_millis)
.context(TimestampOverflowSnafu { ts })?;
aligned_timestamps.insert(aligned);
}
}
}
}
let ranges = aligned_timestamps
.iter()
.map(|t| RangeMillis::new(*t, *t + durations_millis).unwrap())
.collect::<Vec<_>>();
Ok(ranges)
}
}
/// Aligns timestamp to nearest time interval.
/// Negative ts means a timestamp before Unix epoch.
/// If arithmetic overflows, this function returns None.
/// So timestamp within `[i64::MIN, i64::MIN + duration)` or
/// `[i64::MAX-(i64::MAX%duration), i64::MAX]` is not a valid input.
fn align_timestamp(ts: i64, duration: i64) -> Option<i64> {
let aligned = TimestampMillis::new(ts).align_by_bucket(duration)?.as_i64();
// Also ensure end timestamp won't overflow.
aligned.checked_add(duration)?;
Some(aligned)
}
// WriteBatch pub methods.
@@ -169,6 +264,11 @@ impl PutData {
        self.columns.get(name)
    }
/// Returns number of columns in data.
pub fn num_columns(&self) -> usize {
self.columns.len()
}
    /// Returns number of rows in data.
    pub fn num_rows(&self) -> usize {
        self.columns
@@ -184,6 +284,22 @@ impl PutData {
    pub fn is_empty(&self) -> bool {
        self.num_rows() == 0
    }
/// Returns slice of [PutData] in range `[start, end)`.
///
/// # Panics
/// Panics if `start > end`.
pub fn slice(&self, start: usize, end: usize) -> PutData {
assert!(start <= end);
let columns = self
.columns
.iter()
.map(|(k, v)| (k.clone(), v.slice(start, end - start)))
.collect();
PutData { columns }
}
}
impl WriteBatch {
@@ -273,15 +389,253 @@ impl PutData {
    }
}
pub mod codec {
use std::{io::Cursor, sync::Arc};
use common_error::prelude::*;
use datatypes::{
arrow::{
chunk::Chunk as ArrowChunk,
io::ipc::{
self,
read::{self, StreamState},
write::{StreamWriter, WriteOptions},
},
},
error::Result as DataTypesResult,
schema::Schema,
vectors::Helper,
};
use snafu::ensure;
use store_api::storage::{PutOperation, WriteRequest};
use super::{
DataCorruptionSnafu, DecodeArrowSnafu, DecodeVectorSnafu, EncodeArrowSnafu,
Error as WriteBatchError, Mutation, ParseSchemaSnafu, Result, WriteBatch,
};
use crate::{
arrow_stream::ArrowStreamReader,
codec::{Decoder, Encoder},
};
use crate::{
proto::{MutationExtra, MutationType},
write_batch::PutData,
};
// TODO(jiachun): The codec logic is too complex, maybe we should use protobuf to
// serialize/deserialize all our data.
// And we can make a comparison with protobuf, including performance, storage cost,
// CPU consumption, etc
pub struct WriteBatchArrowEncoder {
mutation_extras: Vec<MutationExtra>,
}
impl WriteBatchArrowEncoder {
pub fn new(mutation_extras: Vec<MutationExtra>) -> Self {
Self { mutation_extras }
}
}
impl Encoder for WriteBatchArrowEncoder {
type Item = WriteBatch;
type Error = WriteBatchError;
fn encode(&self, item: &WriteBatch, dst: &mut Vec<u8>) -> Result<()> {
let schema = item.schema().arrow_schema();
let column_names = item
.schema()
.column_schemas()
.iter()
.map(|column_schema| column_schema.name.clone())
.collect::<Vec<_>>();
let data = item
.iter()
.zip(self.mutation_extras.iter())
.map(|(mtn, ext)| match mtn {
Mutation::Put(put) => {
let arrays = column_names
.iter()
.filter_map(|column_name| put.column_by_name(column_name))
.map(|vector| vector.to_arrow_array())
.collect::<Vec<_>>();
(arrays, &ext.column_null_mask)
}
});
let opts = WriteOptions { compression: None };
let mut writer = StreamWriter::new(dst, opts);
let ipc_fields = ipc::write::default_ipc_fields(&schema.fields);
writer
.start(schema, Some(ipc_fields.clone()))
.context(EncodeArrowSnafu)?;
for (arrays, column_null_mask) in data {
let chunk = ArrowChunk::try_new(arrays).context(EncodeArrowSnafu)?;
if column_null_mask.is_empty() {
writer.write(&chunk, None).context(EncodeArrowSnafu)?;
} else {
let valid_ipc_fields = ipc_fields
.iter()
.zip(bit_vec::BitVec::from_bytes(column_null_mask))
.filter(|(_, mask)| !*mask)
.map(|(ipc_field, _)| ipc_field.clone())
.collect::<Vec<_>>();
writer
.write(&chunk, Some(&valid_ipc_fields))
.context(EncodeArrowSnafu)?;
}
}
writer.finish().context(EncodeArrowSnafu)?;
Ok(())
}
}
pub struct WriteBatchArrowDecoder {
mutation_extras: Vec<MutationExtra>,
}
impl WriteBatchArrowDecoder {
#[allow(dead_code)]
pub fn new(mutation_extras: Vec<MutationExtra>) -> Self {
Self { mutation_extras }
}
}
impl Decoder for WriteBatchArrowDecoder {
type Item = WriteBatch;
type Error = WriteBatchError;
fn decode(&self, src: &[u8]) -> Result<Option<WriteBatch>> {
let mut reader = Cursor::new(src);
let metadata = read::read_stream_metadata(&mut reader).context(DecodeArrowSnafu)?;
let mut reader = ArrowStreamReader::new(reader, metadata);
let schema = reader.metadata().schema.clone();
let stream_states = self
.mutation_extras
.iter()
.map(|ext| {
reader
.maybe_next(&ext.column_null_mask)
.context(DecodeArrowSnafu)
})
.collect::<Result<Vec<_>>>()?;
// check if exactly finished
ensure!(
reader.check_exactly_finished().context(DecodeArrowSnafu)?,
DataCorruptionSnafu {
message: "Impossible, the num of data chunks is different than expected."
}
);
let mut chunks = Vec::with_capacity(self.mutation_extras.len());
for state_opt in stream_states {
match state_opt {
Some(s) => match s {
StreamState::Some(chunk) => chunks.push(chunk),
StreamState::Waiting => return Err(WriteBatchError::StreamWaiting),
},
None => (),
}
}
// chunks -> mutations
let chunks = chunks
.iter()
.map(|chunk| chunk.arrays())
.map(|arrays| {
arrays
.iter()
.map(Helper::try_into_vector)
.collect::<DataTypesResult<Vec<_>>>()
.context(DecodeVectorSnafu)
})
.collect::<Result<Vec<_>>>()?;
ensure!(
chunks.len() == self.mutation_extras.len(),
DataCorruptionSnafu {
message: &format!(
"expected {} mutations, but got {}",
self.mutation_extras.len(),
chunks.len()
)
}
);
let schema = Schema::try_from(Arc::new(schema)).context(ParseSchemaSnafu)?;
let column_names = schema
.column_schemas()
.iter()
.map(|column| column.name.clone())
.collect::<Vec<_>>();
let mutations = self
.mutation_extras
.iter()
.zip(chunks.iter())
.map(|(ext, mtn)| match ext.mutation_type {
x if x == MutationType::Put as i32 => {
let valid_column_names = if ext.column_null_mask.is_empty() {
column_names.clone()
} else {
bit_vec::BitVec::from_bytes(&ext.column_null_mask)
.iter()
.zip(column_names.iter())
.filter(|(mask, _)| !*mask)
.map(|(_, column_name)| column_name.clone())
.collect::<Vec<_>>()
};
let mut put_data = PutData::with_num_columns(valid_column_names.len());
let res = valid_column_names
.iter()
.zip(mtn)
.map(|(name, vector)| put_data.add_column_by_name(name, vector.clone()))
.collect::<Result<Vec<_>>>();
res.map(|_| Mutation::Put(put_data))
}
x if x == MutationType::Delete as i32 => {
todo!()
}
_ => {
unreachable!()
}
})
.collect::<Result<Vec<_>>>()?;
let mut write_batch = WriteBatch::new(Arc::new(schema));
mutations
.into_iter()
.try_for_each(|mutation| match mutation {
Mutation::Put(put_data) => write_batch.put(put_data),
})?;
Ok(Some(write_batch))
}
}
}
#[cfg(test)]
mod tests {
    use std::iter;
    use std::sync::Arc;
    use datatypes::type_id::LogicalTypeId;
-    use datatypes::vectors::{BooleanVector, Int32Vector, UInt64Vector};
+    use datatypes::vectors::{BooleanVector, Int32Vector, Int64Vector, UInt64Vector};
    use super::*;
    use crate::codec::{Decoder, Encoder};
    use crate::proto;
    use crate::test_util::write_batch_util;
    #[test]
@@ -320,22 +674,28 @@ mod tests {
    }
    fn new_test_batch() -> WriteBatch {
-        write_batch_util::new_write_batch(&[
-            ("k1", LogicalTypeId::UInt64, false),
-            (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
-            ("v1", LogicalTypeId::Boolean, true),
-        ])
+        write_batch_util::new_write_batch(
+            &[
+                ("k1", LogicalTypeId::UInt64, false),
+                (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
+                ("ts", LogicalTypeId::Int64, false),
+                ("v1", LogicalTypeId::Boolean, true),
+            ],
+            Some(2),
+        )
    }
    #[test]
    fn test_write_batch_put() {
        let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
        let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
        let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
        let mut put_data = PutData::new();
        put_data.add_key_column("k1", intv.clone()).unwrap();
        put_data.add_version_column(intv).unwrap();
        put_data.add_value_column("v1", boolv).unwrap();
        put_data.add_key_column("ts", tsv).unwrap();
        let mut batch = new_test_batch();
        assert!(batch.is_empty());
@@ -362,7 +722,8 @@ mod tests {
        let mut put_data = PutData::new();
        put_data.add_key_column("k1", boolv).unwrap();
-        let mut batch = write_batch_util::new_write_batch(&[("k1", LogicalTypeId::Boolean, false)]);
+        let mut batch =
+            write_batch_util::new_write_batch(&[("k1", LogicalTypeId::Boolean, false)], None);
        let err = batch.put(put_data).err().unwrap();
        check_err(err, "Request is too large");
    }
@@ -391,9 +752,11 @@ mod tests {
    #[test]
    fn test_put_type_mismatch() {
        let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
        let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
        let mut put_data = PutData::new();
        put_data.add_key_column("k1", boolv).unwrap();
        put_data.add_key_column("ts", tsv).unwrap();
        let mut batch = new_test_batch();
        let err = batch.put(put_data).err().unwrap();
@@ -403,9 +766,11 @@ mod tests {
    #[test]
    fn test_put_type_has_null() {
        let intv = Arc::new(UInt64Vector::from_iter(&[Some(1), None, Some(3)]));
        let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
        let mut put_data = PutData::new();
        put_data.add_key_column("k1", intv).unwrap();
        put_data.add_key_column("ts", tsv).unwrap();
        let mut batch = new_test_batch();
        let err = batch.put(put_data).err().unwrap();
@@ -415,10 +780,11 @@ mod tests {
    #[test]
    fn test_put_missing_column() {
        let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
        let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
        let mut put_data = PutData::new();
        put_data.add_key_column("v1", boolv).unwrap();
        put_data.add_key_column("ts", tsv).unwrap();
        let mut batch = new_test_batch();
        let err = batch.put(put_data).err().unwrap();
        check_err(err, "Missing column k1");
@@ -427,16 +793,125 @@ mod tests {
    #[test]
    fn test_put_unknown_column() {
        let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
        let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
        let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
        let mut put_data = PutData::new();
        put_data.add_key_column("k1", intv.clone()).unwrap();
        put_data.add_version_column(intv).unwrap();
        put_data.add_value_column("v1", boolv.clone()).unwrap();
        put_data.add_key_column("ts", tsv).unwrap();
        put_data.add_value_column("v2", boolv).unwrap();
        let mut batch = new_test_batch();
        let err = batch.put(put_data).err().unwrap();
        check_err(err, "Unknown column v2");
    }
#[test]
pub fn test_align_timestamp() {
let duration_millis = 20;
let ts = [-21, -20, -19, -1, 0, 5, 15, 19, 20, 21];
let res = ts.map(|t| align_timestamp(t, duration_millis));
assert_eq!(res, [-40, -20, -20, -20, 0, 0, 0, 0, 20, 20].map(Some));
}
#[test]
pub fn test_align_timestamp_overflow() {
assert_eq!(Some(i64::MIN), align_timestamp(i64::MIN, 1));
assert_eq!(None, align_timestamp(i64::MIN, 2));
assert_eq!(
Some(((i64::MIN + 20) / 20 - 1) * 20),
align_timestamp(i64::MIN + 20, 20)
);
assert_eq!(None, align_timestamp(i64::MAX - (i64::MAX % 23), 23));
assert_eq!(
Some(9223372036854775780),
align_timestamp(i64::MAX / 20 * 20 - 1, 20)
);
}
#[test]
pub fn test_write_batch_time_range() {
let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4, 5, 6]));
let tsv = Arc::new(Int64Vector::from_vec(vec![-21, -20, -1, 0, 1, 20]));
let boolv = Arc::new(BooleanVector::from(vec![
true, false, true, false, false, false,
]));
let mut put_data = PutData::new();
put_data.add_key_column("k1", intv.clone()).unwrap();
put_data.add_version_column(intv).unwrap();
put_data.add_value_column("v1", boolv).unwrap();
put_data.add_key_column("ts", tsv).unwrap();
let mut batch = new_test_batch();
batch.put(put_data).unwrap();
let duration_millis = 20i64;
let ranges = batch
.time_ranges(Duration::from_millis(duration_millis as u64))
.unwrap();
assert_eq!(
[-40, -20, 0, 20].map(|v| RangeMillis::new(v, v + duration_millis).unwrap()),
ranges.as_slice()
)
}
#[test]
fn test_codec() -> Result<()> {
let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
let boolv = Arc::new(BooleanVector::from(vec![Some(true), Some(false), None]));
let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
let mut put_data = PutData::new();
put_data.add_key_column("k1", intv.clone()).unwrap();
put_data.add_version_column(intv).unwrap();
put_data.add_value_column("v1", boolv).unwrap();
put_data.add_key_column("ts", tsv).unwrap();
let mut batch = new_test_batch();
assert!(batch.is_empty());
batch.put(put_data).unwrap();
assert!(!batch.is_empty());
let encoder = codec::WriteBatchArrowEncoder::new(proto::gen_mutation_extras(&batch));
let mut dst = vec![];
let result = encoder.encode(&batch, &mut dst);
assert!(result.is_ok());
let decoder = codec::WriteBatchArrowDecoder::new(proto::gen_mutation_extras(&batch));
let result = decoder.decode(&dst);
let batch2 = result?.unwrap();
assert_eq!(batch.num_rows, batch2.num_rows);
Ok(())
}
#[test]
fn test_codec_with_none_column() -> Result<()> {
let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));
let mut put_data = PutData::new();
put_data.add_key_column("k1", intv.clone()).unwrap();
put_data.add_version_column(intv).unwrap();
put_data.add_key_column("ts", tsv).unwrap();
let mut batch = new_test_batch();
assert!(batch.is_empty());
batch.put(put_data).unwrap();
assert!(!batch.is_empty());
let encoder = codec::WriteBatchArrowEncoder::new(proto::gen_mutation_extras(&batch));
let mut dst = vec![];
let result = encoder.encode(&batch, &mut dst);
assert!(result.is_ok());
let decoder = codec::WriteBatchArrowDecoder::new(proto::gen_mutation_extras(&batch));
let result = decoder.decode(&dst);
let batch2 = result?.unwrap();
assert_eq!(batch.num_rows, batch2.num_rows);
Ok(())
}
}
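
The trickiest part of the codec module above is the `column_null_mask` convention: a set bit marks a column that is absent from that mutation, so both the encoder and the decoder keep only the fields whose bit is unset. A standalone sketch of that filtering, assuming the `bit-vec` crate already used above (an illustration, not part of this PR's API):

/// Keeps only the column names whose bit in `null_mask` is unset.
/// An empty mask means every column is present.
fn present_columns(all_columns: &[String], null_mask: &[u8]) -> Vec<String> {
    if null_mask.is_empty() {
        return all_columns.to_vec();
    }
    bit_vec::BitVec::from_bytes(null_mask)
        .iter()
        .zip(all_columns.iter())
        .filter(|(is_absent, _)| !*is_absent)
        .map(|(_, name)| name.clone())
        .collect()
}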

View File

@@ -10,8 +10,11 @@ async-trait = "0.1"
bytes = "1.1" bytes = "1.1"
common-base = { path = "../common/base" } common-base = { path = "../common/base" }
common-error = { path = "../common/error" } common-error = { path = "../common/error" }
common-time = { path = "../common/time" }
datatypes = { path = "../datatypes" } datatypes = { path = "../datatypes" }
futures = "0.3" futures = "0.3"
object-store = { path = "../object-store" }
serde = { version = "1.0", features = ["derive"] }
snafu = { version = "0.7", features = ["backtraces"] } snafu = { version = "0.7", features = ["backtraces"] }
[dev-dependencies] [dev-dependencies]

View File

@@ -1,4 +1,5 @@
//! Storage related APIs
pub mod logstore;
pub mod manifest;
pub mod storage;

View File

@@ -12,8 +12,8 @@ pub mod namespace;
/// `LogStore` serves as a Write-Ahead-Log for storage engine.
#[async_trait::async_trait]
-pub trait LogStore {
-    type Error: ErrorExt + Send + Sync;
+pub trait LogStore: Send + Sync + 'static {
+    type Error: ErrorExt + Send + Sync + 'static;
    type Namespace: Namespace;
    type Entry: Entry;
    type AppendResponse: AppendResponse;
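
The `Send + Sync + 'static` bounds added here let a `LogStore` be shared across async tasks. For orientation, a generic append helper written only with methods that appear elsewhere in this PR (`Entry::new`, `set_id`, `append`, and the `AppendResponse` accessors) could look like the sketch below; the `u64`/`usize` return types are assumed from how `Wal::write` uses them above:

/// Appends raw bytes as a single entry and returns (entry id, offset),
/// mirroring how `Wal::write` drives the trait earlier in this PR.
async fn append_bytes<S: LogStore>(
    store: &S,
    ns: S::Namespace,
    id: u64,
    bytes: &[u8],
) -> Result<(u64, usize), S::Error> {
    let mut entry = S::Entry::new(bytes);
    entry.set_id(id);
    let resp = store.append(ns, entry).await?;
    Ok((resp.entry_id(), resp.offset()))
}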

View File

@@ -1,3 +1,5 @@
pub trait Namespace: Send + Sync + Clone {
    fn new(name: &str, id: u64) -> Self;
    fn name(&self) -> &str;
}

View File

@@ -0,0 +1,45 @@
//! metadata service
mod storage;
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use object_store::ObjectStore;
use serde::{de::DeserializeOwned, Serialize};
pub use storage::*;
pub type ManifestVersion = u64;
pub const MIN_VERSION: u64 = 0;
pub const MAX_VERSION: u64 = u64::MAX;
pub trait Metadata: Clone {}
pub trait MetadataId: Clone + Copy {}
/// The action to apply on metadata
pub trait MetaAction: Serialize + DeserializeOwned {
type MetadataId: MetadataId;
/// Returns the metadata id of the action
fn metadata_id(&self) -> Self::MetadataId;
}
/// Manifest service
#[async_trait]
pub trait Manifest: Send + Sync + Clone + 'static {
type Error: ErrorExt + Send + Sync;
type MetaAction: MetaAction;
type MetadataId: MetadataId;
type Metadata: Metadata;
fn new(id: Self::MetadataId, manifest_dir: &str, object_store: ObjectStore) -> Self;
/// Update metadata by the action
async fn update(&self, action: Self::MetaAction) -> Result<ManifestVersion, Self::Error>;
/// Retrieve the latest metadata
async fn load(&self) -> Result<Option<Self::Metadata>, Self::Error>;
async fn checkpoint(&self) -> Result<ManifestVersion, Self::Error>;
fn metadata_id(&self) -> Self::MetadataId;
}
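
To see how the associated types are meant to compose, here is a hypothetical helper written only against the trait above (the helper itself is illustrative and not part of this PR):

/// Applies a metadata action, then reads back the latest metadata.
async fn apply_and_reload<M: Manifest>(
    manifest: &M,
    action: M::MetaAction,
) -> Result<(ManifestVersion, Option<M::Metadata>), M::Error> {
    let version = manifest.update(action).await?;
    let metadata = manifest.load().await?;
    Ok((version, metadata))
}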

View File

@@ -0,0 +1,41 @@
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use crate::manifest::ManifestVersion;
#[async_trait]
pub trait LogIterator: Send + Sync {
type Error: ErrorExt + Send + Sync;
async fn next_log(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>, Self::Error>;
}
#[async_trait]
pub trait ManifestLogStorage {
type Error: ErrorExt + Send + Sync;
type Iter: LogIterator<Error = Self::Error>;
/// Scan the logs in [start, end)
async fn scan(
&self,
start: ManifestVersion,
end: ManifestVersion,
) -> Result<Self::Iter, Self::Error>;
/// Save a log
async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<(), Self::Error>;
/// Delete logs in [start, end)
async fn delete(&self, start: ManifestVersion, end: ManifestVersion)
-> Result<(), Self::Error>;
/// Save a checkpoint
async fn save_checkpoint(
&self,
version: ManifestVersion,
bytes: &[u8],
) -> Result<(), Self::Error>;
/// Load the latest checkpoint
async fn load_checkpoint(&self) -> Result<Option<(ManifestVersion, Vec<u8>)>, Self::Error>;
}
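
A typical consumer of this trait would recover state by loading the latest checkpoint and then replaying newer logs. A hedged sketch of that loop follows; only the trait calls and the `MIN_VERSION`/`MAX_VERSION` constants come from this PR, the recovery flow itself is an assumption:

use crate::manifest::{ManifestVersion, MAX_VERSION, MIN_VERSION};

/// Collects the latest checkpoint (if any) plus every log written after it.
async fn replay<S: ManifestLogStorage>(
    store: &S,
) -> Result<(Option<Vec<u8>>, Vec<(ManifestVersion, Vec<u8>)>), S::Error> {
    let (start, checkpoint) = match store.load_checkpoint().await? {
        Some((version, bytes)) => (version + 1, Some(bytes)),
        None => (MIN_VERSION, None),
    };
    let mut logs = Vec::new();
    let mut iter = store.scan(start, MAX_VERSION).await?;
    while let Some((version, bytes)) = iter.next_log().await? {
        logs.push((version, bytes));
    }
    Ok((checkpoint, logs))
}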

View File

@@ -29,6 +29,12 @@ pub const VERSION_COLUMN_NAME: &str = "__version";
// Names for default column family.
pub const DEFAULT_CF_NAME: &str = "default";
// Name for reserved column: sequence
pub const SEQUENCE_COLUMN_NAME: &str = "__sequence";
// Name for reserved column: value_type
pub const VALUE_TYPE_COLUMN_NAME: &str = "__value_type";
// -----------------------------------------------------------------------------
// ---------- Default options --------------------------------------------------

View File

@@ -1,5 +1,7 @@
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use crate::manifest::MetadataId;
use crate::storage::{consts, ColumnSchema, ConcreteDataType};
/// Id of column, unique in each region. /// Id of column, unique in each region.
@@ -7,6 +9,7 @@ pub type ColumnId = u32;
/// Id of column family, unique in each region.
pub type ColumnFamilyId = u32;
pub type RegionId = u32;
impl MetadataId for RegionId {}
/// Default region name prefix
pub const REGION_PREFIX: &str = "r_";
@@ -17,7 +20,7 @@ pub fn gen_region_name(id: RegionId) -> String {
// TODO(yingwen): Validate default value has same type with column, and name is a valid column name.
/// A [ColumnDescriptor] contains information to create a column.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ColumnDescriptor {
    pub id: ColumnId,
    pub name: String,
@@ -131,7 +134,7 @@ impl RowKeyDescriptorBuilder {
        Self {
            columns: Vec::new(),
            timestamp,
-            enable_version_column: true,
+            enable_version_column: false,
        }
    }
@@ -254,7 +257,7 @@ mod tests {
        let desc = RowKeyDescriptorBuilder::new(timestamp.clone()).build();
        assert!(desc.columns.is_empty());
-        assert!(desc.enable_version_column);
+        assert!(!desc.enable_version_column);
        let desc = RowKeyDescriptorBuilder::new(timestamp.clone())
            .columns_capacity(1)
@@ -266,7 +269,7 @@ mod tests {
            )
            .build();
        assert_eq!(2, desc.columns.len());
-        assert!(desc.enable_version_column);
+        assert!(!desc.enable_version_column);
        let desc = RowKeyDescriptorBuilder::new(timestamp)
            .enable_version_column(false)

View File

@@ -1,4 +1,7 @@
use std::time::Duration;
use common_error::ext::ErrorExt;
use common_time::RangeMillis;
use datatypes::schema::SchemaRef;
use datatypes::vectors::VectorRef;
@@ -12,6 +15,11 @@ pub trait WriteRequest: Send {
    fn new(schema: SchemaRef) -> Self;
    fn put(&mut self, put: Self::PutOp) -> Result<(), Self::Error>;
/// Returns all possible time ranges that contain the timestamp in this batch.
///
/// Each time range is aligned to given `duration`.
fn time_ranges(&self, duration: Duration) -> Result<Vec<RangeMillis>, Self::Error>;
}
/// Put multiple rows.

View File

@@ -14,6 +14,7 @@ common-telemetry = {path = "../common/telemetry" }
datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git" , branch = "arrow2"}
datatypes = { path = "../datatypes" }
futures = "0.3"
log-store = { path = "../log-store" }
snafu = { version = "0.7", features = ["backtraces"] }
storage ={ path = "../storage" }
store-api ={ path = "../store-api" }
@@ -21,4 +22,5 @@ table = { path = "../table" }
[dev-dependencies]
datatypes = { path = "../datatypes" }
tempdir = "0.3"
tokio = { version = "1.18", features = ["full"] }

View File

@@ -194,8 +194,8 @@ mod tests {
    use crate::table::test;
    #[tokio::test]
-    async fn test_creat_table_insert_scan() {
-        let (_engine, table, schema) = test::setup_test_engine_and_table().await;
+    async fn test_create_table_insert_scan() {
+        let (_engine, table, schema, _dir) = test::setup_test_engine_and_table().await;
        assert_eq!(TableType::Base, table.table_type());
        assert_eq!(schema, table.schema());

View File

@@ -3,14 +3,23 @@ use std::sync::Arc;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::SchemaRef;
use datatypes::schema::{ColumnSchema, Schema};
use log_store::fs::noop::NoopLogStore;
use storage::config::EngineConfig;
use storage::EngineImpl;
-use table::engine::{EngineContext, TableEngine};
+use table::engine::EngineContext;
+use table::engine::TableEngine;
use table::requests::CreateTableRequest;
use table::TableRef;
use tempdir::TempDir;
use crate::engine::MitoEngine;
-pub async fn setup_test_engine_and_table() -> (MitoEngine<EngineImpl>, TableRef, SchemaRef) {
+pub async fn setup_test_engine_and_table() -> (
+    MitoEngine<EngineImpl<NoopLogStore>>,
+    TableRef,
+    SchemaRef,
+    TempDir,
+) {
    let column_schemas = vec![
        ColumnSchema::new("host", ConcreteDataType::string_datatype(), false),
        ColumnSchema::new("ts", ConcreteDataType::int64_datatype(), true),
@@ -18,10 +27,22 @@ pub async fn setup_test_engine_and_table() -> (MitoEngine<EngineImpl>, TableRef,
ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true),
]; ];
let table_engine = MitoEngine::<EngineImpl>::new(EngineImpl::new()); let dir = TempDir::new("setup_test_engine_and_table").unwrap();
let store_dir = dir.path().to_string_lossy();
let table_engine = MitoEngine::<EngineImpl<NoopLogStore>>::new(
EngineImpl::new(
EngineConfig::with_store_dir(&store_dir),
Arc::new(NoopLogStore::default()),
)
.await
.unwrap(),
);
let table_name = "demo"; let table_name = "demo";
let schema = Arc::new(Schema::new(column_schemas)); let schema = Arc::new(
Schema::with_timestamp_index(column_schemas, 1).expect("ts must be timestamp column"),
);
let table = table_engine let table = table_engine
.create_table( .create_table(
&EngineContext::default(), &EngineContext::default(),
@@ -34,5 +55,5 @@ pub async fn setup_test_engine_and_table() -> (MitoEngine<EngineImpl>, TableRef,
        .await
        .unwrap();
-    (table_engine, table, schema)
+    (table_engine, table, schema, dir)
}
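
One note on the revised signature: the `TempDir` is returned so the caller keeps the store directory alive for the whole test, since dropping it deletes the directory. That is why the updated engine test binds it to a named placeholder rather than discarding it:

// Hypothetical call site; `_dir` must stay in scope until the test ends,
// because `let _ = ...` would drop the TempDir (and its directory) immediately.
let (_engine, table, schema, _dir) = test::setup_test_engine_and_table().await;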