diff --git a/Cargo.lock b/Cargo.lock index de6ba8d9a9..c53b8c9123 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -54,15 +54,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "aho-corasick" -version = "0.7.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" -dependencies = [ - "memchr", -] - [[package]] name = "aho-corasick" version = "1.1.2" @@ -875,26 +866,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bindgen" -version = "0.64.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" -dependencies = [ - "bitflags 1.3.2", - "cexpr", - "clang-sys", - "lazy_static", - "lazycell", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 1.0.109", -] - [[package]] name = "bindgen" version = "0.68.1" @@ -1232,11 +1203,11 @@ dependencies = [ "lazy_static", "log-store", "meta-client", - "metrics", "moka", "object-store", "parking_lot 0.12.1", "partition", + "prometheus", "regex", "serde", "serde_json", @@ -1510,8 +1481,10 @@ dependencies = [ "derive_builder 0.12.0", "enum_dispatch", "futures-util", + "lazy_static", "moka", "parking_lot 0.12.1", + "prometheus", "prost 0.12.1", "rand", "session", @@ -1579,11 +1552,11 @@ dependencies = [ "lazy_static", "meta-client", "meta-srv", - "metrics", "mito2", "nu-ansi-term", "partition", "plugins", + "prometheus", "prost 0.12.1", "query", "rand", @@ -1847,7 +1820,7 @@ dependencies = [ "humantime-serde", "hyper", "lazy_static", - "metrics", + "prometheus", "prost 0.12.1", "regex", "serde", @@ -1939,9 +1912,10 @@ dependencies = [ "common-error", "common-macro", "common-telemetry", - "metrics", + "lazy_static", "once_cell", "paste", + "prometheus", "snafu", "tokio", "tokio-test", @@ -1955,13 +1929,12 @@ dependencies = [ "backtrace", "common-error", "console-subscriber", - "metrics", - 
"metrics-exporter-prometheus", - "metrics-util", + "lazy_static", "once_cell", "opentelemetry 0.17.0", "opentelemetry-jaeger", "parking_lot 0.12.1", + "prometheus", "rand", "rs-snowflake", "serde", @@ -2701,12 +2674,13 @@ dependencies = [ "futures-util", "humantime-serde", "hyper", + "lazy_static", "log-store", "meta-client", - "metrics", "mito2", "object-store", "pin-project", + "prometheus", "prost 0.12.1", "query", "reqwest", @@ -3061,17 +3035,6 @@ dependencies = [ "serde", ] -[[package]] -name = "errno" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" -dependencies = [ - "errno-dragonfly", - "libc", - "winapi", -] - [[package]] name = "errno" version = "0.3.5" @@ -3082,16 +3045,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "error-chain" version = "0.12.4" @@ -3345,16 +3298,17 @@ dependencies = [ "futures-util", "humantime-serde", "itertools 0.10.5", + "lazy_static", "log-store", "meta-client", "meta-srv", - "metrics", "moka", "object-store", "openmetrics-parser", "opentelemetry-proto", "operator", "partition", + "prometheus", "prost 0.12.1", "query", "raft-engine", @@ -3591,7 +3545,7 @@ dependencies = [ "cfg-if 1.0.0", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] @@ -4031,7 +3985,7 @@ dependencies = [ "console", "instant", "number_prefix", - "portable-atomic 1.5.0", + "portable-atomic", "unicode-width", ] @@ -4370,17 +4324,6 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" -[[package]] -name = "libproc" -version = "0.13.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b18cbf29f8ff3542ba22bdce9ac610fcb75d74bb4e2b306b2a2762242025b4f" -dependencies = [ - "bindgen 0.64.0", - "errno 0.2.8", - "libc", -] - [[package]] name = "libsqlite3-sys" version = "0.25.2" @@ -4579,15 +4522,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "mach" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" -dependencies = [ - "libc", -] - [[package]] name = "mach2" version = "0.4.1" @@ -4771,9 +4705,9 @@ dependencies = [ "h2", "http-body", "lazy_static", - "metrics", "once_cell", "parking_lot 0.12.1", + "prometheus", "prost 0.12.1", "rand", "regex", @@ -4815,79 +4749,6 @@ dependencies = [ "meter-core", ] -[[package]] -name = "metrics" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b9b8653cec6897f73b519a43fba5ee3d50f62fe9af80b428accdcc093b4a849" -dependencies = [ - "ahash 0.7.7", - "metrics-macros", - "portable-atomic 0.3.20", -] - -[[package]] -name = "metrics-exporter-prometheus" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8603921e1f54ef386189335f288441af761e0fc61bcb552168d9cedfe63ebc70" -dependencies = [ - "indexmap 1.9.3", - "metrics", - "metrics-util", - "parking_lot 0.12.1", - "portable-atomic 0.3.20", - "quanta 0.10.1", - "thiserror", -] - -[[package]] -name = "metrics-macros" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "731f8ecebd9f3a4aa847dfe75455e4757a45da40a7793d2f0b1f9b6ed18b23f3" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "metrics-process" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99eab79be9f7c18565e889d6eaed6f1ebdafb2b6a88aef446d2fee5e7796ed10" -dependencies = [ - "libproc", - "mach2", - "metrics", - 
"once_cell", - "procfs", - "rlimit", - "windows 0.48.0", -] - -[[package]] -name = "metrics-util" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d24dc2dbae22bff6f1f9326ffce828c9f07ef9cc1e8002e5279f845432a30a" -dependencies = [ - "aho-corasick 0.7.20", - "crossbeam-epoch", - "crossbeam-utils", - "hashbrown 0.12.3", - "indexmap 1.9.3", - "metrics", - "num_cpus", - "ordered-float 2.10.1", - "parking_lot 0.12.1", - "portable-atomic 0.3.20", - "quanta 0.10.1", - "radix_trie", - "sketches-ddsketch", -] - [[package]] name = "mime" version = "0.3.17" @@ -4927,7 +4788,7 @@ checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.48.0", ] @@ -4968,11 +4829,11 @@ dependencies = [ "lazy_static", "log-store", "memcomparable", - "metrics", "moka", "object-store", "parquet", "paste", + "prometheus", "prost 0.12.1", "regex", "serde", @@ -5002,7 +4863,7 @@ dependencies = [ "futures-util", "once_cell", "parking_lot 0.12.1", - "quanta 0.11.1", + "quanta", "rustc_version", "skeptic", "smallvec", @@ -5088,7 +4949,7 @@ checksum = "57349d5a326b437989b6ee4dc8f2f34b0cc131202748414712a8e7d98952fc8c" dependencies = [ "base64 0.21.5", "bigdecimal", - "bindgen 0.68.1", + "bindgen", "bitflags 2.4.1", "bitvec", "byteorder", @@ -5397,10 +5258,11 @@ dependencies = [ "common-telemetry", "common-test-util", "futures", + "lazy_static", "md5", - "metrics", "moka", "opendal", + "prometheus", "snafu", "tokio", "uuid", @@ -5458,11 +5320,11 @@ dependencies = [ "hyper", "log", "md-5", - "metrics", "once_cell", "parking_lot 0.12.1", "percent-encoding", "pin-project", + "prometheus", "quick-xml 0.29.0", "reqsign", "reqwest", @@ -5627,12 +5489,13 @@ dependencies = [ "file-engine", "futures", "futures-util", + "lazy_static", "meta-client", "meter-core", "meter-macros", - "metrics", "object-store", "partition", + "prometheus", "query", 
"regex", "serde", @@ -5887,8 +5750,10 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datatypes", + "lazy_static", "meta-client", "moka", + "prometheus", "serde", "serde_json", "snafu", @@ -6221,15 +6086,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "portable-atomic" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e30165d31df606f5726b090ec7592c308a0eaf61721ff64c9a3018e344a8753e" -dependencies = [ - "portable-atomic 1.5.0", -] - [[package]] name = "portable-atomic" version = "1.5.0" @@ -6404,9 +6260,9 @@ dependencies = [ [[package]] name = "procfs" -version = "0.15.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943ca7f9f29bab5844ecd8fdb3992c5969b6622bb9609b9502fef9b4310e3f1f" +checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ "bitflags 1.3.2", "byteorder", @@ -6424,8 +6280,10 @@ dependencies = [ "cfg-if 1.0.0", "fnv", "lazy_static", + "libc", "memchr", "parking_lot 0.12.1", + "procfs", "protobuf", "thiserror", ] @@ -6459,7 +6317,8 @@ dependencies = [ "datatypes", "futures", "greptime-proto", - "metrics", + "lazy_static", + "prometheus", "promql-parser", "prost 0.12.1", "query", @@ -6727,22 +6586,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "quanta" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e31331286705f455e56cca62e0e717158474ff02b7936c1fa596d983f4ae27" -dependencies = [ - "crossbeam-utils", - "libc", - "mach", - "once_cell", - "raw-cpuid", - "wasi 0.10.2+wasi-snapshot-preview1", - "web-sys", - "winapi", -] - [[package]] name = "quanta" version = "0.11.1" @@ -6754,7 +6597,7 @@ dependencies = [ "mach2", "once_cell", "raw-cpuid", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "web-sys", "winapi", ] @@ -6798,13 +6641,14 @@ dependencies = [ "futures-util", "greptime-proto", "humantime", - "metrics", + "lazy_static", 
"num", "num-traits", "object-store", "once_cell", "partition", "paste", + "prometheus", "promql", "promql-parser", "rand", @@ -7032,7 +6876,7 @@ version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ - "aho-corasick 1.1.2", + "aho-corasick", "memchr", "regex-automata 0.4.3", "regex-syntax 0.8.2", @@ -7053,7 +6897,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ - "aho-corasick 1.1.2", + "aho-corasick", "memchr", "regex-syntax 0.8.2", ] @@ -7297,15 +7141,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "rlimit" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a29d87a652dc4d43c586328706bb5cdff211f3f39a530f240b53f7221dab8e" -dependencies = [ - "libc", -] - [[package]] name = "ron" version = "0.7.1" @@ -7501,7 +7336,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" dependencies = [ "bitflags 1.3.2", - "errno 0.3.5", + "errno", "io-lifetimes", "libc", "linux-raw-sys 0.1.4", @@ -7515,7 +7350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" dependencies = [ "bitflags 2.4.1", - "errno 0.3.5", + "errno", "libc", "linux-raw-sys 0.4.10", "windows-sys 0.48.0", @@ -7865,7 +7700,7 @@ dependencies = [ "which", "widestring", "winapi", - "windows 0.39.0", + "windows", "winreg 0.10.1", ] @@ -8073,10 +7908,12 @@ dependencies = [ "datatypes", "futures", "futures-util", + "lazy_static", "log-store", "once_cell", "operator", "paste", + "prometheus", "pyo3", "query", "rayon", @@ -8360,8 +8197,7 @@ dependencies = [ "hyper", "influxdb_line_protocol", "itertools 0.10.5", - "metrics", 
- "metrics-process", + "lazy_static", "mime_guess", "mysql_async", "num_cpus", @@ -8374,6 +8210,7 @@ dependencies = [ "pin-project", "postgres-types", "pprof", + "prometheus", "promql-parser", "prost 0.12.1", "query", @@ -8550,12 +8387,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "sketches-ddsketch" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1" - [[package]] name = "slab" version = "0.4.9" @@ -8967,10 +8798,10 @@ dependencies = [ "itertools 0.10.5", "lazy_static", "log-store", - "metrics", "object-store", "parquet", "paste", + "prometheus", "prost 0.12.1", "rand", "regex", @@ -10553,12 +10384,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.2+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -10764,15 +10589,6 @@ dependencies = [ "windows_x86_64_msvc 0.39.0", ] -[[package]] -name = "windows" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" -dependencies = [ - "windows-targets 0.48.5", -] - [[package]] name = "windows-core" version = "0.51.1" diff --git a/Cargo.toml b/Cargo.toml index 8b2c47f7bd..03993c8e16 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,7 +84,6 @@ humantime-serde = "1.1" itertools = "0.10" lazy_static = "1.4" meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev = "abbd357c1e193cd270ea65ee7652334a150b628f" } -metrics = "0.20" moka = "0.12" once_cell = "1.18" opentelemetry-proto = { git = "https://github.com/waynexia/opentelemetry-rust.git", rev = "33841b38dda79b15f2024952be5f32533325ca02", features = [ @@ -94,6 +93,7 @@ opentelemetry-proto = { git = 
"https://github.com/waynexia/opentelemetry-rust.gi ] } parquet = "47.0" paste = "1.0" +prometheus = { version = "0.13.3", features = ["process"] } prost = "0.12" raft-engine = { git = "https://github.com/tikv/raft-engine.git", rev = "22dfb426cd994602b57725ef080287d3e53db479" } rand = "0.8" diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index 7e5f36a77a..96975254a1 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -30,10 +30,10 @@ futures = "0.3" futures-util.workspace = true lazy_static.workspace = true meta-client = { workspace = true } -metrics.workspace = true moka = { workspace = true, features = ["future"] } parking_lot = "0.12" partition.workspace = true +prometheus.workspace = true regex.workspace = true serde.workspace = true serde_json = "1.0" diff --git a/src/catalog/src/kvbackend/client.rs b/src/catalog/src/kvbackend/client.rs index a01650b7e5..5ab34072e9 100644 --- a/src/catalog/src/kvbackend/client.rs +++ b/src/catalog/src/kvbackend/client.rs @@ -28,7 +28,7 @@ use common_meta::rpc::store::{ DeleteRangeResponse, PutRequest, PutResponse, RangeRequest, RangeResponse, }; use common_meta::rpc::KeyValue; -use common_telemetry::{debug, timer}; +use common_telemetry::debug; use meta_client::client::MetaClient; use moka::future::{Cache, CacheBuilder}; use snafu::{OptionExt, ResultExt}; @@ -152,10 +152,10 @@ impl KvBackend for CachedMetaKvBackend { } async fn get(&self, key: &[u8]) -> Result> { - let _timer = timer!(METRIC_CATALOG_KV_GET); + let _timer = METRIC_CATALOG_KV_GET.start_timer(); let init = async { - let _timer = timer!(METRIC_CATALOG_KV_REMOTE_GET); + let _timer = METRIC_CATALOG_KV_REMOTE_GET.start_timer(); self.kv_backend.get(key).await.map(|val| { val.with_context(|| CacheNotGetSnafu { key: String::from_utf8_lossy(key), diff --git a/src/catalog/src/memory/manager.rs b/src/catalog/src/memory/manager.rs index 49f209c6f4..5d08c11626 100644 --- a/src/catalog/src/memory/manager.rs +++ b/src/catalog/src/memory/manager.rs 
@@ -17,8 +17,8 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::{Arc, RwLock, Weak}; +use common_catalog::build_db_string; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME}; -use metrics::{decrement_gauge, increment_gauge}; use snafu::OptionExt; use table::TableRef; @@ -166,7 +166,7 @@ impl MemoryCatalogManager { let arc_self = Arc::new(self.clone()); let catalog = arc_self.create_catalog_entry(name); e.insert(catalog); - increment_gauge!(crate::metrics::METRIC_CATALOG_MANAGER_CATALOG_COUNT, 1.0); + crate::metrics::METRIC_CATALOG_MANAGER_CATALOG_COUNT.inc(); Ok(true) } Entry::Occupied(_) => Ok(false), @@ -187,11 +187,9 @@ impl MemoryCatalogManager { })?; let result = schema.remove(&request.table_name); if result.is_some() { - decrement_gauge!( - crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT, - 1.0, - &[crate::metrics::db_label(&request.catalog, &request.schema)], - ); + crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT + .with_label_values(&[build_db_string(&request.catalog, &request.schema).as_str()]) + .dec(); } Ok(()) } @@ -210,7 +208,7 @@ impl MemoryCatalogManager { match catalog.entry(request.schema) { Entry::Vacant(e) => { e.insert(HashMap::new()); - increment_gauge!(crate::metrics::METRIC_CATALOG_MANAGER_SCHEMA_COUNT, 1.0); + crate::metrics::METRIC_CATALOG_MANAGER_SCHEMA_COUNT.inc(); Ok(true) } Entry::Occupied(_) => Ok(false), @@ -238,11 +236,9 @@ impl MemoryCatalogManager { .fail(); } schema.insert(request.table_name, request.table); - increment_gauge!( - crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT, - 1.0, - &[crate::metrics::db_label(&request.catalog, &request.schema)], - ); + crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT + .with_label_values(&[build_db_string(&request.catalog, &request.schema).as_str()]) + .inc(); Ok(true) } diff --git a/src/catalog/src/metrics.rs b/src/catalog/src/metrics.rs index 6e481c15e2..1b673d6210 100644 --- 
a/src/catalog/src/metrics.rs +++ b/src/catalog/src/metrics.rs @@ -12,18 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_catalog::build_db_string; - pub(crate) const METRIC_DB_LABEL: &str = "db"; -pub(crate) const METRIC_CATALOG_MANAGER_CATALOG_COUNT: &str = "catalog.catalog_count"; -pub(crate) const METRIC_CATALOG_MANAGER_SCHEMA_COUNT: &str = "catalog.schema_count"; -pub(crate) const METRIC_CATALOG_MANAGER_TABLE_COUNT: &str = "catalog.table_count"; +use lazy_static::lazy_static; +use prometheus::*; -pub(crate) const METRIC_CATALOG_KV_REMOTE_GET: &str = "catalog.kv.get.remote"; -pub(crate) const METRIC_CATALOG_KV_GET: &str = "catalog.kv.get"; - -#[inline] -pub(crate) fn db_label(catalog: &str, schema: &str) -> (&'static str, String) { - (METRIC_DB_LABEL, build_db_string(catalog, schema)) +lazy_static! { + pub static ref METRIC_CATALOG_MANAGER_CATALOG_COUNT: IntGauge = + register_int_gauge!("catalog_catalog_count", "catalog catalog count").unwrap(); + pub static ref METRIC_CATALOG_MANAGER_SCHEMA_COUNT: IntGauge = + register_int_gauge!("catalog_schema_count", "catalog schema count").unwrap(); + pub static ref METRIC_CATALOG_MANAGER_TABLE_COUNT: IntGaugeVec = register_int_gauge_vec!( + "catalog_table_count", + "catalog table count", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_CATALOG_KV_REMOTE_GET: Histogram = + register_histogram!("catalog_kv_get_remote", "catalog kv get remote").unwrap(); + pub static ref METRIC_CATALOG_KV_GET: Histogram = + register_histogram!("catalog_kv_get", "catalog kv get").unwrap(); } diff --git a/src/client/Cargo.toml b/src/client/Cargo.toml index 032b34ebd2..4626d2f96e 100644 --- a/src/client/Cargo.toml +++ b/src/client/Cargo.toml @@ -27,8 +27,10 @@ datatypes = { workspace = true } derive_builder.workspace = true enum_dispatch = "0.3" futures-util.workspace = true +lazy_static.workspace = true moka = { workspace = true, features = ["future"] } 
parking_lot = "0.12" +prometheus.workspace = true prost.workspace = true rand.workspace = true session = { workspace = true } diff --git a/src/client/src/database.rs b/src/client/src/database.rs index f3e6738b1d..563164d593 100644 --- a/src/client/src/database.rs +++ b/src/client/src/database.rs @@ -28,7 +28,7 @@ use common_grpc::flight::{FlightDecoder, FlightMessage}; use common_query::Output; use common_recordbatch::error::ExternalSnafu; use common_recordbatch::RecordBatchStreamAdaptor; -use common_telemetry::{logging, timer}; +use common_telemetry::logging; use futures_util::StreamExt; use prost::Message; use snafu::{ensure, ResultExt}; @@ -111,12 +111,12 @@ impl Database { } pub async fn insert(&self, requests: InsertRequests) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_INSERT); + let _timer = metrics::METRIC_GRPC_INSERT.start_timer(); self.handle(Request::Inserts(requests)).await } pub async fn row_insert(&self, requests: RowInsertRequests) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_INSERT); + let _timer = metrics::METRIC_GRPC_INSERT.start_timer(); self.handle(Request::RowInserts(requests)).await } @@ -141,7 +141,7 @@ impl Database { } pub async fn delete(&self, request: DeleteRequests) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_DELETE); + let _timer = metrics::METRIC_GRPC_DELETE.start_timer(); self.handle(Request::Deletes(request)).await } @@ -171,7 +171,7 @@ impl Database { where S: AsRef, { - let _timer = timer!(metrics::METRIC_GRPC_SQL); + let _timer = metrics::METRIC_GRPC_SQL.start_timer(); self.do_get( Request::Query(QueryRequest { query: Some(Query::Sql(sql.as_ref().to_string())), @@ -182,7 +182,7 @@ impl Database { } pub async fn logical_plan(&self, logical_plan: Vec, trace_id: u64) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_LOGICAL_PLAN); + let _timer = metrics::METRIC_GRPC_LOGICAL_PLAN.start_timer(); self.do_get( Request::Query(QueryRequest { query: Some(Query::LogicalPlan(logical_plan)), @@ -199,7 
+199,7 @@ impl Database { end: &str, step: &str, ) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_PROMQL_RANGE_QUERY); + let _timer = metrics::METRIC_GRPC_PROMQL_RANGE_QUERY.start_timer(); self.do_get( Request::Query(QueryRequest { query: Some(Query::PromRangeQuery(PromRangeQuery { @@ -215,7 +215,7 @@ impl Database { } pub async fn create(&self, expr: CreateTableExpr) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_CREATE_TABLE); + let _timer = metrics::METRIC_GRPC_CREATE_TABLE.start_timer(); self.do_get( Request::Ddl(DdlRequest { expr: Some(DdlExpr::CreateTable(expr)), @@ -226,7 +226,7 @@ impl Database { } pub async fn alter(&self, expr: AlterExpr) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_ALTER); + let _timer = metrics::METRIC_GRPC_ALTER.start_timer(); self.do_get( Request::Ddl(DdlRequest { expr: Some(DdlExpr::Alter(expr)), @@ -237,7 +237,7 @@ impl Database { } pub async fn drop_table(&self, expr: DropTableExpr) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_DROP_TABLE); + let _timer = metrics::METRIC_GRPC_DROP_TABLE.start_timer(); self.do_get( Request::Ddl(DdlRequest { expr: Some(DdlExpr::DropTable(expr)), @@ -248,7 +248,7 @@ impl Database { } pub async fn truncate_table(&self, expr: TruncateTableExpr) -> Result { - let _timer = timer!(metrics::METRIC_GRPC_TRUNCATE_TABLE); + let _timer = metrics::METRIC_GRPC_TRUNCATE_TABLE.start_timer(); self.do_get( Request::Ddl(DdlRequest { expr: Some(DdlExpr::TruncateTable(expr)), @@ -260,7 +260,7 @@ impl Database { async fn do_get(&self, request: Request, trace_id: u64) -> Result { // FIXME(paomian): should be added some labels for metrics - let _timer = timer!(metrics::METRIC_GRPC_DO_GET); + let _timer = metrics::METRIC_GRPC_DO_GET.start_timer(); let request = self.to_rpc_request(request, trace_id); let request = Ticket { ticket: request.encode_to_vec().into(), diff --git a/src/client/src/metrics.rs b/src/client/src/metrics.rs index bceb11233e..efd3f7b441 100644 --- 
a/src/client/src/metrics.rs +++ b/src/client/src/metrics.rs @@ -12,15 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! client metrics -pub const METRIC_GRPC_CREATE_TABLE: &str = "grpc.create_table"; -pub const METRIC_GRPC_PROMQL_RANGE_QUERY: &str = "grpc.promql.range_query"; -pub const METRIC_GRPC_INSERT: &str = "grpc.insert"; -pub const METRIC_GRPC_DELETE: &str = "grpc.delete"; -pub const METRIC_GRPC_SQL: &str = "grpc.sql"; -pub const METRIC_GRPC_LOGICAL_PLAN: &str = "grpc.logical_plan"; -pub const METRIC_GRPC_ALTER: &str = "grpc.alter"; -pub const METRIC_GRPC_DROP_TABLE: &str = "grpc.drop_table"; -pub const METRIC_GRPC_TRUNCATE_TABLE: &str = "grpc.truncate_table"; -pub const METRIC_GRPC_DO_GET: &str = "grpc.do_get"; -pub(crate) const METRIC_REGION_REQUEST_GRPC: &str = "grpc.region_request"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + pub static ref METRIC_GRPC_CREATE_TABLE: Histogram = + register_histogram!("grpc_create_table", "grpc create table").unwrap(); + pub static ref METRIC_GRPC_PROMQL_RANGE_QUERY: Histogram = + register_histogram!("grpc_promql_range_query", "grpc promql range query").unwrap(); + pub static ref METRIC_GRPC_INSERT: Histogram = + register_histogram!("grpc_insert", "grpc insert").unwrap(); + pub static ref METRIC_GRPC_DELETE: Histogram = + register_histogram!("grpc_delete", "grpc delete").unwrap(); + pub static ref METRIC_GRPC_SQL: Histogram = + register_histogram!("grpc_sql", "grpc sql").unwrap(); + pub static ref METRIC_GRPC_LOGICAL_PLAN: Histogram = + register_histogram!("grpc_logical_plan", "grpc logical plan").unwrap(); + pub static ref METRIC_GRPC_ALTER: Histogram = + register_histogram!("grpc_alter", "grpc alter").unwrap(); + pub static ref METRIC_GRPC_DROP_TABLE: Histogram = + register_histogram!("grpc_drop_table", "grpc drop table").unwrap(); + pub static ref METRIC_GRPC_TRUNCATE_TABLE: Histogram = + 
register_histogram!("grpc_truncate_table", "grpc truncate table").unwrap(); + pub static ref METRIC_GRPC_DO_GET: Histogram = + register_histogram!("grpc_do_get", "grpc do get").unwrap(); + pub static ref METRIC_REGION_REQUEST_GRPC: HistogramVec = register_histogram_vec!( + "grpc_region_request", + "grpc region request", + &["request_type"] + ) + .unwrap(); +} diff --git a/src/client/src/region.rs b/src/client/src/region.rs index ae8435e864..8a5895d35f 100644 --- a/src/client/src/region.rs +++ b/src/client/src/region.rs @@ -24,7 +24,7 @@ use common_meta::datanode_manager::{AffectedRows, Datanode}; use common_meta::error::{self as meta_error, Result as MetaResult}; use common_recordbatch::error::ExternalSnafu; use common_recordbatch::{RecordBatchStreamAdaptor, SendableRecordBatchStream}; -use common_telemetry::{error, timer}; +use common_telemetry::error; use prost::Message; use snafu::{location, Location, OptionExt, ResultExt}; use tokio_stream::StreamExt; @@ -152,11 +152,9 @@ impl RegionRequester { .with_context(|| MissingFieldSnafu { field: "body" })? 
.as_ref() .to_string(); - - let _timer = timer!( - metrics::METRIC_REGION_REQUEST_GRPC, - &[("request_type", request_type)] - ); + let _timer = metrics::METRIC_REGION_REQUEST_GRPC + .with_label_values(&[request_type.as_str()]) + .start_timer(); let mut client = self.client.raw_region_client()?; diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 9c2de2e908..c112f8a209 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -10,9 +10,7 @@ name = "greptime" path = "src/bin/greptime.rs" [features] -default = ["metrics-process"] tokio-console = ["common-telemetry/tokio-console"] -metrics-process = ["servers/metrics-process"] [dependencies] anymap = "1.0.0-beta.2" @@ -45,11 +43,11 @@ futures.workspace = true lazy_static.workspace = true meta-client = { workspace = true } meta-srv = { workspace = true } -metrics.workspace = true mito2 = { workspace = true } nu-ansi-term = "0.46" partition = { workspace = true } plugins.workspace = true +prometheus.workspace = true prost.workspace = true query = { workspace = true } rand.workspace = true diff --git a/src/cmd/src/bin/greptime.rs b/src/cmd/src/bin/greptime.rs index 819e79b2c8..cf520690cf 100644 --- a/src/cmd/src/bin/greptime.rs +++ b/src/cmd/src/bin/greptime.rs @@ -21,7 +21,11 @@ use cmd::error::Result; use cmd::options::{Options, TopLevelOptions}; use cmd::{cli, datanode, frontend, metasrv, standalone}; use common_telemetry::logging::{error, info, TracingOptions}; -use metrics::gauge; + +lazy_static::lazy_static! { + static ref APP_VERSION: prometheus::IntGaugeVec = + prometheus::register_int_gauge_vec!("app_version", "app version", &["short_version", "version"]).unwrap(); +} #[derive(Parser)] #[clap(name = "greptimedb", version = print_version())] @@ -204,11 +208,12 @@ async fn main() -> Result<()> { }; common_telemetry::set_panic_hook(); - common_telemetry::init_default_metrics_recorder(); let _guard = common_telemetry::init_global_logging(app_name, logging_opts, tracing_opts); // Report app version as gauge. 
- gauge!("app_version", 1.0, "short_version" => short_version(), "version" => full_version()); + APP_VERSION + .with_label_values(&[short_version(), full_version()]) + .inc(); // Log version and argument flags. info!( diff --git a/src/common/error/src/status_code.rs b/src/common/error/src/status_code.rs index 9ca785be14..228b278aa7 100644 --- a/src/common/error/src/status_code.rs +++ b/src/common/error/src/status_code.rs @@ -14,10 +14,10 @@ use std::fmt; -use strum::EnumString; +use strum::{AsRefStr, EnumString}; /// Common status code for public API. -#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumString)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumString, AsRefStr)] pub enum StatusCode { // ====== Begin of common status code ============== /// Success. diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index c8fdd92180..2f30773c28 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -28,7 +28,7 @@ etcd-client.workspace = true futures.workspace = true humantime-serde.workspace = true lazy_static.workspace = true -metrics.workspace = true +prometheus.workspace = true prost.workspace = true regex.workspace = true serde.workspace = true diff --git a/src/common/meta/src/ddl/alter_table.rs b/src/common/meta/src/ddl/alter_table.rs index 0288555a34..ae215c83b6 100644 --- a/src/common/meta/src/ddl/alter_table.rs +++ b/src/common/meta/src/ddl/alter_table.rs @@ -374,10 +374,11 @@ impl Procedure for AlterTableProcedure { let state = &self.data.state; - let _timer = common_telemetry::timer!( - metrics::METRIC_META_PROCEDURE_ALTER_TABLE, - &[("step", state.as_ref().to_string())] - ); + let step = state.as_ref(); + + let _timer = metrics::METRIC_META_PROCEDURE_ALTER_TABLE + .with_label_values(&[step]) + .start_timer(); match state { AlterTableState::Prepare => self.on_prepare().await, diff --git a/src/common/meta/src/ddl/create_table.rs b/src/common/meta/src/ddl/create_table.rs index 975c2aa471..2d0190e957 100644 --- 
a/src/common/meta/src/ddl/create_table.rs +++ b/src/common/meta/src/ddl/create_table.rs @@ -251,10 +251,9 @@ impl Procedure for CreateTableProcedure { async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { let state = &self.creator.data.state; - let _timer = common_telemetry::timer!( - metrics::METRIC_META_PROCEDURE_CREATE_TABLE, - &[("step", state.as_ref().to_string())] - ); + let _timer = metrics::METRIC_META_PROCEDURE_CREATE_TABLE + .with_label_values(&[state.as_ref()]) + .start_timer(); match state { CreateTableState::Prepare => self.on_prepare().await, diff --git a/src/common/meta/src/ddl/drop_table.rs b/src/common/meta/src/ddl/drop_table.rs index 5a06270174..98f9849bde 100644 --- a/src/common/meta/src/ddl/drop_table.rs +++ b/src/common/meta/src/ddl/drop_table.rs @@ -197,10 +197,9 @@ impl Procedure for DropTableProcedure { async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { let state = &self.data.state; - let _timer = common_telemetry::timer!( - metrics::METRIC_META_PROCEDURE_DROP_TABLE, - &[("step", state.as_ref().to_string())] - ); + let _timer = metrics::METRIC_META_PROCEDURE_DROP_TABLE + .with_label_values(&[state.as_ref()]) + .start_timer(); match self.data.state { DropTableState::Prepare => self.on_prepare().await, diff --git a/src/common/meta/src/ddl/truncate_table.rs b/src/common/meta/src/ddl/truncate_table.rs index ed71c4e9fa..0cff39f362 100644 --- a/src/common/meta/src/ddl/truncate_table.rs +++ b/src/common/meta/src/ddl/truncate_table.rs @@ -55,10 +55,9 @@ impl Procedure for TruncateTableProcedure { async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { let state = &self.data.state; - let _timer = common_telemetry::timer!( - metrics::METRIC_META_PROCEDURE_TRUNCATE_TABLE, - &[("step", state.as_ref().to_string())] - ); + let _timer = metrics::METRIC_META_PROCEDURE_TRUNCATE_TABLE + .with_label_values(&[state.as_ref()]) + .start_timer(); match self.data.state { TruncateTableState::Prepare => 
self.on_prepare().await, diff --git a/src/common/meta/src/key/catalog_name.rs b/src/common/meta/src/key/catalog_name.rs index 1041e0cd81..c635e530df 100644 --- a/src/common/meta/src/key/catalog_name.rs +++ b/src/common/meta/src/key/catalog_name.rs @@ -16,10 +16,8 @@ use std::fmt::Display; use std::sync::Arc; use common_catalog::consts::DEFAULT_CATALOG_NAME; -use common_telemetry::timer; use futures::stream::BoxStream; use futures::StreamExt; -use metrics::increment_counter; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -104,7 +102,7 @@ impl CatalogManager { /// Creates `CatalogNameKey`. pub async fn create(&self, catalog: CatalogNameKey<'_>, if_not_exists: bool) -> Result<()> { - let _timer = timer!(crate::metrics::METRIC_META_CREATE_CATALOG); + let _timer = crate::metrics::METRIC_META_CREATE_CATALOG.start_timer(); let raw_key = catalog.as_raw_key(); let raw_value = CatalogNameValue.try_as_raw_value()?; @@ -113,7 +111,7 @@ impl CatalogManager { .put_conditionally(raw_key, raw_value, if_not_exists) .await? 
{ - increment_counter!(crate::metrics::METRIC_META_CREATE_CATALOG); + crate::metrics::METRIC_META_CREATE_CATALOG_COUNTER.inc(); } Ok(()) diff --git a/src/common/meta/src/key/schema_name.rs b/src/common/meta/src/key/schema_name.rs index 287072e03f..3270543b4e 100644 --- a/src/common/meta/src/key/schema_name.rs +++ b/src/common/meta/src/key/schema_name.rs @@ -18,11 +18,9 @@ use std::sync::Arc; use std::time::Duration; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; -use common_telemetry::timer; use futures::stream::BoxStream; use futures::StreamExt; use humantime_serde::re::humantime; -use metrics::increment_counter; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -146,7 +144,7 @@ impl SchemaManager { value: Option, if_not_exists: bool, ) -> Result<()> { - let _timer = timer!(crate::metrics::METRIC_META_CREATE_SCHEMA); + let _timer = crate::metrics::METRIC_META_CREATE_SCHEMA.start_timer(); let raw_key = schema.as_raw_key(); let raw_value = value.unwrap_or_default().try_as_raw_value()?; @@ -155,7 +153,7 @@ impl SchemaManager { .put_conditionally(raw_key, raw_value, if_not_exists) .await? 
{ - increment_counter!(crate::metrics::METRIC_META_CREATE_SCHEMA); + crate::metrics::METRIC_META_CREATE_SCHEMA_COUNTER.inc(); } Ok(()) diff --git a/src/common/meta/src/kv_backend/memory.rs b/src/common/meta/src/kv_backend/memory.rs index 484d520895..702fcc9d05 100644 --- a/src/common/meta/src/kv_backend/memory.rs +++ b/src/common/meta/src/kv_backend/memory.rs @@ -21,7 +21,6 @@ use std::sync::RwLock; use async_trait::async_trait; use common_error::ext::ErrorExt; -use common_telemetry::timer; use serde::Serializer; use crate::kv_backend::txn::{Txn, TxnOp, TxnOpResponse, TxnRequest, TxnResponse}; @@ -269,10 +268,9 @@ impl TxnService for MemoryKvBackend { type Error = T; async fn txn(&self, txn: Txn) -> Result { - let _timer = timer!( - METRIC_META_TXN_REQUEST, - &[("target", "memory"), ("op", "txn")] - ); + let _timer = METRIC_META_TXN_REQUEST + .with_label_values(&["memory", "txn"]) + .start_timer(); let TxnRequest { compare, diff --git a/src/common/meta/src/metrics.rs b/src/common/meta/src/metrics.rs index 815c8aacb1..0e009608b2 100644 --- a/src/common/meta/src/metrics.rs +++ b/src/common/meta/src/metrics.rs @@ -12,11 +12,42 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub const METRIC_META_TXN_REQUEST: &str = "meta.txn_request"; +use lazy_static::lazy_static; +use prometheus::*; -pub(crate) const METRIC_META_CREATE_CATALOG: &str = "meta.create_catalog"; -pub(crate) const METRIC_META_CREATE_SCHEMA: &str = "meta.create_schema"; -pub(crate) const METRIC_META_PROCEDURE_CREATE_TABLE: &str = "meta.procedure.create_table"; -pub(crate) const METRIC_META_PROCEDURE_DROP_TABLE: &str = "meta.procedure.drop_table"; -pub(crate) const METRIC_META_PROCEDURE_ALTER_TABLE: &str = "meta.procedure.alter_table"; -pub(crate) const METRIC_META_PROCEDURE_TRUNCATE_TABLE: &str = "meta.procedure.truncate_table"; +lazy_static! 
{ + pub static ref METRIC_META_TXN_REQUEST: HistogramVec = + register_histogram_vec!("meta_txn_request", "meta txn request", &["target", "op"]).unwrap(); + pub static ref METRIC_META_CREATE_CATALOG: Histogram = + register_histogram!("meta_create_catalog", "meta create catalog").unwrap(); + pub static ref METRIC_META_CREATE_CATALOG_COUNTER: IntCounter = + register_int_counter!("meta_create_catalog_counter", "meta create catalog").unwrap(); + pub static ref METRIC_META_CREATE_SCHEMA: Histogram = + register_histogram!("meta_create_schema", "meta create schema").unwrap(); + pub static ref METRIC_META_CREATE_SCHEMA_COUNTER: IntCounter = + register_int_counter!("meta_create_schema_counter", "meta create schema").unwrap(); + pub static ref METRIC_META_PROCEDURE_CREATE_TABLE: HistogramVec = register_histogram_vec!( + "meta_procedure_create_table", + "meta procedure create table", + &["step"] + ) + .unwrap(); + pub static ref METRIC_META_PROCEDURE_DROP_TABLE: HistogramVec = register_histogram_vec!( + "meta_procedure_drop_table", + "meta procedure drop table", + &["step"] + ) + .unwrap(); + pub static ref METRIC_META_PROCEDURE_ALTER_TABLE: HistogramVec = register_histogram_vec!( + "meta_procedure_alter_table", + "meta procedure alter table", + &["step"] + ) + .unwrap(); + pub static ref METRIC_META_PROCEDURE_TRUNCATE_TABLE: HistogramVec = register_histogram_vec!( + "meta_procedure_truncate_table", + "meta procedure truncate table", + &["step"] + ) + .unwrap(); +} diff --git a/src/common/runtime/Cargo.toml b/src/common/runtime/Cargo.toml index 553a50504e..e192e63dd5 100644 --- a/src/common/runtime/Cargo.toml +++ b/src/common/runtime/Cargo.toml @@ -9,9 +9,10 @@ async-trait.workspace = true common-error = { workspace = true } common-macro = { workspace = true } common-telemetry = { workspace = true } -metrics.workspace = true +lazy_static.workspace = true once_cell.workspace = true paste.workspace = true +prometheus.workspace = true snafu.workspace = true tokio-util.workspace = 
true tokio.workspace = true diff --git a/src/common/runtime/src/metrics.rs b/src/common/runtime/src/metrics.rs index c36da98162..30c5c474e4 100644 --- a/src/common/runtime/src/metrics.rs +++ b/src/common/runtime/src/metrics.rs @@ -13,6 +13,22 @@ // limitations under the License. //! Runtime metrics -pub const THREAD_NAME_LABEL: &str = "thread.name"; -pub const METRIC_RUNTIME_THREADS_ALIVE: &str = "runtime.threads.alive"; -pub const METRIC_RUNTIME_THREADS_IDLE: &str = "runtime.threads.idle"; +use lazy_static::lazy_static; +use prometheus::*; + +pub const THREAD_NAME_LABEL: &str = "thread_name"; + +lazy_static! { + pub static ref METRIC_RUNTIME_THREADS_ALIVE: IntGaugeVec = register_int_gauge_vec!( + "runtime_threads_alive", + "runtime threads alive", + &[THREAD_NAME_LABEL] + ) + .unwrap(); + pub static ref METRIC_RUNTIME_THREADS_IDLE: IntGaugeVec = register_int_gauge_vec!( + "runtime_threads_idle", + "runtime threads idle", + &[THREAD_NAME_LABEL] + ) + .unwrap(); +} diff --git a/src/common/runtime/src/runtime.rs b/src/common/runtime/src/runtime.rs index fae0412a9c..6a776af254 100644 --- a/src/common/runtime/src/runtime.rs +++ b/src/common/runtime/src/runtime.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use std::thread; use std::time::Duration; -use metrics::{decrement_gauge, increment_gauge}; use snafu::ResultExt; use tokio::runtime::{Builder as RuntimeBuilder, Handle}; use tokio::sync::oneshot; @@ -172,29 +171,33 @@ impl Builder { fn on_thread_start(thread_name: String) -> impl Fn() + 'static { move || { - let labels = [(THREAD_NAME_LABEL, thread_name.clone())]; - increment_gauge!(METRIC_RUNTIME_THREADS_ALIVE, 1.0, &labels); + METRIC_RUNTIME_THREADS_ALIVE + .with_label_values(&[thread_name.as_str()]) + .inc(); } } fn on_thread_stop(thread_name: String) -> impl Fn() + 'static { move || { - let labels = [(THREAD_NAME_LABEL, thread_name.clone())]; - decrement_gauge!(METRIC_RUNTIME_THREADS_ALIVE, 1.0, &labels); + METRIC_RUNTIME_THREADS_ALIVE + 
.with_label_values(&[thread_name.as_str()]) + .dec(); } } fn on_thread_park(thread_name: String) -> impl Fn() + 'static { move || { - let labels = [(THREAD_NAME_LABEL, thread_name.clone())]; - increment_gauge!(METRIC_RUNTIME_THREADS_IDLE, 1.0, &labels); + METRIC_RUNTIME_THREADS_IDLE + .with_label_values(&[thread_name.as_str()]) + .inc(); } } fn on_thread_unpark(thread_name: String) -> impl Fn() + 'static { move || { - let labels = [(THREAD_NAME_LABEL, thread_name.clone())]; - decrement_gauge!(METRIC_RUNTIME_THREADS_IDLE, 1.0, &labels); + METRIC_RUNTIME_THREADS_IDLE + .with_label_values(&[thread_name.as_str()]) + .dec(); } } @@ -204,14 +207,13 @@ mod tests { use std::thread; use std::time::Duration; - use common_telemetry::metric; + use common_telemetry::dump_metrics; use tokio::sync::oneshot; use tokio_test::assert_ok; use super::*; fn runtime() -> Arc { - common_telemetry::init_default_metrics_recorder(); let runtime = Builder::default() .worker_threads(2) .thread_name("test_spawn_join") @@ -221,7 +223,6 @@ mod tests { #[test] fn test_metric() { - common_telemetry::init_default_metrics_recorder(); let runtime = Builder::default() .worker_threads(5) .thread_name("test_runtime_metric") @@ -236,8 +237,7 @@ mod tests { thread::sleep(Duration::from_millis(10)); - let handle = metric::try_handle().unwrap(); - let metric_text = handle.render(); + let metric_text = dump_metrics().unwrap(); assert!(metric_text.contains("runtime_threads_idle{thread_name=\"test_runtime_metric\"}")); assert!(metric_text.contains("runtime_threads_alive{thread_name=\"test_runtime_metric\"}")); diff --git a/src/common/telemetry/Cargo.toml b/src/common/telemetry/Cargo.toml index d36d26a94f..56f1cd3721 100644 --- a/src/common/telemetry/Cargo.toml +++ b/src/common/telemetry/Cargo.toml @@ -12,9 +12,7 @@ deadlock_detection = ["parking_lot/deadlock_detection"] backtrace = "0.3" common-error = { workspace = true } console-subscriber = { version = "0.1", optional = true } -metrics-exporter-prometheus = 
{ version = "0.11", default-features = false } -metrics-util = "0.14" -metrics.workspace = true +lazy_static.workspace = true once_cell.workspace = true opentelemetry = { version = "0.17", default-features = false, features = [ "trace", @@ -22,6 +20,7 @@ opentelemetry = { version = "0.17", default-features = false, features = [ ] } opentelemetry-jaeger = { version = "0.16", features = ["rt-tokio"] } parking_lot = { version = "0.12" } +prometheus.workspace = true rand.workspace = true rs-snowflake = "0.6" serde.workspace = true diff --git a/src/common/telemetry/src/lib.rs b/src/common/telemetry/src/lib.rs index 91d67b92ce..4f7eed9313 100644 --- a/src/common/telemetry/src/lib.rs +++ b/src/common/telemetry/src/lib.rs @@ -21,7 +21,7 @@ use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; pub use logging::{init_default_ut_logging, init_global_logging, trace_id, TRACE_ID}; -pub use metric::init_default_metrics_recorder; +pub use metric::dump_metrics; use once_cell::sync::OnceCell; pub use panic_hook::set_panic_hook; use parking_lot::Mutex; diff --git a/src/common/telemetry/src/metric.rs b/src/common/telemetry/src/metric.rs index 6c7b8f2c8e..239d4244c2 100644 --- a/src/common/telemetry/src/metric.rs +++ b/src/common/telemetry/src/metric.rs @@ -14,174 +14,14 @@ // metric stuffs, inspired by databend -use std::fmt; -use std::sync::{Arc, Once, RwLock}; -use std::time::{Duration, Instant}; +use prometheus::{Encoder, TextEncoder}; -use metrics::{register_histogram, Histogram, IntoLabels}; -use metrics_exporter_prometheus::PrometheusBuilder; -pub use metrics_exporter_prometheus::PrometheusHandle; -use metrics_util::layers::{Layer, PrefixLayer}; -use once_cell::sync::Lazy; - -static PROMETHEUS_HANDLE: Lazy>>> = - Lazy::new(|| Arc::new(RwLock::new(None))); - -pub fn init_default_metrics_recorder() { - static START: Once = Once::new(); - START.call_once(init_prometheus_recorder) -} - -/// Init prometheus recorder. 
-fn init_prometheus_recorder() { - let recorder = PrometheusBuilder::new().build_recorder(); - let mut h = PROMETHEUS_HANDLE.as_ref().write().unwrap(); - *h = Some(recorder.handle()); - // TODO(LFC): separate metrics for testing and metrics for production - // `clear_recorder` is likely not expected to be called in production code, recorder should be - // globally unique and used throughout the whole lifetime of an application. - // It's marked as "unsafe" since [this PR](https://github.com/metrics-rs/metrics/pull/302), and - // "metrics" version also upgraded to 0.19. - // A quick look in the metrics codes suggests that the "unsafe" call is of no harm. However, - // it required a further investigation in how to use metric properly. - unsafe { - metrics::clear_recorder(); - } - let layer = PrefixLayer::new("greptime"); - let layered = layer.layer(recorder); - match metrics::set_boxed_recorder(Box::new(layered)) { - Ok(_) => (), - Err(err) => crate::warn!("Install prometheus recorder failed, cause: {}", err), - }; -} - -pub fn try_handle() -> Option { - PROMETHEUS_HANDLE.as_ref().read().unwrap().clone() -} - -/// A Histogram timer that emits the elapsed time to the histogram on drop. -#[must_use = "Timer should be kept in a variable otherwise it cannot observe duration"] -pub struct Timer { - start: Instant, - histogram: Histogram, - observed: bool, -} - -impl From for Timer { - fn from(histogram: Histogram) -> Timer { - Timer::from_histogram(histogram) - } -} - -impl fmt::Debug for Timer { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Timer") - .field("start", &self.start) - .field("observed", &self.observed) - .finish() - } -} - -impl Timer { - /// Creates a timer from given histogram. - pub fn from_histogram(histogram: Histogram) -> Self { - Self { - start: Instant::now(), - histogram, - observed: false, - } - } - - /// Creates a timer from given `name`. 
- pub fn new(name: &'static str) -> Self { - Self { - start: Instant::now(), - histogram: register_histogram!(name), - observed: false, - } - } - - /// Creates a timer from given `name`. - pub fn new_with_labels(name: &'static str, labels: L) -> Self { - Self { - start: Instant::now(), - histogram: register_histogram!(name, labels), - observed: false, - } - } - - /// Returns the elapsed duration from the time this timer created. - pub fn elapsed(&self) -> Duration { - self.start.elapsed() - } - - /// Discards the timer result. - pub fn discard(mut self) { - self.observed = true; - } -} - -impl Drop for Timer { - fn drop(&mut self) { - if !self.observed { - self.histogram.record(self.elapsed()) - } - } -} - -#[macro_export] -macro_rules! timer { - ($name: expr) => { - $crate::metric::Timer::new($name) - }; - ($name:expr, $labels:expr) => { - $crate::metric::Timer::new_with_labels($name, $labels) - }; -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_elapsed_timer() { - init_default_metrics_recorder(); - { - let _t = timer!("test_elapsed_timer_a"); - } - let handle = try_handle().unwrap(); - let text = handle.render(); - assert!(text.contains("test_elapsed_timer_a")); - assert!(!text.contains("test_elapsed_timer_b")); - - let _ = timer!("test_elapsed_timer_b"); - let text = handle.render(); - assert!(text.contains("test_elapsed_timer_a")); - assert!(text.contains("test_elapsed_timer_b")); - } - - #[test] - fn test_elapsed_timer_with_label() { - init_default_metrics_recorder(); - { - let _t = timer!("test_elapsed_timer_a"); - } - let handle = try_handle().unwrap(); - let text = handle.render(); - assert!(text.contains("test_elapsed_timer_a")); - assert!(!text.contains("test_elapsed_timer_b")); - let label_a = "label_a"; - let label_b = "label_b"; - assert!(!text.contains(label_a)); - assert!(!text.contains(label_b)); - - { - let _t = timer!("test_elapsed_timer_b", &[(label_a, "a"), (label_b, "b")]); - } - let text = handle.render(); - 
assert!(text.contains("test_elapsed_timer_a")); - assert!(text.contains("test_elapsed_timer_b")); - assert!(text.contains(label_a)); - assert!(text.contains(label_b)); - } +pub fn dump_metrics() -> Result { + let mut buffer = Vec::new(); + let encoder = TextEncoder::new(); + let metric_families = prometheus::gather(); + encoder + .encode(&metric_families, &mut buffer) + .map_err(|_| "Encode metrics failed".to_string())?; + String::from_utf8(buffer).map_err(|e| e.to_string()) } diff --git a/src/common/telemetry/src/panic_hook.rs b/src/common/telemetry/src/panic_hook.rs index d7a432c950..e96be38e9f 100644 --- a/src/common/telemetry/src/panic_hook.rs +++ b/src/common/telemetry/src/panic_hook.rs @@ -17,7 +17,13 @@ use std::panic; use std::time::Duration; use backtrace::Backtrace; -use metrics::increment_counter; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + pub static ref PANIC_COUNTER: IntCounter = + register_int_counter!("panic_counter", "panic_counter").unwrap(); +} pub fn set_panic_hook() { // Set a panic hook that records the panic as a `tracing` event at the @@ -41,7 +47,7 @@ pub fn set_panic_hook() { } else { tracing::error!(message = %panic, backtrace = %backtrace); } - increment_counter!("panic_counter"); + PANIC_COUNTER.inc(); default_hook(panic); })); diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index cc505c7d41..39baddefec 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -4,6 +4,9 @@ version.workspace = true edition.workspace = true license.workspace = true +[features] +testing = [] + [dependencies] api = { workspace = true } arrow-flight.workspace = true @@ -41,12 +44,13 @@ futures = "0.3" futures-util.workspace = true humantime-serde.workspace = true hyper = { version = "0.14", features = ["full"] } +lazy_static.workspace = true log-store = { workspace = true } meta-client = { workspace = true } -metrics.workspace = true mito2 = { workspace = true } object-store = { workspace = true } 
pin-project = "1.0" +prometheus.workspace = true prost.workspace = true query = { workspace = true } reqwest = { workspace = true } diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs index 858506c89e..f4a2621ecc 100644 --- a/src/datanode/src/metrics.rs +++ b/src/datanode/src/metrics.rs @@ -12,9 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! datanode metrics +use lazy_static::lazy_static; +use prometheus::*; -/// The elapsed time of handling a request in the region_server. -pub const HANDLE_REGION_REQUEST_ELAPSED: &str = "datanode.handle_region_request_elapsed"; /// Region request type label. -pub const REGION_REQUEST_TYPE: &str = "datanode.region_request_type"; +pub const REGION_REQUEST_TYPE: &str = "datanode_region_request_type"; + +lazy_static! { + /// The elapsed time of handling a request in the region_server. + pub static ref HANDLE_REGION_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!( + "datanode_handle_region_request_elapsed", + "datanode handle region request elapsed", + &[REGION_REQUEST_TYPE] + ) + .unwrap(); +} diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index 521dfe7c4e..bcd9ac20cf 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -28,7 +28,7 @@ use common_query::physical_plan::DfPhysicalPlanAdapter; use common_query::{DfPhysicalPlan, Output}; use common_recordbatch::SendableRecordBatchStream; use common_runtime::Runtime; -use common_telemetry::{info, timer, warn}; +use common_telemetry::{info, warn}; use dashmap::DashMap; use datafusion::catalog::schema::SchemaProvider; use datafusion::catalog::{CatalogList, CatalogProvider}; @@ -249,10 +249,9 @@ impl RegionServerInner { request: RegionRequest, ) -> Result { let request_type = request.request_type(); - let _timer = timer!( - crate::metrics::HANDLE_REGION_REQUEST_ELAPSED, - &[(crate::metrics::REGION_REQUEST_TYPE, 
request_type),] - ); + let _timer = crate::metrics::HANDLE_REGION_REQUEST_ELAPSED + .with_label_values(&[request_type]) + .start_timer(); let region_change = match &request { RegionRequest::Create(create) => RegionChange::Register(create.engine.clone()), diff --git a/src/datanode/src/store.rs b/src/datanode/src/store.rs index 0a70c28e30..78528cf6ce 100644 --- a/src/datanode/src/store.rs +++ b/src/datanode/src/store.rs @@ -26,7 +26,7 @@ use std::{env, path}; use common_base::readable_size::ReadableSize; use common_telemetry::logging::info; -use object_store::layers::{LoggingLayer, LruCacheLayer, MetricsLayer, RetryLayer, TracingLayer}; +use object_store::layers::{LoggingLayer, LruCacheLayer, RetryLayer, TracingLayer}; use object_store::services::Fs as FsBuilder; use object_store::util::normalize_dir; use object_store::{util, HttpClient, ObjectStore, ObjectStoreBuilder}; @@ -58,8 +58,7 @@ pub(crate) async fn new_object_store(opts: &DatanodeOptions) -> Result Result Vec> { - let _timer = timer!(metrics::METRIC_HANDLE_SQL_ELAPSED); + let _timer = metrics::METRIC_HANDLE_SQL_ELAPSED.start_timer(); let query_interceptor_opt = self.plugins.get::>(); let query_interceptor = query_interceptor_opt.as_ref(); let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) { @@ -482,7 +482,7 @@ impl SqlQueryHandler for Instance { } async fn do_exec_plan(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result { - let _timer = timer!(metrics::METRIC_EXEC_PLAN_ELAPSED); + let _timer = metrics::METRIC_EXEC_PLAN_ELAPSED.start_timer(); // plan should be prepared before exec // we'll do check there self.query_engine @@ -551,7 +551,7 @@ impl PrometheusHandler for Instance { query: &PromQuery, query_ctx: QueryContextRef, ) -> server_error::Result { - let _timer = timer!(metrics::METRIC_HANDLE_PROMQL_ELAPSED); + let _timer = metrics::METRIC_HANDLE_PROMQL_ELAPSED.start_timer(); let interceptor = self .plugins .get::>(); diff --git a/src/frontend/src/instance/otlp.rs 
b/src/frontend/src/instance/otlp.rs index e592739938..6dd73d4070 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -15,7 +15,6 @@ use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use common_error::ext::BoxedError; -use metrics::counter; use opentelemetry_proto::tonic::collector::metrics::v1::{ ExportMetricsServiceRequest, ExportMetricsServiceResponse, }; @@ -51,7 +50,7 @@ impl OpenTelemetryProtocolHandler for Instance { .map_err(BoxedError::new) .context(error::ExecuteGrpcQuerySnafu)?; - counter!(OTLP_METRICS_ROWS, rows as u64); + OTLP_METRICS_ROWS.inc_by(rows as u64); let resp = ExportMetricsServiceResponse { // TODO(sunng87): add support for partial_success in future patch @@ -87,7 +86,7 @@ impl OpenTelemetryProtocolHandler for Instance { .map_err(BoxedError::new) .context(error::ExecuteGrpcQuerySnafu)?; - counter!(OTLP_TRACES_ROWS, rows as u64); + OTLP_TRACES_ROWS.inc_by(rows as u64); let resp = ExportTraceServiceResponse { // TODO(fys): add support for partial_success in future patch diff --git a/src/frontend/src/instance/prom_store.rs b/src/frontend/src/instance/prom_store.rs index c49c7356b0..7b1bd128f0 100644 --- a/src/frontend/src/instance/prom_store.rs +++ b/src/frontend/src/instance/prom_store.rs @@ -21,7 +21,6 @@ use common_error::ext::BoxedError; use common_query::Output; use common_recordbatch::RecordBatches; use common_telemetry::logging; -use metrics::counter; use prost::Message; use servers::error::{self, AuthSnafu, Result as ServerResult}; use servers::prom_store::{self, Metrics}; @@ -161,7 +160,7 @@ impl PromStoreProtocolHandler for Instance { .map_err(BoxedError::new) .context(error::ExecuteGrpcQuerySnafu)?; - counter!(PROM_STORE_REMOTE_WRITE_SAMPLES, samples as u64); + PROM_STORE_REMOTE_WRITE_SAMPLES.inc_by(samples as u64); Ok(()) } diff --git a/src/frontend/src/instance/script.rs b/src/frontend/src/instance/script.rs index f0aee2074b..ba38b5d4af 100644 --- 
a/src/frontend/src/instance/script.rs +++ b/src/frontend/src/instance/script.rs @@ -16,7 +16,6 @@ use std::collections::HashMap; use async_trait::async_trait; use common_query::Output; -use common_telemetry::timer; use servers::query_handler::ScriptHandler; use session::context::QueryContextRef; @@ -31,7 +30,7 @@ impl ScriptHandler for Instance { name: &str, script: &str, ) -> servers::error::Result<()> { - let _timer = timer!(metrics::METRIC_HANDLE_SCRIPTS_ELAPSED); + let _timer = metrics::METRIC_HANDLE_SCRIPTS_ELAPSED.start_timer(); self.script_executor .insert_script(query_ctx, name, script) .await @@ -43,7 +42,7 @@ impl ScriptHandler for Instance { name: &str, params: HashMap, ) -> servers::error::Result { - let _timer = timer!(metrics::METRIC_RUN_SCRIPT_ELAPSED); + let _timer = metrics::METRIC_RUN_SCRIPT_ELAPSED.start_timer(); self.script_executor .execute_script(query_ctx, name, params) .await diff --git a/src/frontend/src/metrics.rs b/src/frontend/src/metrics.rs index b07bf2df9e..f57cdcd179 100644 --- a/src/frontend/src/metrics.rs +++ b/src/frontend/src/metrics.rs @@ -12,14 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub(crate) const METRIC_HANDLE_SQL_ELAPSED: &str = "frontend.handle_sql_elapsed"; -pub(crate) const METRIC_HANDLE_PROMQL_ELAPSED: &str = "frontend.handle_promql_elapsed"; -pub(crate) const METRIC_EXEC_PLAN_ELAPSED: &str = "frontend.exec_plan_elapsed"; -pub(crate) const METRIC_HANDLE_SCRIPTS_ELAPSED: &str = "frontend.handle_scripts_elapsed"; -pub(crate) const METRIC_RUN_SCRIPT_ELAPSED: &str = "frontend.run_script_elapsed"; +use lazy_static::lazy_static; +use prometheus::*; -/// The samples count of Prometheus remote write. -pub const PROM_STORE_REMOTE_WRITE_SAMPLES: &str = "frontend.prometheus.remote_write.samples"; - -pub const OTLP_METRICS_ROWS: &str = "frontend.otlp.metrics.rows"; -pub const OTLP_TRACES_ROWS: &str = "frontend.otlp.traces.rows"; +lazy_static! 
{ + pub static ref METRIC_HANDLE_SQL_ELAPSED: Histogram = + register_histogram!("frontend_handle_sql_elapsed", "frontend handle sql elapsed").unwrap(); + pub static ref METRIC_HANDLE_PROMQL_ELAPSED: Histogram = register_histogram!( + "frontend_handle_promql_elapsed", + "frontend handle promql elapsed" + ) + .unwrap(); + pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram = + register_histogram!("frontend_exec_plan_elapsed", "frontend exec plan elapsed").unwrap(); + pub static ref METRIC_HANDLE_SCRIPTS_ELAPSED: Histogram = register_histogram!( + "frontend_handle_scripts_elapsed", + "frontend handle scripts elapsed" + ) + .unwrap(); + pub static ref METRIC_RUN_SCRIPT_ELAPSED: Histogram = + register_histogram!("frontend_run_script_elapsed", "frontend run script elapsed").unwrap(); + /// The samples count of Prometheus remote write. + pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounter = register_int_counter!( + "frontend_prometheus_remote_write_samples", + "frontend prometheus remote write samples" + ) + .unwrap(); + pub static ref OTLP_METRICS_ROWS: IntCounter = register_int_counter!( + "frontend_otlp_metrics_rows", + "frontend otlp metrics rows" + ) + .unwrap(); + pub static ref OTLP_TRACES_ROWS: IntCounter = register_int_counter!( + "frontend_otlp_traces_rows", + "frontend otlp traces rows" + ) + .unwrap(); +} diff --git a/src/meta-srv/Cargo.toml b/src/meta-srv/Cargo.toml index 0836485e0a..4998969e50 100644 --- a/src/meta-srv/Cargo.toml +++ b/src/meta-srv/Cargo.toml @@ -34,9 +34,9 @@ futures.workspace = true h2 = "0.3" http-body = "0.4" lazy_static.workspace = true -metrics.workspace = true once_cell.workspace = true parking_lot = "0.12" +prometheus.workspace = true prost.workspace = true rand.workspace = true regex.workspace = true diff --git a/src/meta-srv/src/handler.rs b/src/meta-srv/src/handler.rs index f57ba5579e..6fae2721b6 100644 --- a/src/meta-srv/src/handler.rs +++ b/src/meta-srv/src/handler.rs @@ -24,10 +24,9 @@ use api::v1::meta::{ }; use 
common_meta::instruction::{Instruction, InstructionReply}; use common_meta::sequence::Sequence; -use common_telemetry::{debug, info, timer, warn}; +use common_telemetry::{debug, info, warn}; use dashmap::DashMap; use futures::future::join_all; -use metrics::{decrement_gauge, increment_gauge}; use snafu::{OptionExt, ResultExt}; use tokio::sync::mpsc::Sender; use tokio::sync::{oneshot, Notify, RwLock}; @@ -215,14 +214,14 @@ impl HeartbeatHandlerGroup { pub async fn register(&self, key: impl AsRef, pusher: Pusher) { let key = key.as_ref(); - increment_gauge!(METRIC_META_HEARTBEAT_CONNECTION_NUM, 1.0); + METRIC_META_HEARTBEAT_CONNECTION_NUM.inc(); info!("Pusher register: {}", key); let _ = self.pushers.insert(key.to_string(), pusher).await; } pub async fn unregister(&self, key: impl AsRef) -> Option { let key = key.as_ref(); - decrement_gauge!(METRIC_META_HEARTBEAT_CONNECTION_NUM, 1.0); + METRIC_META_HEARTBEAT_CONNECTION_NUM.dec(); info!("Pusher unregister: {}", key); self.pushers.remove(key).await } @@ -252,7 +251,9 @@ impl HeartbeatHandlerGroup { } if handler.is_acceptable(role) { - let _timer = timer!(METRIC_META_HANDLER_EXECUTE, &[("name", *name)]); + let _timer = METRIC_META_HANDLER_EXECUTE + .with_label_values(&[*name]) + .start_timer(); handler.handle(&req, &mut ctx, &mut acc).await?; } } diff --git a/src/meta-srv/src/inactive_region_manager.rs b/src/meta-srv/src/inactive_region_manager.rs index b3e110584f..e00c3b456f 100644 --- a/src/meta-srv/src/inactive_region_manager.rs +++ b/src/meta-srv/src/inactive_region_manager.rs @@ -16,7 +16,6 @@ use std::collections::HashSet; use common_meta::rpc::store::{BatchGetRequest, DeleteRangeRequest, PutRequest, RangeRequest}; use common_meta::RegionIdent; -use metrics::{decrement_gauge, increment_gauge}; use crate::error::Result; use crate::keys::InactiveRegionKey; @@ -46,7 +45,7 @@ impl<'a> InactiveRegionManager<'a> { }; self.store.put(req).await?; - increment_gauge!(METRIC_META_INACTIVE_REGIONS, 1.0); + 
METRIC_META_INACTIVE_REGIONS.inc(); Ok(()) } @@ -61,7 +60,7 @@ impl<'a> InactiveRegionManager<'a> { .into(); self.store.delete(&key, false).await?; - decrement_gauge!(METRIC_META_INACTIVE_REGIONS, 1.0); + METRIC_META_INACTIVE_REGIONS.dec(); Ok(()) } diff --git a/src/meta-srv/src/metrics.rs b/src/meta-srv/src/metrics.rs index e3b2a8d09a..57d4893995 100644 --- a/src/meta-srv/src/metrics.rs +++ b/src/meta-srv/src/metrics.rs @@ -12,7 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub(crate) const METRIC_META_KV_REQUEST: &str = "meta.kv_request"; -pub(crate) const METRIC_META_HEARTBEAT_CONNECTION_NUM: &str = "meta.heartbeat_connection_num"; -pub(crate) const METRIC_META_HANDLER_EXECUTE: &str = "meta.handler_execute"; -pub const METRIC_META_INACTIVE_REGIONS: &str = "meta.inactive_regions"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + pub static ref METRIC_META_KV_REQUEST: HistogramVec = register_histogram_vec!( + "meta_kv_request", + "meta kv request", + &["target", "op", "cluster_id"] + ) + .unwrap(); + pub static ref METRIC_META_HEARTBEAT_CONNECTION_NUM: IntGauge = register_int_gauge!( + "meta_heartbeat_connection_num", + "meta heartbeat connection num" + ) + .unwrap(); + pub static ref METRIC_META_HANDLER_EXECUTE: HistogramVec = + register_histogram_vec!("meta_handler_execute", "meta handler execute", &["name"]).unwrap(); + pub static ref METRIC_META_INACTIVE_REGIONS: IntGauge = + register_int_gauge!("meta_inactive_regions", "meta inactive regions").unwrap(); +} diff --git a/src/meta-srv/src/service/store.rs b/src/meta-srv/src/service/store.rs index 7a74e4499b..aa58501653 100644 --- a/src/meta-srv/src/service/store.rs +++ b/src/meta-srv/src/service/store.rs @@ -32,7 +32,6 @@ use common_meta::rpc::store::{ BatchDeleteRequest, BatchGetRequest, BatchPutRequest, CompareAndPutRequest, DeleteRangeRequest, PutRequest, RangeRequest, }; -use common_telemetry::timer; use 
snafu::OptionExt; use tonic::{Request, Response}; @@ -51,15 +50,11 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? .cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "range".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[self.kv_store().name(), "range", cluster_id_str.as_str()]) + .start_timer(); let req: RangeRequest = req.into(); @@ -77,15 +72,11 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? .cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "put".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[self.kv_store().name(), "put", cluster_id_str.as_str()]) + .start_timer(); let req: PutRequest = req.into(); @@ -103,15 +94,11 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? .cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "batch_get".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[self.kv_store().name(), "batch_get", cluster_id_str.as_str()]) + .start_timer(); let req: BatchGetRequest = req.into(); @@ -129,15 +116,11 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? 
.cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "batch_pub".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[self.kv_store().name(), "batch_pub", cluster_id_str.as_str()]) + .start_timer(); let req: BatchPutRequest = req.into(); @@ -158,15 +141,15 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? .cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "batch_delete".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[ + self.kv_store().name(), + "batch_delete", + cluster_id_str.as_str(), + ]) + .start_timer(); let req: BatchDeleteRequest = req.into(); @@ -187,15 +170,15 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? .cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "compare_and_put".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[ + self.kv_store().name(), + "compare_and_put", + cluster_id_str.as_str(), + ]) + .start_timer(); let req: CompareAndPutRequest = req.into(); @@ -216,15 +199,15 @@ impl store_server::Store for MetaSrv { .as_ref() .context(MissingRequestHeaderSnafu)? 
.cluster_id; + let cluster_id_str = cluster_id.to_string(); - let _timer = timer!( - METRIC_META_KV_REQUEST, - &[ - ("target", self.kv_store().name().to_string()), - ("op", "delete_range".to_string()), - ("cluster_id", cluster_id.to_string()), - ] - ); + let _timer = METRIC_META_KV_REQUEST + .with_label_values(&[ + self.kv_store().name(), + "delete_range", + cluster_id_str.as_str(), + ]) + .start_timer(); let req: DeleteRangeRequest = req.into(); diff --git a/src/meta-srv/src/service/store/etcd.rs b/src/meta-srv/src/service/store/etcd.rs index c6c0d6be03..9df574708a 100644 --- a/src/meta-srv/src/service/store/etcd.rs +++ b/src/meta-srv/src/service/store/etcd.rs @@ -24,7 +24,6 @@ use common_meta::rpc::store::{ DeleteRangeResponse, PutRequest, PutResponse, RangeRequest, RangeResponse, }; use common_meta::rpc::KeyValue; -use common_telemetry::timer; use etcd_client::{ Client, Compare, CompareOp, DeleteOptions, GetOptions, PutOptions, Txn, TxnOp, TxnOpResponse, TxnResponse, @@ -295,10 +294,9 @@ impl TxnService for EtcdStore { type Error = Error; async fn txn(&self, txn: KvTxn) -> Result { - let _timer = timer!( - METRIC_META_TXN_REQUEST, - &[("target", "etcd".to_string()), ("op", "txn".to_string()),] - ); + let _timer = METRIC_META_TXN_REQUEST + .with_label_values(&["etcd", "txn"]) + .start_timer(); let etcd_txn: Txn = txn.into(); let txn_res = self diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 851b0f02a8..ff48b5818b 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -40,11 +40,11 @@ futures.workspace = true humantime-serde = { workspace = true } lazy_static = "1.4" memcomparable = "0.2" -metrics.workspace = true moka = { workspace = true, features = ["sync"] } object-store = { workspace = true } parquet = { workspace = true, features = ["async"] } paste.workspace = true +prometheus.workspace = true prost.workspace = true regex = "1.5" serde = { version = "1.0", features = ["derive"] } diff --git a/src/mito2/src/compaction.rs 
b/src/mito2/src/compaction.rs index 5d843fc373..a3c8300050 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -22,7 +22,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Instant; -use common_telemetry::{debug, error, timer}; +use common_telemetry::{debug, error}; pub use picker::CompactionPickerRef; use snafu::ResultExt; use store_api::storage::RegionId; @@ -33,7 +33,7 @@ use crate::compaction::twcs::TwcsPicker; use crate::error::{ CompactRegionSnafu, Error, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result, }; -use crate::metrics::{COMPACTION_STAGE_ELAPSED, STAGE_LABEL}; +use crate::metrics::COMPACTION_STAGE_ELAPSED; use crate::region::options::CompactionOptions; use crate::region::version::{VersionControlRef, VersionRef}; use crate::request::{OptionOutputTx, OutputTx, WorkerRequest}; @@ -180,7 +180,9 @@ impl CompactionScheduler { picker, region_id ); - let pick_timer = timer!(COMPACTION_STAGE_ELAPSED, &[(STAGE_LABEL, "pick")]); + let pick_timer = COMPACTION_STAGE_ELAPSED + .with_label_values(&["pick"]) + .start_timer(); let Some(mut task) = picker.pick(request) else { // Nothing to compact, remove it from the region status map. 
self.region_status.remove(®ion_id); diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index d9a586d0f6..92df30e92f 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -19,11 +19,10 @@ use std::time::{Duration, Instant}; use common_base::readable_size::ReadableSize; use common_query::Output; -use common_telemetry::{debug, error, info, timer}; +use common_telemetry::{debug, error, info}; use common_time::timestamp::TimeUnit; use common_time::timestamp_millis::BucketAligned; use common_time::Timestamp; -use metrics::increment_counter; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; use store_api::storage::RegionId; @@ -35,7 +34,7 @@ use crate::compaction::picker::{CompactionTask, Picker}; use crate::compaction::CompactionRequest; use crate::error; use crate::error::CompactRegionSnafu; -use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED, STAGE_LABEL}; +use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED}; use crate::request::{ BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest, }; @@ -316,10 +315,12 @@ impl TwcsCompactionTask { async fn handle_compaction(&mut self) -> error::Result<(Vec, Vec)> { self.mark_files_compacting(true); - let merge_timer = timer!(COMPACTION_STAGE_ELAPSED, &[(STAGE_LABEL, "merge")]); + let merge_timer = COMPACTION_STAGE_ELAPSED + .with_label_values(&["merge"]) + .start_timer(); let (output, mut compacted) = self.merge_ssts().await.map_err(|e| { error!(e; "Failed to compact region: {}", self.region_id); - merge_timer.discard(); + merge_timer.stop_and_discard(); e })?; compacted.extend(self.expired_ssts.iter().map(FileHandle::meta)); @@ -328,7 +329,7 @@ impl TwcsCompactionTask { /// Handles compaction failure, notifies all waiters. fn on_failure(&mut self, err: Arc) { - increment_counter!(COMPACTION_FAILURE_COUNT); + COMPACTION_FAILURE_COUNT.inc(); for waiter in self.waiters.drain(..) 
{ waiter.send(Err(err.clone()).context(CompactRegionSnafu { region_id: self.region_id, diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index 6251a79376..daaff6626c 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -45,7 +45,6 @@ use async_trait::async_trait; use common_error::ext::BoxedError; use common_query::Output; use common_recordbatch::SendableRecordBatchStream; -use common_telemetry::timer; use object_store::manager::ObjectStoreManagerRef; use snafu::{OptionExt, ResultExt}; use store_api::logstore::LogStore; @@ -56,7 +55,7 @@ use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; use crate::error::{RecvSnafu, RegionNotFoundSnafu, Result}; -use crate::metrics::{HANDLE_REQUEST_ELAPSED, TYPE_LABEL}; +use crate::metrics::HANDLE_REQUEST_ELAPSED; use crate::read::scan_region::{ScanRegion, Scanner}; use crate::region::RegionUsage; use crate::request::WorkerRequest; @@ -146,7 +145,9 @@ impl EngineInner { /// Handles [RegionRequest] and return its executed result. 
async fn handle_request(&self, region_id: RegionId, request: RegionRequest) -> Result { - let _timer = timer!(HANDLE_REQUEST_ELAPSED, &[(TYPE_LABEL, request.type_name())]); + let _timer = HANDLE_REQUEST_ELAPSED + .with_label_values(&[request.type_name()]) + .start_timer(); let (request, receiver) = WorkerRequest::try_from_region_request(region_id, request)?; self.workers.submit_to_worker(region_id, request).await?; diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 1417bf352c..8bbbc6c94c 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -19,8 +19,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use common_query::Output; -use common_telemetry::{error, info, timer}; -use metrics::{counter, increment_counter}; +use common_telemetry::{error, info}; use snafu::ResultExt; use store_api::storage::RegionId; use strum::IntoStaticStr; @@ -31,10 +30,7 @@ use crate::error::{ Error, FlushRegionSnafu, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, Result, }; use crate::memtable::MemtableBuilderRef; -use crate::metrics::{ - FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_ERRORS_TOTAL, FLUSH_REASON, FLUSH_REQUESTS_TOTAL, - TYPE_LABEL, -}; +use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_ERRORS_TOTAL, FLUSH_REQUESTS_TOTAL}; use crate::read::Source; use crate::region::version::{VersionControlData, VersionControlRef, VersionRef}; use crate::request::{ @@ -244,7 +240,7 @@ impl RegionFlushTask { /// Runs the flush task. 
async fn do_flush(&mut self, version_data: VersionControlData) { - let timer = timer!(FLUSH_ELAPSED, &[(TYPE_LABEL, "total")]); + let timer = FLUSH_ELAPSED.with_label_values(&["total"]).start_timer(); self.listener.on_flush_begin(self.region_id).await; let worker_request = match self.flush_memtables(&version_data.version).await { Ok(file_metas) => { @@ -264,7 +260,7 @@ impl RegionFlushTask { memtables_to_remove, senders: std::mem::take(&mut self.senders), file_purger: self.file_purger.clone(), - timer, + _timer: timer, }; WorkerRequest::Background { region_id: self.region_id, @@ -274,7 +270,7 @@ impl RegionFlushTask { Err(e) => { error!(e; "Failed to flush region {}", self.region_id); // Discard the timer. - timer.discard(); + timer.stop_and_discard(); let err = Arc::new(e); self.on_failure(err.clone()); @@ -289,7 +285,9 @@ impl RegionFlushTask { /// Flushes memtables to level 0 SSTs. async fn flush_memtables(&self, version: &VersionRef) -> Result> { - let timer = timer!(FLUSH_ELAPSED, &[(TYPE_LABEL, "flush_memtables")]); + let timer = FLUSH_ELAPSED + .with_label_values(&["flush_memtables"]) + .start_timer(); // TODO(yingwen): Make it configurable. 
let mut write_opts = WriteOptions::default(); @@ -328,7 +326,7 @@ impl RegionFlushTask { } if !file_metas.is_empty() { - counter!(FLUSH_BYTES_TOTAL, flushed_bytes); + FLUSH_BYTES_TOTAL.inc_by(flushed_bytes); } let file_ids: Vec<_> = file_metas.iter().map(|f| f.file_id).collect(); @@ -337,7 +335,7 @@ impl RegionFlushTask { version.metadata.region_id, self.reason.as_str(), file_ids, - timer.elapsed(), + timer.stop_and_record(), ); Ok(file_metas) @@ -392,7 +390,9 @@ impl FlushScheduler { ) -> Result<()> { debug_assert_eq!(region_id, task.region_id); - increment_counter!(FLUSH_REQUESTS_TOTAL, FLUSH_REASON => task.reason.as_str()); + FLUSH_REQUESTS_TOTAL + .with_label_values(&[task.reason.as_str()]) + .inc(); let version = version_control.current().version; if version.memtables.mutable.is_empty() && version.memtables.immutables().is_empty() { @@ -474,7 +474,7 @@ impl FlushScheduler { pub(crate) fn on_flush_failed(&mut self, region_id: RegionId, err: Arc) { error!(err; "Region {} failed to flush, cancel all pending tasks", region_id); - increment_counter!(FLUSH_ERRORS_TOTAL); + FLUSH_ERRORS_TOTAL.inc(); // Remove this region. let Some(flush_status) = self.region_status.remove(®ion_id) else { diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index 0ced4f5472..1ce3509220 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -24,7 +24,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; use common_time::Timestamp; -use metrics::{decrement_gauge, increment_gauge}; use store_api::metadata::RegionMetadataRef; use store_api::storage::ColumnId; use table::predicate::Predicate; @@ -131,7 +130,7 @@ impl AllocTracker { /// Tracks `bytes` memory is allocated. 
pub(crate) fn on_allocation(&self, bytes: usize) { self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); - increment_gauge!(WRITE_BUFFER_BYTES, bytes as f64); + WRITE_BUFFER_BYTES.add(bytes as i64); if let Some(write_buffer_manager) = &self.write_buffer_manager { write_buffer_manager.reserve_mem(bytes); } @@ -167,7 +166,7 @@ impl Drop for AllocTracker { } let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed); - decrement_gauge!(WRITE_BUFFER_BYTES, bytes_allocated as f64); + WRITE_BUFFER_BYTES.sub(bytes_allocated as i64); // Memory tracked by this tracker is freed. if let Some(write_buffer_manager) = &self.write_buffer_manager { diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index 4b32ec877e..257bb537e5 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -12,49 +12,97 @@ // See the License for the specific language governing permissions and // limitations under the License. +use lazy_static::lazy_static; +use prometheus::*; + /// Stage label. pub const STAGE_LABEL: &str = "stage"; - -/// Global write buffer size in bytes. -pub const WRITE_BUFFER_BYTES: &str = "mito.write_buffer_bytes"; /// Type label. pub const TYPE_LABEL: &str = "type"; -/// Gauge for open regions -pub const REGION_COUNT: &str = "mito.region_count"; -/// Elapsed time to handle requests. -pub const HANDLE_REQUEST_ELAPSED: &str = "mito.handle_request.elapsed"; - -// ------ Flush related metrics -/// Counter of scheduled flush requests. -/// Note that the flush scheduler may merge some flush requests. -pub const FLUSH_REQUESTS_TOTAL: &str = "mito.flush.requests_total"; /// Reason to flush. pub const FLUSH_REASON: &str = "reason"; -/// Counter of scheduled failed flush jobs. -pub const FLUSH_ERRORS_TOTAL: &str = "mito.flush.errors_total"; -/// Elapsed time of a flush job. -pub const FLUSH_ELAPSED: &str = "mito.flush.elapsed"; -/// Histogram of flushed bytes. 
-pub const FLUSH_BYTES_TOTAL: &str = "mito.flush.bytes_total"; -// ------ End of flush related metrics -// ------ Write related metrics -/// Counter of stalled write requests. -pub const WRITE_STALL_TOTAL: &str = "mito.write.stall_total"; -/// Counter of rejected write requests. -pub const WRITE_REJECT_TOTAL: &str = "mito.write.reject_total"; -/// Elapsed time of each write stage. -pub const WRITE_STAGE_ELAPSED: &str = "mito.write.stage_elapsed"; -/// Counter of rows to write. -pub const WRITE_ROWS_TOTAL: &str = "mito.write.rows_total"; -// ------ End of write related metrics +lazy_static! { + /// Global write buffer size in bytes. + pub static ref WRITE_BUFFER_BYTES: IntGauge = + register_int_gauge!("mito_write_buffer_bytes", "mito write buffer bytes").unwrap(); + /// Gauge for open regions + pub static ref REGION_COUNT: IntGauge = + register_int_gauge!("mito_region_count", "mito region count").unwrap(); + /// Elapsed time to handle requests. + pub static ref HANDLE_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!( + "mito_handle_request_elapsed", + "mito handle request elapsed", + &[TYPE_LABEL] + ) + .unwrap(); -// Compaction metrics -/// Timer of different stages in compaction. -pub const COMPACTION_STAGE_ELAPSED: &str = "mito.compaction.stage_elapsed"; -/// Timer of whole compaction task. -pub const COMPACTION_ELAPSED_TOTAL: &str = "mito.compaction.total_elapsed"; -/// Counter of all requested compaction task. -pub const COMPACTION_REQUEST_COUNT: &str = "mito.compaction.requests_total"; -/// Counter of failed compaction task. -pub const COMPACTION_FAILURE_COUNT: &str = "mito.compaction.failure_total"; + + + // ------ Flush related metrics + /// Counter of scheduled flush requests. + /// Note that the flush scheduler may merge some flush requests. 
+ pub static ref FLUSH_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!( + "mito_flush_requests_total", + "mito flush requests total", + &[FLUSH_REASON] + ) + .unwrap(); + /// Counter of scheduled failed flush jobs. + pub static ref FLUSH_ERRORS_TOTAL: IntCounter = + register_int_counter!("mito_flush_errors_total", "mito flush errors total").unwrap(); + /// Elapsed time of a flush job. + pub static ref FLUSH_ELAPSED: HistogramVec = register_histogram_vec!( + "mito_flush_elapsed", + "mito flush elapsed", + &[TYPE_LABEL] + ) + .unwrap(); + /// Histogram of flushed bytes. + pub static ref FLUSH_BYTES_TOTAL: IntCounter = + register_int_counter!("mito_flush_bytes_total", "mito flush bytes total").unwrap(); + // ------ End of flush related metrics + + + // ------ Write related metrics + /// Counter of stalled write requests. + pub static ref WRITE_STALL_TOTAL: IntCounter = + register_int_counter!("mito_write_stall_total", "mito write stall total").unwrap(); + /// Counter of rejected write requests. + pub static ref WRITE_REJECT_TOTAL: IntCounter = + register_int_counter!("mito_write_reject_total", "mito write reject total").unwrap(); + /// Elapsed time of each write stage. + pub static ref WRITE_STAGE_ELAPSED: HistogramVec = register_histogram_vec!( + "mito_write_stage_elapsed", + "mito write stage elapsed", + &[STAGE_LABEL] + ) + .unwrap(); + /// Counter of rows to write. + pub static ref WRITE_ROWS_TOTAL: IntCounterVec = register_int_counter_vec!( + "mito_write_rows_total", + "mito write rows total", + &[TYPE_LABEL] + ) + .unwrap(); + // ------ End of write related metrics + + + // Compaction metrics + /// Timer of different stages in compaction. + pub static ref COMPACTION_STAGE_ELAPSED: HistogramVec = register_histogram_vec!( + "mito_compaction_stage_elapsed", + "mito compaction stage elapsed", + &[STAGE_LABEL] + ) + .unwrap(); + /// Timer of whole compaction task. 
+ pub static ref COMPACTION_ELAPSED_TOTAL: Histogram = + register_histogram!("mito_compaction_total_elapsed", "mito compaction total elapsed").unwrap(); + /// Counter of all requested compaction task. + pub static ref COMPACTION_REQUEST_COUNT: IntCounter = + register_int_counter!("mito_compaction_requests_total", "mito compaction requests total").unwrap(); + /// Counter of failed compaction task. + pub static ref COMPACTION_FAILURE_COUNT: IntCounter = + register_int_counter!("mito_compaction_failure_total", "mito compaction failure total").unwrap(); +} diff --git a/src/mito2/src/request.rs b/src/mito2/src/request.rs index 919eb6d4fa..2d5f0e4fda 100644 --- a/src/mito2/src/request.rs +++ b/src/mito2/src/request.rs @@ -25,11 +25,10 @@ use api::helper::{ use api::v1::{ColumnDataType, ColumnSchema, OpType, Rows, SemanticType, Value}; use common_query::Output; use common_query::Output::AffectedRows; -use common_telemetry::metric::Timer; use common_telemetry::tracing::log::info; use common_telemetry::warn; use datatypes::prelude::DataType; -use metrics::histogram; +use prometheus::HistogramTimer; use prost::Message; use smallvec::SmallVec; use snafu::{ensure, OptionExt, ResultExt}; @@ -596,7 +595,7 @@ pub(crate) struct FlushFinished { /// File purger for cleaning files on failure. pub(crate) file_purger: FilePurgerRef, /// Flush timer. 
- pub(crate) timer: Timer, + pub(crate) _timer: HistogramTimer, } impl FlushFinished { @@ -655,7 +654,7 @@ pub(crate) struct CompactionFinished { impl CompactionFinished { pub fn on_success(self) { // only update compaction time on success - histogram!(COMPACTION_ELAPSED_TOTAL, self.start_time.elapsed()); + COMPACTION_ELAPSED_TOTAL.observe(self.start_time.elapsed().as_secs_f64()); for sender in self.senders { sender.send(Ok(AffectedRows(0))); diff --git a/src/mito2/src/worker/handle_close.rs b/src/mito2/src/worker/handle_close.rs index 8020cc8b7a..c9e152baa3 100644 --- a/src/mito2/src/worker/handle_close.rs +++ b/src/mito2/src/worker/handle_close.rs @@ -16,7 +16,6 @@ use common_query::Output; use common_telemetry::info; -use metrics::decrement_gauge; use store_api::storage::RegionId; use crate::error::Result; @@ -40,7 +39,7 @@ impl RegionWorkerLoop { info!("Region {} closed", region_id); - decrement_gauge!(REGION_COUNT, 1.0); + REGION_COUNT.dec(); Ok(Output::AffectedRows(0)) } diff --git a/src/mito2/src/worker/handle_compaction.rs b/src/mito2/src/worker/handle_compaction.rs index 26d81c23d7..58fd714c30 100644 --- a/src/mito2/src/worker/handle_compaction.rs +++ b/src/mito2/src/worker/handle_compaction.rs @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_telemetry::{error, info, timer}; -use metrics::increment_counter; +use common_telemetry::{error, info}; use store_api::logstore::LogStore; use store_api::storage::RegionId; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; -use crate::metrics::{COMPACTION_REQUEST_COUNT, COMPACTION_STAGE_ELAPSED, STAGE_LABEL}; +use crate::metrics::{COMPACTION_REQUEST_COUNT, COMPACTION_STAGE_ELAPSED}; use crate::request::{CompactionFailed, CompactionFinished, OnFailure, OptionOutputTx}; use crate::worker::RegionWorkerLoop; @@ -32,7 +31,7 @@ impl RegionWorkerLoop { let Some(region) = self.regions.writable_region_or(region_id, &mut sender) else { return; }; - increment_counter!(COMPACTION_REQUEST_COUNT); + COMPACTION_REQUEST_COUNT.inc(); if let Err(e) = self.compaction_scheduler.schedule_compaction( region.region_id, ®ion.version_control, @@ -60,8 +59,9 @@ impl RegionWorkerLoop { }; { - let manifest_timer = - timer!(COMPACTION_STAGE_ELAPSED, &[(STAGE_LABEL, "write_manifest")]); + let manifest_timer = COMPACTION_STAGE_ELAPSED + .with_label_values(&["write_manifest"]) + .start_timer(); // Write region edit to manifest. 
let edit = RegionEdit { files_to_add: std::mem::take(&mut request.compaction_outputs), @@ -74,7 +74,7 @@ impl RegionWorkerLoop { RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())); if let Err(e) = region.manifest_manager.update(action_list).await { error!(e; "Failed to update manifest, region: {}", region_id); - manifest_timer.discard(); + manifest_timer.stop_and_discard(); request.on_failure(e); return; } diff --git a/src/mito2/src/worker/handle_create.rs b/src/mito2/src/worker/handle_create.rs index e983f300bc..a44c82153f 100644 --- a/src/mito2/src/worker/handle_create.rs +++ b/src/mito2/src/worker/handle_create.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use common_query::Output; use common_telemetry::info; -use metrics::increment_gauge; use snafu::ResultExt; use store_api::logstore::LogStore; use store_api::metadata::RegionMetadataBuilder; @@ -73,7 +72,7 @@ impl RegionWorkerLoop { info!("A new region created, region: {:?}", region.metadata()); - increment_gauge!(REGION_COUNT, 1.0); + REGION_COUNT.inc(); // Insert the MitoRegion into the RegionMap. 
self.regions.insert_region(Arc::new(region)); diff --git a/src/mito2/src/worker/handle_drop.rs b/src/mito2/src/worker/handle_drop.rs index cc9542bfdd..50bc0f8133 100644 --- a/src/mito2/src/worker/handle_drop.rs +++ b/src/mito2/src/worker/handle_drop.rs @@ -20,7 +20,6 @@ use common_query::Output; use common_telemetry::info; use common_telemetry::tracing::warn; use futures::TryStreamExt; -use metrics::decrement_gauge; use object_store::util::join_path; use object_store::{EntryMode, ObjectStore}; use snafu::ResultExt; @@ -66,7 +65,7 @@ impl RegionWorkerLoop { region_id ); - decrement_gauge!(REGION_COUNT, 1.0); + REGION_COUNT.dec(); // detach a background task to delete the region dir let region_dir = region.access_layer.region_dir().to_owned(); diff --git a/src/mito2/src/worker/handle_flush.rs b/src/mito2/src/worker/handle_flush.rs index 36a86555d8..24ffce1ce2 100644 --- a/src/mito2/src/worker/handle_flush.rs +++ b/src/mito2/src/worker/handle_flush.rs @@ -186,10 +186,8 @@ impl RegionWorkerLoop { // Delete wal. 
info!( - "Region {} flush finished, elapsed: {:?}, tries to bump wal to {}", - region_id, - request.timer.elapsed(), - request.flushed_entry_id + "Region {} flush finished, tries to bump wal to {}", + region_id, request.flushed_entry_id ); if let Err(e) = self.wal.obsolete(region_id, request.flushed_entry_id).await { error!(e; "Failed to write wal, region: {}", region_id); diff --git a/src/mito2/src/worker/handle_open.rs b/src/mito2/src/worker/handle_open.rs index da8a1fc04b..e902c78968 100644 --- a/src/mito2/src/worker/handle_open.rs +++ b/src/mito2/src/worker/handle_open.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use common_query::Output; use common_telemetry::info; -use metrics::increment_gauge; use object_store::util::join_path; use snafu::ResultExt; use store_api::logstore::LogStore; @@ -76,7 +75,7 @@ impl RegionWorkerLoop { info!("Region {} is opened", region_id); - increment_gauge!(REGION_COUNT, 1.0); + REGION_COUNT.inc(); // Insert the MitoRegion into the RegionMap. self.regions.insert_region(Arc::new(region)); diff --git a/src/mito2/src/worker/handle_write.rs b/src/mito2/src/worker/handle_write.rs index 873ef726fb..a04596a9ef 100644 --- a/src/mito2/src/worker/handle_write.rs +++ b/src/mito2/src/worker/handle_write.rs @@ -17,16 +17,13 @@ use std::collections::{hash_map, HashMap}; use std::sync::Arc; -use common_telemetry::timer; -use metrics::counter; use store_api::logstore::LogStore; use store_api::metadata::RegionMetadata; use store_api::storage::RegionId; use crate::error::{RejectWriteSnafu, Result}; use crate::metrics::{ - STAGE_LABEL, TYPE_LABEL, WRITE_REJECT_TOTAL, WRITE_ROWS_TOTAL, WRITE_STAGE_ELAPSED, - WRITE_STALL_TOTAL, + WRITE_REJECT_TOTAL, WRITE_ROWS_TOTAL, WRITE_STAGE_ELAPSED, WRITE_STALL_TOTAL, }; use crate::region_write_ctx::RegionWriteCtx; use crate::request::{SenderWriteRequest, WriteRequest}; @@ -56,7 +53,7 @@ impl RegionWorkerLoop { } if self.write_buffer_manager.should_stall() && allow_stall { - counter!(WRITE_STALL_TOTAL, 
write_requests.len() as u64); + WRITE_STALL_TOTAL.inc_by(write_requests.len() as u64); self.stalled_requests.append(&mut write_requests); self.listener.on_write_stall(); @@ -67,7 +64,9 @@ impl RegionWorkerLoop { // Write to WAL. { - let _timer = timer!(WRITE_STAGE_ELAPSED, &[(STAGE_LABEL, "write_wal")]); + let _timer = WRITE_STAGE_ELAPSED + .with_label_values(&["write_wal"]) + .start_timer(); let mut wal_writer = self.wal.writer(); for region_ctx in region_ctxs.values_mut() { if let Err(e) = region_ctx.add_wal_entry(&mut wal_writer).map_err(Arc::new) { @@ -86,16 +85,21 @@ impl RegionWorkerLoop { let (mut put_rows, mut delete_rows) = (0, 0); // Write to memtables. { - let _timer = timer!(WRITE_STAGE_ELAPSED, &[(STAGE_LABEL, "write_memtable")]); + let _timer = WRITE_STAGE_ELAPSED + .with_label_values(&["write_memtable"]) + .start_timer(); for mut region_ctx in region_ctxs.into_values() { region_ctx.write_memtable(); put_rows += region_ctx.put_num; delete_rows += region_ctx.delete_num; } } - - counter!(WRITE_ROWS_TOTAL, put_rows as u64, TYPE_LABEL => "put"); - counter!(WRITE_ROWS_TOTAL, delete_rows as u64, TYPE_LABEL => "delete"); + WRITE_ROWS_TOTAL + .with_label_values(&["put"]) + .inc_by(put_rows as u64); + WRITE_ROWS_TOTAL + .with_label_values(&["delete"]) + .inc_by(delete_rows as u64); } } @@ -167,7 +171,7 @@ impl RegionWorkerLoop { /// Send rejected error to all `write_requests`. 
fn reject_write_requests(write_requests: Vec) { - counter!(WRITE_REJECT_TOTAL, write_requests.len() as u64); + WRITE_REJECT_TOTAL.inc_by(write_requests.len() as u64); for req in write_requests { req.sender.send( diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 9d1d055ef2..911c1eb0b7 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -12,13 +12,14 @@ common-macro.workspace = true common-runtime.workspace = true common-telemetry.workspace = true futures.workspace = true +lazy_static.workspace = true md5 = "0.7" -metrics.workspace = true moka = { workspace = true, features = ["future"] } opendal = { version = "0.40", features = [ "layers-tracing", - "layers-metrics", + "layers-prometheus", ] } +prometheus.workspace = true snafu.workspace = true uuid.workspace = true diff --git a/src/object-store/src/layers/lru_cache/read_cache.rs b/src/object-store/src/layers/lru_cache/read_cache.rs index 586465b996..314d68eb50 100644 --- a/src/object-store/src/layers/lru_cache/read_cache.rs +++ b/src/object-store/src/layers/lru_cache/read_cache.rs @@ -15,7 +15,6 @@ use std::sync::Arc; use common_telemetry::logging::debug; use futures::FutureExt; -use metrics::{decrement_gauge, increment_counter, increment_gauge}; use moka::future::Cache; use moka::notification::ListenerFuture; use opendal::raw::oio::{Page, Read, ReadExt, Reader, WriteExt}; @@ -76,12 +75,12 @@ impl ReadCache { let eviction_listener = move |read_key: Arc, read_result: ReadResult, cause| -> ListenerFuture { // Delete the file from local file cache when it's purged from mem_cache. 
- decrement_gauge!(OBJECT_STORE_LRU_CACHE_ENTRIES, 1.0); + OBJECT_STORE_LRU_CACHE_ENTRIES.dec(); let file_cache_cloned = file_cache_cloned.clone(); async move { if let ReadResult::Success(size) = read_result { - decrement_gauge!(OBJECT_STORE_LRU_CACHE_BYTES, size as f64); + OBJECT_STORE_LRU_CACHE_BYTES.sub(size as i64); let result = file_cache_cloned.delete(&read_key, OpDelete::new()).await; debug!( @@ -147,8 +146,8 @@ impl ReadCache { stat.into_metadata().content_length() }; - increment_gauge!(OBJECT_STORE_LRU_CACHE_ENTRIES, 1.0); - increment_gauge!(OBJECT_STORE_LRU_CACHE_BYTES, size as f64); + OBJECT_STORE_LRU_CACHE_ENTRIES.inc(); + OBJECT_STORE_LRU_CACHE_BYTES.add(size as i64); self.mem_cache .insert(read_key.to_string(), ReadResult::Success(size as u32)) .await; @@ -200,17 +199,21 @@ impl ReadCache { // while reading, we have to fallback to remote read match self.file_cache.read(&read_key, OpRead::default()).await { Ok(ret) => { - increment_counter!(OBJECT_STORE_LRU_CACHE_HIT, "result" => "success"); + OBJECT_STORE_LRU_CACHE_HIT + .with_label_values(&["success"]) + .inc(); Ok(to_output_reader(ret)) } Err(_) => { - increment_counter!(OBJECT_STORE_LRU_CACHE_MISS); + OBJECT_STORE_LRU_CACHE_MISS.inc(); inner.read(path, args).await.map(to_output_reader) } } } ReadResult::NotFound => { - increment_counter!(OBJECT_STORE_LRU_CACHE_HIT, "result" => "not_found"); + OBJECT_STORE_LRU_CACHE_HIT + .with_label_values(&["not_found"]) + .inc(); Err(OpendalError::new( ErrorKind::NotFound, @@ -231,7 +234,7 @@ impl ReadCache { where I: Accessor, { - increment_counter!(OBJECT_STORE_LRU_CACHE_MISS); + OBJECT_STORE_LRU_CACHE_MISS.inc(); let inner_result = inner.read(path, args).await; @@ -247,22 +250,25 @@ impl ReadCache { writer.close().await?; let read_bytes = rp.metadata().content_length() as u32; - increment_gauge!(OBJECT_STORE_LRU_CACHE_ENTRIES, 1.0); - increment_gauge!(OBJECT_STORE_LRU_CACHE_BYTES, read_bytes as f64); + OBJECT_STORE_LRU_CACHE_ENTRIES.inc(); + 
OBJECT_STORE_LRU_CACHE_BYTES.add(read_bytes as i64); Ok(ReadResult::Success(read_bytes)) } Err(e) if e.kind() == ErrorKind::NotFound => { - increment_counter!(OBJECT_STORE_READ_ERROR, "kind" => format!("{}", e.kind())); - increment_gauge!(OBJECT_STORE_LRU_CACHE_ENTRIES, 1.0); + OBJECT_STORE_READ_ERROR + .with_label_values(&[e.kind().to_string().as_str()]) + .inc(); + OBJECT_STORE_LRU_CACHE_ENTRIES.inc(); Ok(ReadResult::NotFound) } Err(e) => { - increment_counter!(OBJECT_STORE_READ_ERROR, "kind" => format!("{}", e.kind())); - + OBJECT_STORE_READ_ERROR + .with_label_values(&[e.kind().to_string().as_str()]) + .inc(); Err(e) } } diff --git a/src/object-store/src/metrics.rs b/src/object-store/src/metrics.rs index e46862b734..96016d7660 100644 --- a/src/object-store/src/metrics.rs +++ b/src/object-store/src/metrics.rs @@ -15,12 +15,36 @@ //! object-store metrics /// Cache hit counter, no matter what the cache result is. -pub const OBJECT_STORE_LRU_CACHE_HIT: &str = "object_store.lru_cache.hit"; -/// Cache miss counter -pub const OBJECT_STORE_LRU_CACHE_MISS: &str = "object_store.lru_cache.miss"; -/// Object store read error counter -pub const OBJECT_STORE_READ_ERROR: &str = "object_store.read.errors"; -/// Cache entry number -pub const OBJECT_STORE_LRU_CACHE_ENTRIES: &str = "object_store.lru_cache.entries"; -/// Cache size in bytes -pub const OBJECT_STORE_LRU_CACHE_BYTES: &str = "object_store.lru_cache.bytes"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + /// Cache hit counter, no matter what the cache result is. 
+ pub static ref OBJECT_STORE_LRU_CACHE_HIT: IntCounterVec = register_int_counter_vec!( + "object_store_lru_cache_hit", + "object store lru cache hit", + &["result"] + ) + .unwrap(); + /// Cache miss counter + pub static ref OBJECT_STORE_LRU_CACHE_MISS: IntCounter = + register_int_counter!("object_store_lru_cache_miss", "object store lru cache miss") + .unwrap(); + /// Object store read error counter + pub static ref OBJECT_STORE_READ_ERROR: IntCounterVec = register_int_counter_vec!( + "object_store_read_errors", + "object store read errors", + &["kind"] + ) + .unwrap(); + + /// Cache entry number + pub static ref OBJECT_STORE_LRU_CACHE_ENTRIES: IntGauge = + register_int_gauge!("object_store_lru_cache_entries", "object store lru cache entries") + .unwrap(); + + /// Cache size in bytes + pub static ref OBJECT_STORE_LRU_CACHE_BYTES: IntGauge = + register_int_gauge!("object_store_lru_cache_bytes", "object store lru cache bytes") + .unwrap(); +} diff --git a/src/object-store/tests/object_store_test.rs b/src/object-store/tests/object_store_test.rs index d43a3af6f2..2937defc41 100644 --- a/src/object-store/tests/object_store_test.rs +++ b/src/object-store/tests/object_store_test.rs @@ -16,7 +16,7 @@ use std::env; use std::sync::Arc; use anyhow::Result; -use common_telemetry::{logging, metric}; +use common_telemetry::logging; use common_test_util::temp_dir::create_temp_dir; use object_store::layers::LruCacheLayer; use object_store::services::{Fs, S3}; @@ -290,7 +290,6 @@ async fn assert_cache_files( #[tokio::test] async fn test_object_store_cache_policy() -> Result<()> { common_telemetry::init_default_ut_logging(); - common_telemetry::init_default_metrics_recorder(); // create file storage let root_dir = create_temp_dir("test_object_store_cache_policy"); let store = OperatorBuilder::new( @@ -426,8 +425,7 @@ async fn test_object_store_cache_policy() -> Result<()> { ) .await; - let handle = metric::try_handle().unwrap(); - let metric_text = handle.render(); + let 
metric_text = common_telemetry::dump_metrics().unwrap(); assert!(metric_text.contains("object_store_lru_cache_hit")); assert!(metric_text.contains("object_store_lru_cache_miss")); diff --git a/src/operator/Cargo.toml b/src/operator/Cargo.toml index 26e48c3b41..1c1bf0a54a 100644 --- a/src/operator/Cargo.toml +++ b/src/operator/Cargo.toml @@ -34,12 +34,13 @@ datatypes = { workspace = true } file-engine = { workspace = true } futures = "0.3" futures-util.workspace = true +lazy_static.workspace = true meta-client = { workspace = true } meter-core.workspace = true meter-macros.workspace = true -metrics.workspace = true object-store = { workspace = true } partition = { workspace = true } +prometheus.workspace = true query = { workspace = true } regex.workspace = true serde.workspace = true diff --git a/src/operator/src/delete.rs b/src/operator/src/delete.rs index 1efd757ee4..849760bf53 100644 --- a/src/operator/src/delete.rs +++ b/src/operator/src/delete.rs @@ -23,7 +23,6 @@ use common_meta::datanode_manager::{AffectedRows, DatanodeManagerRef}; use common_meta::peer::Peer; use common_query::Output; use futures_util::future; -use metrics::counter; use partition::manager::PartitionRuleManagerRef; use session::context::QueryContextRef; use snafu::{ensure, OptionExt, ResultExt}; @@ -142,7 +141,7 @@ impl Deleter { let results = future::try_join_all(tasks).await.context(JoinTaskSnafu)?; let affected_rows = results.into_iter().sum::>()?; - counter!(crate::metrics::DIST_DELETE_ROW_COUNT, affected_rows); + crate::metrics::DIST_DELETE_ROW_COUNT.inc_by(affected_rows); Ok(affected_rows) } diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index 856baa9627..078727997e 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -30,7 +30,6 @@ use common_telemetry::{error, info}; use datatypes::schema::Schema; use futures_util::future; use meter_macros::write_meter; -use metrics::counter; use partition::manager::PartitionRuleManagerRef; use 
session::context::QueryContextRef; use snafu::prelude::*; @@ -175,7 +174,7 @@ impl Inserter { let results = future::try_join_all(tasks).await.context(JoinTaskSnafu)?; let affected_rows = results.into_iter().sum::>()?; - counter!(crate::metrics::DIST_INGEST_ROW_COUNT, affected_rows); + crate::metrics::DIST_INGEST_ROW_COUNT.inc_by(affected_rows); Ok(affected_rows) } diff --git a/src/operator/src/metrics.rs b/src/operator/src/metrics.rs index 1ece70c2b4..577d843cc0 100644 --- a/src/operator/src/metrics.rs +++ b/src/operator/src/metrics.rs @@ -12,6 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub const DIST_CREATE_TABLE: &str = "table.operator.create_table"; -pub const DIST_INGEST_ROW_COUNT: &str = "table.operator.ingest_rows"; -pub const DIST_DELETE_ROW_COUNT: &str = "table.operator.delete_rows"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + pub static ref DIST_CREATE_TABLE: Histogram = + register_histogram!("table_operator_create_table", "table operator create table").unwrap(); + pub static ref DIST_INGEST_ROW_COUNT: IntCounter = + register_int_counter!("table_operator_ingest_rows", "table operator ingest rows").unwrap(); + pub static ref DIST_DELETE_ROW_COUNT: IntCounter = + register_int_counter!("table_operator_delete_rows", "table operator delete rows").unwrap(); +} diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs index 820c51d9b5..581f265604 100644 --- a/src/operator/src/statement/ddl.rs +++ b/src/operator/src/statement/ddl.rs @@ -77,7 +77,7 @@ impl StatementExecutor { create_table: &mut CreateTableExpr, partitions: Option, ) -> Result { - let _timer = common_telemetry::timer!(crate::metrics::DIST_CREATE_TABLE); + let _timer = crate::metrics::DIST_CREATE_TABLE.start_timer(); let schema = self .table_metadata_manager .schema_manager() diff --git a/src/partition/Cargo.toml b/src/partition/Cargo.toml index e56b1d6df9..093d1059e2 100644 
--- a/src/partition/Cargo.toml +++ b/src/partition/Cargo.toml @@ -17,8 +17,10 @@ datafusion-common.workspace = true datafusion-expr.workspace = true datafusion.workspace = true datatypes = { workspace = true } +lazy_static.workspace = true meta-client = { workspace = true } moka = { workspace = true, features = ["future"] } +prometheus.workspace = true serde.workspace = true serde_json = "1.0" snafu.workspace = true diff --git a/src/partition/src/manager.rs b/src/partition/src/manager.rs index 76e6870775..6a7f74d30d 100644 --- a/src/partition/src/manager.rs +++ b/src/partition/src/manager.rs @@ -21,7 +21,6 @@ use common_meta::kv_backend::KvBackendRef; use common_meta::peer::Peer; use common_meta::rpc::router::{convert_to_region_map, RegionRoutes}; use common_query::prelude::Expr; -use common_telemetry::timer; use datafusion_expr::{BinaryExpr, Expr as DfExpr, Operator}; use datatypes::prelude::Value; use snafu::{ensure, OptionExt, ResultExt}; @@ -68,7 +67,7 @@ impl PartitionRuleManager { /// Find table route of given table name. pub async fn find_table_route(&self, table_id: TableId) -> Result { - let _timer = timer!(METRIC_TABLE_ROUTE_GET); + let _timer = METRIC_TABLE_ROUTE_GET.start_timer(); let route = self .table_route_manager .get(table_id) diff --git a/src/partition/src/metrics.rs b/src/partition/src/metrics.rs index 18b921383e..91e34bd532 100644 --- a/src/partition/src/metrics.rs +++ b/src/partition/src/metrics.rs @@ -12,4 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub(crate) const METRIC_TABLE_ROUTE_GET: &str = "frontend.table_route.get"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! 
{ + pub static ref METRIC_TABLE_ROUTE_GET: Histogram = + register_histogram!("frontend_table_route_get", "frontend table route get").unwrap(); +} diff --git a/src/promql/Cargo.toml b/src/promql/Cargo.toml index a1fe7f4510..766796923c 100644 --- a/src/promql/Cargo.toml +++ b/src/promql/Cargo.toml @@ -18,7 +18,8 @@ datafusion.workspace = true datatypes = { workspace = true } futures = "0.3" greptime-proto.workspace = true -metrics = { workspace = true } +lazy_static.workspace = true +prometheus.workspace = true promql-parser = "0.1.1" prost.workspace = true session = { workspace = true } diff --git a/src/promql/src/extension_plan/series_divide.rs b/src/promql/src/extension_plan/series_divide.rs index fdf2c9cdf9..772ee079eb 100644 --- a/src/promql/src/extension_plan/series_divide.rs +++ b/src/promql/src/extension_plan/series_divide.rs @@ -282,7 +282,7 @@ impl Stream for SeriesDivideStream { let batch = match ready!(self.as_mut().fetch_next_batch(cx)) { Some(Ok(batch)) => batch, None => { - metrics::histogram!(PROMQL_SERIES_COUNT, self.num_series as f64); + PROMQL_SERIES_COUNT.observe(self.num_series as f64); return Poll::Ready(None); } error => return Poll::Ready(error), diff --git a/src/promql/src/metrics.rs b/src/promql/src/metrics.rs index b8bebf7a43..5d3c49ac37 100644 --- a/src/promql/src/metrics.rs +++ b/src/promql/src/metrics.rs @@ -12,5 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -/// Counter for the number of series processed per query. -pub static PROMQL_SERIES_COUNT: &str = "promql.series_count"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + /// Counter for the number of series processed per query. 
+ pub static ref PROMQL_SERIES_COUNT: Histogram = + register_histogram!("promql_series_count", "promql series count").unwrap(); +} diff --git a/src/query/Cargo.toml b/src/query/Cargo.toml index b922708821..47ab9859d0 100644 --- a/src/query/Cargo.toml +++ b/src/query/Cargo.toml @@ -38,10 +38,11 @@ futures = "0.3" futures-util.workspace = true greptime-proto.workspace = true humantime = "2.1" -metrics.workspace = true +lazy_static.workspace = true object-store.workspace = true once_cell.workspace = true partition.workspace = true +prometheus.workspace = true promql-parser = "0.1.1" promql.workspace = true regex.workspace = true diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index 5b3b8d41fe..caa72f22a1 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -34,7 +34,6 @@ use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_recordbatch::{ EmptyRecordBatchStream, RecordBatch, RecordBatches, SendableRecordBatchStream, }; -use common_telemetry::timer; use datafusion::common::Column; use datafusion::physical_plan::analyze::AnalyzeExec; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -287,7 +286,7 @@ impl QueryEngine for DatafusionQueryEngine { impl LogicalOptimizer for DatafusionQueryEngine { fn optimize(&self, plan: &LogicalPlan) -> Result { - let _timer = timer!(metrics::METRIC_OPTIMIZE_LOGICAL_ELAPSED); + let _timer = metrics::METRIC_OPTIMIZE_LOGICAL_ELAPSED.start_timer(); match plan { LogicalPlan::DfPlan(df_plan) => { let optimized_plan = self @@ -311,7 +310,7 @@ impl PhysicalPlanner for DatafusionQueryEngine { ctx: &mut QueryEngineContext, logical_plan: &LogicalPlan, ) -> Result> { - let _timer = timer!(metrics::METRIC_CREATE_PHYSICAL_ELAPSED); + let _timer = metrics::METRIC_CREATE_PHYSICAL_ELAPSED.start_timer(); match logical_plan { LogicalPlan::DfPlan(df_plan) => { let state = ctx.state(); @@ -344,7 +343,7 @@ impl PhysicalOptimizer for DatafusionQueryEngine { ctx: &mut 
QueryEngineContext, plan: Arc, ) -> Result> { - let _timer = timer!(metrics::METRIC_OPTIMIZE_PHYSICAL_ELAPSED); + let _timer = metrics::METRIC_OPTIMIZE_PHYSICAL_ELAPSED.start_timer(); let state = ctx.state(); let config = state.config_options(); @@ -391,7 +390,7 @@ impl QueryExecutor for DatafusionQueryEngine { ctx: &QueryEngineContext, plan: &Arc, ) -> Result { - let _timer = timer!(metrics::METRIC_EXEC_PLAN_ELAPSED); + let _timer = metrics::METRIC_EXEC_PLAN_ELAPSED.start_timer(); let task_ctx = ctx.build_task_ctx(); match plan.output_partitioning().partition_count() { diff --git a/src/query/src/dist_plan/merge_scan.rs b/src/query/src/dist_plan/merge_scan.rs index b47bf1949b..91a7cb4441 100644 --- a/src/query/src/dist_plan/merge_scan.rs +++ b/src/query/src/dist_plan/merge_scan.rs @@ -166,7 +166,7 @@ impl MergeScanExec { let trace_id = trace_id().unwrap_or_default(); let stream = Box::pin(stream!({ - metrics::histogram!(METRIC_MERGE_SCAN_REGIONS, regions.len() as f64); + METRIC_MERGE_SCAN_REGIONS.observe(regions.len() as f64); let _finish_timer = metric.finish_time().timer(); let mut ready_timer = metric.ready_time().timer(); let mut first_consume_timer = Some(metric.first_consume_time().timer()); @@ -185,7 +185,7 @@ impl MergeScanExec { .do_get(request) .await .map_err(|e| { - metrics::increment_counter!(METRIC_MERGE_SCAN_ERRORS_TOTAL); + METRIC_MERGE_SCAN_ERRORS_TOTAL.inc(); BoxedError::new(e) }) .context(ExternalSnafu)?; @@ -211,7 +211,7 @@ impl MergeScanExec { // reset poll timer poll_timer = Instant::now(); } - metrics::histogram!(METRIC_MERGE_SCAN_POLL_ELAPSED, poll_duration.as_secs_f64()); + METRIC_MERGE_SCAN_POLL_ELAPSED.observe(poll_duration.as_secs_f64()); } })); diff --git a/src/query/src/metrics.rs b/src/query/src/metrics.rs index 7efac4e414..73d1344cee 100644 --- a/src/query/src/metrics.rs +++ b/src/query/src/metrics.rs @@ -11,13 +11,41 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. +use lazy_static::lazy_static; +use prometheus::*; -pub static METRIC_PARSE_SQL_ELAPSED: &str = "query.parse_sql_elapsed"; -pub static METRIC_PARSE_PROMQL_ELAPSED: &str = "query.parse_promql_elapsed"; -pub static METRIC_OPTIMIZE_LOGICAL_ELAPSED: &str = "query.optimize_logicalplan_elapsed"; -pub static METRIC_OPTIMIZE_PHYSICAL_ELAPSED: &str = "query.optimize_physicalplan_elapsed"; -pub static METRIC_CREATE_PHYSICAL_ELAPSED: &str = "query.create_physicalplan_elapsed"; -pub static METRIC_EXEC_PLAN_ELAPSED: &str = "query.execute_plan_elapsed"; -pub static METRIC_MERGE_SCAN_POLL_ELAPSED: &str = "query.merge_scan.poll_elapsed"; -pub static METRIC_MERGE_SCAN_REGIONS: &str = "query.merge_scan.regions"; -pub static METRIC_MERGE_SCAN_ERRORS_TOTAL: &str = "query.merge_scan.errors_total"; +lazy_static! { + pub static ref METRIC_PARSE_SQL_ELAPSED: Histogram = + register_histogram!("query_parse_sql_elapsed", "query parse sql elapsed").unwrap(); + pub static ref METRIC_PARSE_PROMQL_ELAPSED: Histogram = + register_histogram!("query_parse_promql_elapsed", "query parse promql elapsed").unwrap(); + pub static ref METRIC_OPTIMIZE_LOGICAL_ELAPSED: Histogram = register_histogram!( + "query_optimize_logicalplan_elapsed", + "query optimize logicalplan elapsed" + ) + .unwrap(); + pub static ref METRIC_OPTIMIZE_PHYSICAL_ELAPSED: Histogram = register_histogram!( + "query_optimize_physicalplan_elapsed", + "query optimize physicalplan elapsed" + ) + .unwrap(); + pub static ref METRIC_CREATE_PHYSICAL_ELAPSED: Histogram = register_histogram!( + "query_create_physicalplan_elapsed", + "query create physicalplan elapsed" + ) + .unwrap(); + pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram = + register_histogram!("query_execute_plan_elapsed", "query execute plan elapsed").unwrap(); + pub static ref METRIC_MERGE_SCAN_POLL_ELAPSED: Histogram = register_histogram!( + 
"query_merge_scan_poll_elapsed", + "query merge scan poll elapsed" + ) + .unwrap(); + pub static ref METRIC_MERGE_SCAN_REGIONS: Histogram = + register_histogram!("query_merge_scan_regions", "query merge scan regions").unwrap(); + pub static ref METRIC_MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!( + "query_merge_scan_errors_total", + "query merge scan errors total" + ) + .unwrap(); +} diff --git a/src/query/src/parser.rs b/src/query/src/parser.rs index 92113290c8..8a7316c376 100644 --- a/src/query/src/parser.rs +++ b/src/query/src/parser.rs @@ -20,7 +20,6 @@ use std::time::{Duration, SystemTime}; use chrono::DateTime; use common_error::ext::{BoxedError, PlainError}; use common_error::status_code::StatusCode; -use common_telemetry::timer; use promql_parser::parser::ast::{Extension as NodeExtension, ExtensionExpr}; use promql_parser::parser::Expr::Extension; use promql_parser::parser::{EvalStmt, Expr, ValueType}; @@ -106,7 +105,7 @@ pub struct QueryLanguageParser {} impl QueryLanguageParser { pub fn parse_sql(sql: &str) -> Result { - let _timer = timer!(METRIC_PARSE_SQL_ELAPSED); + let _timer = METRIC_PARSE_SQL_ELAPSED.start_timer(); let mut statement = ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}) .map_err(BoxedError::new) .context(QueryParseSnafu { @@ -123,7 +122,7 @@ impl QueryLanguageParser { } pub fn parse_promql(query: &PromQuery) -> Result { - let _timer = timer!(METRIC_PARSE_PROMQL_ELAPSED); + let _timer = METRIC_PARSE_PROMQL_ELAPSED.start_timer(); let expr = promql_parser::parser::parse(&query.query) .map_err(|msg| BoxedError::new(PlainError::new(msg, StatusCode::InvalidArguments))) diff --git a/src/script/Cargo.toml b/src/script/Cargo.toml index 0d42ab96f9..87a107e9ad 100644 --- a/src/script/Cargo.toml +++ b/src/script/Cargo.toml @@ -47,8 +47,10 @@ datafusion-physical-expr = { workspace = true, optional = true } datatypes = { workspace = true } futures-util.workspace = true futures.workspace = true +lazy_static.workspace = 
true once_cell.workspace = true paste = { workspace = true, optional = true } +prometheus.workspace = true query = { workspace = true } # TODO(discord9): This is a forked and tweaked version of RustPython, please update it to newest original RustPython After RustPython support GC pyo3 = { version = "0.19", optional = true, features = ["abi3", "abi3-py37"] } diff --git a/src/script/src/python/metric.rs b/src/script/src/python/metric.rs index 16b6de288f..c5d2c927eb 100644 --- a/src/script/src/python/metric.rs +++ b/src/script/src/python/metric.rs @@ -12,13 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Script engine metrics -pub static METRIC_RSPY_INIT_ELAPSED: &str = "script.rspy.init_elapsed"; -pub static METRIC_RSPY_EXEC_ELAPSED: &str = "script.rspy.exec_elapsed"; -pub static METRIC_RSPY_EXEC_TOTAL_ELAPSED: &str = "script.rspy.exec_total_elapsed"; +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + pub static ref METRIC_RSPY_INIT_ELAPSED: Histogram = + register_histogram!("script_rspy_init_elapsed", "script rspy init elapsed").unwrap(); + pub static ref METRIC_RSPY_EXEC_ELAPSED: Histogram = + register_histogram!("script_rspy_exec_elapsed", "script rspy exec elapsed").unwrap(); + pub static ref METRIC_RSPY_EXEC_TOTAL_ELAPSED: Histogram = register_histogram!( + "script_rspy_exec_total_elapsed", + "script rspy exec total elapsed" + ) + .unwrap(); +} + #[cfg(feature = "pyo3_backend")] -pub static METRIC_PYO3_EXEC_ELAPSED: &str = "script.pyo3.exec_elapsed"; -#[cfg(feature = "pyo3_backend")] -pub static METRIC_PYO3_INIT_ELAPSED: &str = "script.pyo3.init_elapsed"; -#[cfg(feature = "pyo3_backend")] -pub static METRIC_PYO3_EXEC_TOTAL_ELAPSED: &str = "script.pyo3.exec_total_elapsed"; +lazy_static! 
{ + pub static ref METRIC_PYO3_EXEC_ELAPSED: Histogram = + register_histogram!("script_pyo3_exec_elapsed", "script pyo3 exec elapsed").unwrap(); + pub static ref METRIC_PYO3_INIT_ELAPSED: Histogram = + register_histogram!("script_pyo3_init_elapsed", "script pyo3 init elapsed").unwrap(); + pub static ref METRIC_PYO3_EXEC_TOTAL_ELAPSED: Histogram = register_histogram!( + "script_pyo3_exec_total_elapsed", + "script pyo3 exec total elapsed" + ) + .unwrap(); +} diff --git a/src/script/src/python/pyo3/copr_impl.rs b/src/script/src/python/pyo3/copr_impl.rs index 91352ae449..1a23ab8132 100644 --- a/src/script/src/python/pyo3/copr_impl.rs +++ b/src/script/src/python/pyo3/copr_impl.rs @@ -16,7 +16,6 @@ use std::collections::HashMap; use arrow::compute; use common_recordbatch::RecordBatch; -use common_telemetry::timer; use datafusion_common::ScalarValue; use datatypes::prelude::ConcreteDataType; use datatypes::vectors::{Helper, VectorRef}; @@ -67,7 +66,7 @@ pub(crate) fn pyo3_exec_parsed( rb: &Option, params: &HashMap, ) -> Result { - let _t = timer!(metric::METRIC_PYO3_EXEC_TOTAL_ELAPSED); + let _t = metric::METRIC_PYO3_EXEC_TOTAL_ELAPSED.start_timer(); // i.e params or use `vector(..)` to construct a PyVector let arg_names = &copr.deco_args.arg_names.clone().unwrap_or_default(); let args: Vec = if let Some(rb) = rb { @@ -80,7 +79,7 @@ pub(crate) fn pyo3_exec_parsed( // Just in case cpython is not inited init_cpython_interpreter().unwrap(); Python::with_gil(|py| -> Result<_> { - let _t = timer!(metric::METRIC_PYO3_EXEC_ELAPSED); + let _t = metric::METRIC_PYO3_EXEC_ELAPSED.start_timer(); let mut cols = (|| -> PyResult<_> { let dummy_decorator = " diff --git a/src/script/src/python/pyo3/utils.rs b/src/script/src/python/pyo3/utils.rs index f64ac32c90..facc03a6f7 100644 --- a/src/script/src/python/pyo3/utils.rs +++ b/src/script/src/python/pyo3/utils.rs @@ -15,7 +15,7 @@ use std::sync::{Arc, Mutex}; use arrow::pyarrow::PyArrowException; -use common_telemetry::{info, timer}; +use 
common_telemetry::info; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue; use datatypes::arrow::datatypes::DataType as ArrowDataType; @@ -40,7 +40,7 @@ pub(crate) fn to_py_err(err: impl ToString) -> PyErr { /// init cpython interpreter with `greptime` builtins, if already inited, do nothing pub(crate) fn init_cpython_interpreter() -> PyResult<()> { - let _t = timer!(metric::METRIC_PYO3_INIT_ELAPSED); + let _t = metric::METRIC_PYO3_INIT_ELAPSED.start_timer(); let mut start = START_PYO3.lock().unwrap(); if !*start { pyo3::append_to_inittab!(greptime_builtins); diff --git a/src/script/src/python/rspython/copr_impl.rs b/src/script/src/python/rspython/copr_impl.rs index 7420ead142..84ebd1c544 100644 --- a/src/script/src/python/rspython/copr_impl.rs +++ b/src/script/src/python/rspython/copr_impl.rs @@ -18,7 +18,7 @@ use std::rc::Rc; use std::result::Result as StdResult; use common_recordbatch::RecordBatch; -use common_telemetry::{info, timer}; +use common_telemetry::info; use datatypes::vectors::VectorRef; use rustpython_vm::builtins::{PyBaseExceptionRef, PyDict, PyStr, PyTuple}; use rustpython_vm::class::PyClassImpl; @@ -45,7 +45,7 @@ pub(crate) fn rspy_exec_parsed( rb: &Option, params: &HashMap, ) -> Result { - let _t = timer!(metric::METRIC_RSPY_EXEC_TOTAL_ELAPSED); + let _t = metric::METRIC_RSPY_EXEC_TOTAL_ELAPSED.start_timer(); // 3. 
get args from `rb`, and cast them into PyVector let args: Vec = if let Some(rb) = rb { let arg_names = copr.deco_args.arg_names.clone().unwrap_or_default(); @@ -102,7 +102,7 @@ pub(crate) fn exec_with_cached_vm( vm: &Rc, ) -> Result { vm.enter(|vm| -> Result { - let _t = timer!(metric::METRIC_RSPY_EXEC_ELAPSED); + let _t = metric::METRIC_RSPY_EXEC_ELAPSED.start_timer(); // set arguments with given name and values let scope = vm.new_scope_with_builtins(); @@ -189,7 +189,7 @@ fn try_into_columns( /// init interpreter with type PyVector and Module: greptime pub(crate) fn init_interpreter() -> Rc { - let _t = timer!(metric::METRIC_RSPY_INIT_ELAPSED); + let _t = metric::METRIC_RSPY_INIT_ELAPSED.start_timer(); INTERPRETER.with(|i| { i.borrow_mut() .get_or_insert_with(|| { diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 6c8623a692..01714f1bf9 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -50,9 +50,7 @@ humantime-serde.workspace = true hyper = { version = "0.14", features = ["full"] } influxdb_line_protocol = { git = "https://github.com/evenyag/influxdb_iox", branch = "feat/line-protocol" } itertools.workspace = true -metrics.workspace = true -# metrics-process 1.0.10 depends on metrics-0.21 but opendal depends on metrics-0.20.1 -metrics-process = { version = "<1.0.10", optional = true } +lazy_static.workspace = true mime_guess = "2.0" num_cpus = "1.13" once_cell.workspace = true @@ -68,6 +66,7 @@ pprof = { version = "0.11", features = [ "prost-codec", "protobuf", ], optional = true } +prometheus.workspace = true promql-parser = "0.1.1" prost.workspace = true query = { workspace = true } diff --git a/src/servers/src/grpc/greptime_handler.rs b/src/servers/src/grpc/greptime_handler.rs index f8dbe30101..ef2481b91a 100644 --- a/src/servers/src/grpc/greptime_handler.rs +++ b/src/servers/src/grpc/greptime_handler.rs @@ -28,16 +28,12 @@ use common_error::status_code::StatusCode; use common_query::Output; use common_runtime::Runtime; use 
common_telemetry::{logging, TRACE_ID}; -use metrics::{histogram, increment_counter}; use session::context::{QueryContextBuilder, QueryContextRef}; use snafu::{OptionExt, ResultExt}; use crate::error::Error::UnsupportedAuthScheme; use crate::error::{AuthSnafu, InvalidQuerySnafu, JoinTaskSnafu, NotFoundAuthHeaderSnafu, Result}; -use crate::metrics::{ - METRIC_AUTH_FAILURE, METRIC_CODE_LABEL, METRIC_DB_LABEL, METRIC_SERVER_GRPC_DB_REQUEST_TIMER, - METRIC_TYPE_LABEL, -}; +use crate::metrics::{METRIC_AUTH_FAILURE, METRIC_SERVER_GRPC_DB_REQUEST_TIMER}; use crate::query_handler::grpc::ServerGrpcQueryHandlerRef; #[derive(Clone)] @@ -136,10 +132,9 @@ pub(crate) async fn auth( } .map(Some) .map_err(|e| { - increment_counter!( - METRIC_AUTH_FAILURE, - &[(METRIC_CODE_LABEL, format!("{}", e.status_code()))] - ); + METRIC_AUTH_FAILURE + .with_label_values(&[e.status_code().as_ref()]) + .inc(); e }) } @@ -204,14 +199,12 @@ impl RequestTimer { impl Drop for RequestTimer { fn drop(&mut self) { - histogram!( - METRIC_SERVER_GRPC_DB_REQUEST_TIMER, - self.start.elapsed(), - &[ - (METRIC_DB_LABEL, std::mem::take(&mut self.db)), - (METRIC_TYPE_LABEL, std::mem::take(&mut self.request_type)), - (METRIC_CODE_LABEL, self.status_code.to_string()) - ] - ); + METRIC_SERVER_GRPC_DB_REQUEST_TIMER + .with_label_values(&[ + self.db.as_str(), + self.request_type.as_str(), + self.status_code.as_ref(), + ]) + .observe(self.start.elapsed().as_secs_f64()); } } diff --git a/src/servers/src/grpc/prom_query_gateway.rs b/src/servers/src/grpc/prom_query_gateway.rs index cff82f5265..1910f0c6de 100644 --- a/src/servers/src/grpc/prom_query_gateway.rs +++ b/src/servers/src/grpc/prom_query_gateway.rs @@ -24,7 +24,6 @@ use async_trait::async_trait; use auth::UserProviderRef; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; -use common_telemetry::timer; use common_time::util::current_time_rfc3339; use promql_parser::parser::ValueType; use query::parser::PromQuery; @@ -112,10 +111,10 @@ 
impl PrometheusGatewayService { ctx: Arc, is_range_query: bool, ) -> PrometheusJsonResponse { - let _timer = timer!( - crate::metrics::METRIC_SERVER_GRPC_PROM_REQUEST_TIMER, - &[(crate::metrics::METRIC_DB_LABEL, ctx.get_db_string())] - ); + let db = ctx.get_db_string(); + let _timer = crate::metrics::METRIC_SERVER_GRPC_PROM_REQUEST_TIMER + .with_label_values(&[db.as_str()]) + .start_timer(); let result = self.handler.do_query(&query, ctx).await; let (metric_name, mut result_type) = diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 884f79d48d..7ca2b4261d 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -69,8 +69,7 @@ use crate::http::prometheus::{ instant_query, label_values_query, labels_query, range_query, series_query, }; use crate::metrics::{ - METRIC_CODE_LABEL, METRIC_HTTP_REQUESTS_ELAPSED, METRIC_HTTP_REQUESTS_TOTAL, - METRIC_METHOD_LABEL, METRIC_PATH_LABEL, + HTTP_TRACK_METRICS, METRIC_HTTP_REQUESTS_ELAPSED, METRIC_HTTP_REQUESTS_TOTAL, }; use crate::metrics_handler::MetricsHandler; use crate::prometheus_handler::PrometheusHandlerRef; @@ -674,7 +673,9 @@ impl HttpServer { /// A middleware to record metrics for HTTP. 
// Based on https://github.com/tokio-rs/axum/blob/axum-v0.6.16/examples/prometheus-metrics/src/main.rs pub(crate) async fn track_metrics(req: Request, next: Next) -> impl IntoResponse { - let _timer = common_telemetry::timer!("http_track_metrics", &[("tag", "value")]); + let _timer = HTTP_TRACK_METRICS + .with_label_values(&["value"]) + .start_timer(); let start = Instant::now(); let path = if let Some(matched_path) = req.extensions().get::() { matched_path.as_str().to_owned() @@ -687,15 +688,13 @@ pub(crate) async fn track_metrics(req: Request, next: Next) -> impl Int let latency = start.elapsed().as_secs_f64(); let status = response.status().as_u16().to_string(); + let method_str = method.to_string(); - let labels = [ - (METRIC_METHOD_LABEL, method.to_string()), - (METRIC_PATH_LABEL, path), - (METRIC_CODE_LABEL, status), - ]; - - metrics::increment_counter!(METRIC_HTTP_REQUESTS_TOTAL, &labels); - metrics::histogram!(METRIC_HTTP_REQUESTS_ELAPSED, latency, &labels); + let labels = [method_str.as_str(), path.as_str(), status.as_str()]; + METRIC_HTTP_REQUESTS_TOTAL.with_label_values(&labels).inc(); + METRIC_HTTP_REQUESTS_ELAPSED + .with_label_values(&labels) + .observe(latency); response } diff --git a/src/servers/src/http/authorize.rs b/src/servers/src/http/authorize.rs index 03c7bb735f..188d8b7430 100644 --- a/src/servers/src/http/authorize.rs +++ b/src/servers/src/http/authorize.rs @@ -24,7 +24,6 @@ use common_telemetry::warn; use futures::future::BoxFuture; use headers::Header; use http_body::Body; -use metrics::increment_counter; use secrecy::SecretString; use session::context::QueryContext; use snafu::{ensure, OptionExt, ResultExt}; @@ -89,13 +88,9 @@ where Ok((username, password)) => (username, password), Err(e) => { warn!("extract username and password failed: {}", e); - increment_counter!( - crate::metrics::METRIC_AUTH_FAILURE, - &[( - crate::metrics::METRIC_CODE_LABEL, - format!("{}", e.status_code()) - )] - ); + crate::metrics::METRIC_AUTH_FAILURE + 
.with_label_values(&[e.status_code().as_ref()]) + .inc(); return Err(unauthorized_resp()); } }; @@ -116,13 +111,9 @@ where } Err(e) => { warn!("authenticate failed: {}", e); - increment_counter!( - crate::metrics::METRIC_AUTH_FAILURE, - &[( - crate::metrics::METRIC_CODE_LABEL, - format!("{}", e.status_code()) - )] - ); + crate::metrics::METRIC_AUTH_FAILURE + .with_label_values(&[e.status_code().as_ref()]) + .inc(); Err(unauthorized_resp()) } } diff --git a/src/servers/src/http/handler.rs b/src/servers/src/http/handler.rs index 90fff97635..d07210266b 100644 --- a/src/servers/src/http/handler.rs +++ b/src/servers/src/http/handler.rs @@ -21,7 +21,6 @@ use axum::extract::{Json, Query, State}; use axum::response::{IntoResponse, Response}; use axum::{Extension, Form}; use common_error::status_code::StatusCode; -use common_telemetry::timer; use query::parser::PromQuery; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -50,10 +49,9 @@ pub async fn sql( let start = Instant::now(); let sql = query_params.sql.or(form_params.sql); let db = query_ctx.get_db_string(); - let _timer = timer!( - crate::metrics::METRIC_HTTP_SQL_ELAPSED, - &[(crate::metrics::METRIC_DB_LABEL, db)] - ); + let _timer = crate::metrics::METRIC_HTTP_SQL_ELAPSED + .with_label_values(&[db.as_str()]) + .start_timer(); let resp = if let Some(sql) = &sql { if let Some(resp) = validate_schema(sql_handler.clone(), query_ctx.clone()).await { @@ -101,10 +99,9 @@ pub async fn promql( let sql_handler = &state.sql_handler; let exec_start = Instant::now(); let db = query_ctx.get_db_string(); - let _timer = timer!( - crate::metrics::METRIC_HTTP_PROMQL_ELAPSED, - &[(crate::metrics::METRIC_DB_LABEL, db)] - ); + let _timer = crate::metrics::METRIC_HTTP_PROMQL_ELAPSED + .with_label_values(&[db.as_str()]) + .start_timer(); if let Some(resp) = validate_schema(sql_handler.clone(), query_ctx.clone()).await { return Json(resp); @@ -127,9 +124,9 @@ pub async fn metrics( State(state): State, Query(_params): 
Query>, ) -> String { - // Collect process metrics. - #[cfg(feature = "metrics-process")] - crate::metrics::PROCESS_COLLECTOR.collect(); + // A default ProcessCollector is registered automatically in prometheus. + // We do not need to explicitly collect process-related data. + // But ProcessCollector only support on linux. #[cfg(not(windows))] if let Some(c) = crate::metrics::jemalloc::JEMALLOC_COLLECTOR.as_ref() { diff --git a/src/servers/src/http/influxdb.rs b/src/servers/src/http/influxdb.rs index 4dab6d00ef..de13085d88 100644 --- a/src/servers/src/http/influxdb.rs +++ b/src/servers/src/http/influxdb.rs @@ -20,7 +20,6 @@ use axum::response::IntoResponse; use axum::Extension; use common_catalog::consts::DEFAULT_SCHEMA_NAME; use common_grpc::writer::Precision; -use common_telemetry::timer; use session::context::QueryContextRef; use crate::error::{Result, TimePrecisionSnafu}; @@ -84,10 +83,9 @@ pub async fn influxdb_write( handler: InfluxdbLineProtocolHandlerRef, ctx: QueryContextRef, ) -> Result { - let _timer = timer!( - crate::metrics::METRIC_HTTP_INFLUXDB_WRITE_ELAPSED, - &[(crate::metrics::METRIC_DB_LABEL, db.to_string())] - ); + let _timer = crate::metrics::METRIC_HTTP_INFLUXDB_WRITE_ELAPSED + .with_label_values(&[db]) + .start_timer(); let request = InfluxdbRequest { precision, lines }; handler.exec(request, ctx).await?; diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index b4ae4ea854..9fa552c32d 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -16,7 +16,6 @@ use axum::extract::{RawBody, State}; use axum::http::header; use axum::response::IntoResponse; use axum::Extension; -use common_telemetry::timer; use hyper::Body; use opentelemetry_proto::tonic::collector::metrics::v1::{ ExportMetricsServiceRequest, ExportMetricsServiceResponse, @@ -37,10 +36,10 @@ pub async fn metrics( Extension(query_ctx): Extension, RawBody(body): RawBody, ) -> Result { - let _timer = timer!( - 
crate::metrics::METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED, - &[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())] - ); + let db = query_ctx.get_db_string(); + let _timer = crate::metrics::METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED + .with_label_values(&[db.as_str()]) + .start_timer(); let request = parse_metrics_body(body).await?; handler .metrics(request, query_ctx) @@ -75,10 +74,10 @@ pub async fn traces( Extension(query_ctx): Extension, RawBody(body): RawBody, ) -> Result { - let _timer = timer!( - crate::metrics::METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED, - &[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())] - ); + let db = query_ctx.get_db_string(); + let _timer = crate::metrics::METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED + .with_label_values(&[db.as_str()]) + .start_timer(); let request = parse_traces_body(body).await?; handler .traces(request, query_ctx) diff --git a/src/servers/src/http/prom_store.rs b/src/servers/src/http/prom_store.rs index 897e9e703e..af1f2a5261 100644 --- a/src/servers/src/http/prom_store.rs +++ b/src/servers/src/http/prom_store.rs @@ -18,7 +18,6 @@ use axum::http::{header, StatusCode}; use axum::response::IntoResponse; use axum::Extension; use common_catalog::consts::DEFAULT_SCHEMA_NAME; -use common_telemetry::timer; use hyper::Body; use prost::Message; use schemars::JsonSchema; @@ -51,14 +50,11 @@ pub async fn remote_write( RawBody(body): RawBody, ) -> Result<(StatusCode, ())> { let request = decode_remote_write_request(body).await?; + let db = params.db.clone().unwrap_or_default(); - let _timer = timer!( - crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED, - &[( - crate::metrics::METRIC_DB_LABEL, - params.db.clone().unwrap_or_default() - )] - ); + let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED + .with_label_values(&[db.as_str()]) + .start_timer(); handler.write(request, query_ctx).await?; Ok((StatusCode::NO_CONTENT, ())) @@ -85,14 +81,11 @@ pub async fn remote_read( RawBody(body): RawBody, ) 
-> Result { let request = decode_remote_read_request(body).await?; + let db = params.db.clone().unwrap_or_default(); - let _timer = timer!( - crate::metrics::METRIC_HTTP_PROM_STORE_READ_ELAPSED, - &[( - crate::metrics::METRIC_DB_LABEL, - params.db.clone().unwrap_or_default() - )] - ); + let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_READ_ELAPSED + .with_label_values(&[db.as_str()]) + .start_timer(); handler.read(request, query_ctx).await } diff --git a/src/servers/src/http/prometheus.rs b/src/servers/src/http/prometheus.rs index b2c96b1293..82389ba66d 100644 --- a/src/servers/src/http/prometheus.rs +++ b/src/servers/src/http/prometheus.rs @@ -24,7 +24,6 @@ use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_query::Output; use common_recordbatch::RecordBatches; -use common_telemetry::timer; use common_time::util::{current_time_rfc3339, yesterday_rfc3339}; use datatypes::prelude::ConcreteDataType; use datatypes::scalars::ScalarVector; @@ -306,7 +305,7 @@ pub async fn instant_query( Extension(query_ctx): Extension, Form(form_params): Form, ) -> Json { - let _timer = timer!(crate::metrics::METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED); + let _timer = crate::metrics::METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED.start_timer(); // Extract time from query string, or use current server time if not specified. 
let time = params .time @@ -346,7 +345,7 @@ pub async fn range_query( Extension(query_ctx): Extension, Form(form_params): Form, ) -> Json { - let _timer = timer!(crate::metrics::METRIC_HTTP_PROMQL_RANGE_QUERY_ELAPSED); + let _timer = crate::metrics::METRIC_HTTP_PROMQL_RANGE_QUERY_ELAPSED.start_timer(); let prom_query = PromQuery { query: params.query.or(form_params.query).unwrap_or_default(), start: params.start.or(form_params.start).unwrap_or_default(), @@ -415,7 +414,7 @@ pub async fn labels_query( Extension(query_ctx): Extension, Form(form_params): Form, ) -> Json { - let _timer = timer!(crate::metrics::METRIC_HTTP_PROMQL_LABEL_QUERY_ELAPSED); + let _timer = crate::metrics::METRIC_HTTP_PROMQL_LABEL_QUERY_ELAPSED.start_timer(); let db = &params.db.unwrap_or(DEFAULT_SCHEMA_NAME.to_string()); let (catalog, schema) = parse_catalog_and_schema_from_db_string(db); @@ -681,7 +680,7 @@ pub async fn label_values_query( Extension(query_ctx): Extension, Query(params): Query, ) -> Json { - let _timer = timer!(crate::metrics::METRIC_HTTP_PROMQL_LABEL_VALUE_QUERY_ELAPSED); + let _timer = crate::metrics::METRIC_HTTP_PROMQL_LABEL_VALUE_QUERY_ELAPSED.start_timer(); let db = &params.db.unwrap_or(DEFAULT_SCHEMA_NAME.to_string()); let (catalog, schema) = parse_catalog_and_schema_from_db_string(db); @@ -807,7 +806,7 @@ pub async fn series_query( Extension(query_ctx): Extension, Form(form_params): Form, ) -> Json { - let _timer = timer!(crate::metrics::METRIC_HTTP_PROMQL_SERIES_QUERY_ELAPSED); + let _timer = crate::metrics::METRIC_HTTP_PROMQL_SERIES_QUERY_ELAPSED.start_timer(); let mut queries: Vec = params.matches.0; if queries.is_empty() { queries = form_params.matches.0; diff --git a/src/servers/src/metrics.rs b/src/servers/src/metrics.rs index 6e9aee8ad1..f5e0be116e 100644 --- a/src/servers/src/metrics.rs +++ b/src/servers/src/metrics.rs @@ -19,6 +19,11 @@ use std::task::{Context, Poll}; use std::time::Instant; use hyper::Body; +use lazy_static::lazy_static; +use prometheus::{ + 
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, + register_int_gauge, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, +}; use tonic::body::BoxBody; use tower::{Layer, Service}; @@ -26,66 +31,172 @@ pub(crate) const METRIC_DB_LABEL: &str = "db"; pub(crate) const METRIC_CODE_LABEL: &str = "code"; pub(crate) const METRIC_TYPE_LABEL: &str = "type"; pub(crate) const METRIC_PROTOCOL_LABEL: &str = "protocol"; - -pub(crate) const METRIC_ERROR_COUNTER: &str = "servers.error"; pub(crate) const METRIC_ERROR_COUNTER_LABEL_MYSQL: &str = "mysql"; - -pub(crate) const METRIC_HTTP_SQL_ELAPSED: &str = "servers.http_sql_elapsed"; -pub(crate) const METRIC_HTTP_PROMQL_ELAPSED: &str = "servers.http_promql_elapsed"; -pub(crate) const METRIC_AUTH_FAILURE: &str = "servers.auth_failure_count"; -pub(crate) const METRIC_HTTP_INFLUXDB_WRITE_ELAPSED: &str = "servers.http_influxdb_write_elapsed"; -pub(crate) const METRIC_HTTP_PROM_STORE_WRITE_ELAPSED: &str = - "servers.http_prometheus_write_elapsed"; -pub(crate) const METRIC_HTTP_PROM_STORE_READ_ELAPSED: &str = "servers.http_prometheus_read_elapsed"; -pub(crate) const METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED: &str = - "servers.http_otlp_metrics_elapsed"; -pub(crate) const METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED: &str = - "servers.http_otlp_traces_elapsed"; -pub(crate) const METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED: &str = - "servers.opentsdb_line_write_elapsed"; -pub(crate) const METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED: &str = - "servers.http_promql_instant_query_elapsed"; -pub(crate) const METRIC_HTTP_PROMQL_RANGE_QUERY_ELAPSED: &str = - "servers.http_promql_range_query_elapsed"; -pub(crate) const METRIC_HTTP_PROMQL_LABEL_QUERY_ELAPSED: &str = - "servers.http_promql_label_query_elapsed"; -pub(crate) const METRIC_HTTP_PROMQL_SERIES_QUERY_ELAPSED: &str = - "servers.http_promql_series_query_elapsed"; -pub(crate) const METRIC_HTTP_PROMQL_LABEL_VALUE_QUERY_ELAPSED: &str = - 
"servers.http_promql_label_value_query_elapsed"; - -pub(crate) const METRIC_MYSQL_CONNECTIONS: &str = "servers.mysql_connection_count"; -pub(crate) const METRIC_MYSQL_QUERY_TIMER: &str = "servers.mysql_query_elapsed"; pub(crate) const METRIC_MYSQL_SUBPROTOCOL_LABEL: &str = "subprotocol"; pub(crate) const METRIC_MYSQL_BINQUERY: &str = "binquery"; pub(crate) const METRIC_MYSQL_TEXTQUERY: &str = "textquery"; -pub(crate) const METRIC_MYSQL_PREPARED_COUNT: &str = "servers.mysql_prepared_count"; - -pub(crate) const METRIC_POSTGRES_CONNECTIONS: &str = "servers.postgres_connection_count"; -pub(crate) const METRIC_POSTGRES_QUERY_TIMER: &str = "servers.postgres_query_elapsed"; pub(crate) const METRIC_POSTGRES_SUBPROTOCOL_LABEL: &str = "subprotocol"; pub(crate) const METRIC_POSTGRES_SIMPLE_QUERY: &str = "simple"; pub(crate) const METRIC_POSTGRES_EXTENDED_QUERY: &str = "extended"; -pub(crate) const METRIC_POSTGRES_PREPARED_COUNT: &str = "servers.postgres_prepared_count"; - -pub(crate) const METRIC_SERVER_GRPC_DB_REQUEST_TIMER: &str = "servers.grpc.db_request_elapsed"; -pub(crate) const METRIC_SERVER_GRPC_PROM_REQUEST_TIMER: &str = "servers.grpc.prom_request_elapsed"; - -pub(crate) const METRIC_HTTP_REQUESTS_TOTAL: &str = "servers.http_requests_total"; -pub(crate) const METRIC_HTTP_REQUESTS_ELAPSED: &str = "servers.http_requests_elapsed"; -pub(crate) const METRIC_GRPC_REQUESTS_TOTAL: &str = "servers.grpc_requests_total"; -pub(crate) const METRIC_GRPC_REQUESTS_ELAPSED: &str = "servers.grpc_requests_elapsed"; pub(crate) const METRIC_METHOD_LABEL: &str = "method"; pub(crate) const METRIC_PATH_LABEL: &str = "path"; -/// Prometheus style process metrics collector. -#[cfg(feature = "metrics-process")] -pub(crate) static PROCESS_COLLECTOR: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let collector = metrics_process::Collector::default(); - collector.describe(); - collector - }); +lazy_static! 
{ + pub static ref METRIC_ERROR_COUNTER: IntCounterVec = + register_int_counter_vec!("servers_error", "servers error", &[METRIC_PROTOCOL_LABEL]) + .unwrap(); + pub static ref METRIC_HTTP_SQL_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_http_sql_elapsed", + "servers http sql elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_PROMQL_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_http_promql_elapsed", + "servers http promql elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_AUTH_FAILURE: IntCounterVec = register_int_counter_vec!( + "servers_auth_failure_count", + "servers auth failure count", + &[METRIC_CODE_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_INFLUXDB_WRITE_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_http_influxdb_write_elapsed", + "servers http influxdb write elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_PROM_STORE_WRITE_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_http_prometheus_write_elapsed", + "servers http prometheus write elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_PROM_STORE_READ_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_http_prometheus_read_elapsed", + "servers http prometheus read elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED: HistogramVec = + register_histogram_vec!( + "servers_http_otlp_metrics_elapsed", + "servers_http_otlp_metrics_elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED: HistogramVec = + register_histogram_vec!( + "servers_http_otlp_traces_elapsed", + "servers http otlp traces elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED: Histogram = register_histogram!( + "servers_opentsdb_line_write_elapsed", + "servers opentsdb line write elapsed" + ) + .unwrap(); + pub static ref 
METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED: Histogram = register_histogram!( + "servers_http_promql_instant_query_elapsed", + "servers http promql instant query elapsed" + ) + .unwrap(); + pub static ref METRIC_HTTP_PROMQL_RANGE_QUERY_ELAPSED: Histogram = register_histogram!( + "servers_http_promql_range_query_elapsed", + "servers http promql range query elapsed" + ) + .unwrap(); + pub static ref METRIC_HTTP_PROMQL_LABEL_QUERY_ELAPSED: Histogram = register_histogram!( + "servers_http_promql_label_query_elapsed", + "servers http promql label query elapsed" + ) + .unwrap(); + pub static ref METRIC_HTTP_PROMQL_SERIES_QUERY_ELAPSED: Histogram = register_histogram!( + "servers_http_promql_series_query_elapsed", + "servers http promql series query elapsed" + ) + .unwrap(); + pub static ref METRIC_HTTP_PROMQL_LABEL_VALUE_QUERY_ELAPSED: Histogram = register_histogram!( + "servers_http_promql_label_value_query_elapsed", + "servers http promql label value query elapsed" + ) + .unwrap(); + pub static ref METRIC_MYSQL_CONNECTIONS: IntGauge = register_int_gauge!( + "servers_mysql_connection_count", + "servers mysql connection count" + ) + .unwrap(); + pub static ref METRIC_MYSQL_QUERY_TIMER: HistogramVec = register_histogram_vec!( + "servers_mysql_query_elapsed", + "servers mysql query elapsed", + &[METRIC_MYSQL_SUBPROTOCOL_LABEL, METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_MYSQL_PREPARED_COUNT: IntCounterVec = register_int_counter_vec!( + "servers_mysql_prepared_count", + "servers mysql prepared count", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_POSTGRES_CONNECTIONS: IntGauge = register_int_gauge!( + "servers_postgres_connection_count", + "servers postgres connection count" + ) + .unwrap(); + pub static ref METRIC_POSTGRES_QUERY_TIMER: HistogramVec = register_histogram_vec!( + "servers_postgres_query_elapsed", + "servers postgres query elapsed", + &[METRIC_POSTGRES_SUBPROTOCOL_LABEL, METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref 
METRIC_POSTGRES_PREPARED_COUNT: IntCounter = register_int_counter!( + "servers_postgres_prepared_count", + "servers postgres prepared count" + ) + .unwrap(); + pub static ref METRIC_SERVER_GRPC_DB_REQUEST_TIMER: HistogramVec = register_histogram_vec!( + "servers_grpc_db_request_elapsed", + "servers grpc db request elapsed", + &[METRIC_DB_LABEL, METRIC_TYPE_LABEL, METRIC_CODE_LABEL] + ) + .unwrap(); + pub static ref METRIC_SERVER_GRPC_PROM_REQUEST_TIMER: HistogramVec = register_histogram_vec!( + "servers_grpc_prom_request_elapsed", + "servers grpc prom request elapsed", + &[METRIC_DB_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!( + "servers_http_requests_total", + "servers http requests total", + &[METRIC_METHOD_LABEL, METRIC_PATH_LABEL, METRIC_CODE_LABEL] + ) + .unwrap(); + pub static ref METRIC_HTTP_REQUESTS_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_http_requests_elapsed", + "servers http requests elapsed", + &[METRIC_METHOD_LABEL, METRIC_PATH_LABEL, METRIC_CODE_LABEL] + ) + .unwrap(); + pub static ref METRIC_GRPC_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!( + "servers_grpc_requests_total", + "servers grpc requests total", + &[METRIC_PATH_LABEL, METRIC_CODE_LABEL] + ) + .unwrap(); + pub static ref METRIC_GRPC_REQUESTS_ELAPSED: HistogramVec = register_histogram_vec!( + "servers_grpc_requests_elapsed", + "servers grpc requests elapsed", + &[METRIC_PATH_LABEL, METRIC_CODE_LABEL] + ) + .unwrap(); + pub static ref HTTP_TRACK_METRICS: HistogramVec = + register_histogram_vec!("http_track_metrics", "http track metrics", &["tag"]).unwrap(); +} // Based on https://github.com/hyperium/tonic/blob/master/examples/src/tower/server.rs // See https://github.com/hyperium/tonic/issues/242 @@ -136,9 +247,11 @@ where let latency = start.elapsed().as_secs_f64(); let status = response.status().as_u16().to_string(); - let labels = [(METRIC_PATH_LABEL, path), (METRIC_CODE_LABEL, status)]; - 
metrics::increment_counter!(METRIC_GRPC_REQUESTS_TOTAL, &labels); - metrics::histogram!(METRIC_GRPC_REQUESTS_ELAPSED, latency, &labels); + let labels = [path.as_str(), status.as_str()]; + METRIC_GRPC_REQUESTS_TOTAL.with_label_values(&labels).inc(); + METRIC_GRPC_REQUESTS_ELAPSED + .with_label_values(&labels) + .observe(latency); Ok(response) }) diff --git a/src/servers/src/metrics/jemalloc.rs b/src/servers/src/metrics/jemalloc.rs index d54eddafde..26cf5a21ac 100644 --- a/src/servers/src/metrics/jemalloc.rs +++ b/src/servers/src/metrics/jemalloc.rs @@ -16,14 +16,25 @@ mod error; use common_telemetry::error; use error::UpdateJemallocMetricsSnafu; -use metrics::gauge; +use lazy_static::lazy_static; use once_cell::sync::Lazy; +use prometheus::*; use snafu::ResultExt; use tikv_jemalloc_ctl::stats::{allocated_mib, resident_mib}; use tikv_jemalloc_ctl::{epoch, epoch_mib, stats}; -pub(crate) const METRIC_JEMALLOC_RESIDENT: &str = "sys.jemalloc.resident"; -pub(crate) const METRIC_JEMALLOC_ALLOCATED: &str = "sys.jemalloc.allocated"; +lazy_static! { + pub static ref SYS_JEMALLOC_RESIDEN: IntGauge = register_int_gauge!( + "sys_jemalloc_resident", + "Total number of bytes in physically resident data pages mapped by the allocator." + ) + .unwrap(); + pub static ref SYS_JEMALLOC_ALLOCATED: IntGauge = register_int_gauge!( + "sys_jemalloc_allocated", + "Total number of bytes allocated by the application." 
+ ) + .unwrap(); +} pub(crate) static JEMALLOC_COLLECTOR: Lazy> = Lazy::new(|| { let collector = JemallocCollector::try_new() @@ -62,8 +73,8 @@ impl JemallocCollector { let _ = self.epoch.advance().context(UpdateJemallocMetricsSnafu)?; let allocated = self.allocated.read().context(UpdateJemallocMetricsSnafu)?; let resident = self.resident.read().context(UpdateJemallocMetricsSnafu)?; - gauge!(METRIC_JEMALLOC_ALLOCATED, allocated as f64); - gauge!(METRIC_JEMALLOC_RESIDENT, resident as f64); + SYS_JEMALLOC_ALLOCATED.set(allocated as i64); + SYS_JEMALLOC_RESIDEN.set(resident as i64); Ok(()) } } diff --git a/src/servers/src/metrics_handler.rs b/src/servers/src/metrics_handler.rs index 89970cf308..82156ce3c6 100644 --- a/src/servers/src/metrics_handler.rs +++ b/src/servers/src/metrics_handler.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_telemetry::metric; +use prometheus::{Encoder, TextEncoder}; /// a server that serves metrics /// only start when datanode starts in distributed mode @@ -21,10 +21,17 @@ pub struct MetricsHandler; impl MetricsHandler { pub fn render(&self) -> String { - if let Some(handle) = metric::try_handle() { - handle.render() - } else { - "Prometheus handle not initialized.".to_owned() + let mut buffer = Vec::new(); + let encoder = TextEncoder::new(); + // Gather the metrics. + let metric_families = prometheus::gather(); + // Encode them to send. 
+ match encoder.encode(&metric_families, &mut buffer) { + Ok(_) => match String::from_utf8(buffer) { + Ok(s) => s, + Err(e) => e.to_string(), + }, + Err(e) => e.to_string(), } } } diff --git a/src/servers/src/mysql/handler.rs b/src/servers/src/mysql/handler.rs index 4674f65b1d..f59cd0ea6c 100644 --- a/src/servers/src/mysql/handler.rs +++ b/src/servers/src/mysql/handler.rs @@ -23,9 +23,8 @@ use chrono::{NaiveDate, NaiveDateTime}; use common_catalog::parse_catalog_and_schema_from_db_string; use common_error::ext::ErrorExt; use common_query::Output; -use common_telemetry::{error, logging, timer, warn}; +use common_telemetry::{error, logging, warn}; use datatypes::prelude::ConcreteDataType; -use metrics::increment_counter; use opensrv_mysql::{ AsyncMysqlShim, Column, ErrorKind, InitWriter, ParamParser, ParamValue, QueryResultWriter, StatementMetaWriter, ValueInner, @@ -43,6 +42,7 @@ use sql::statements::statement::Statement; use tokio::io::AsyncWrite; use crate::error::{self, InvalidPrepareStatementSnafu, Result}; +use crate::metrics::METRIC_AUTH_FAILURE; use crate::mysql::helper::{ self, format_placeholder, replace_placeholders, transform_placeholders, }; @@ -184,7 +184,9 @@ impl AsyncMysqlShim for MysqlInstanceShi user_info = Some(userinfo); } Err(e) => { - increment_counter!(crate::metrics::METRIC_AUTH_FAILURE); + METRIC_AUTH_FAILURE + .with_label_values(&[e.status_code().as_ref()]) + .inc(); warn!("Failed to auth, err: {:?}", e); return false; } @@ -244,10 +246,9 @@ impl AsyncMysqlShim for MysqlInstanceShi }); w.reply(stmt_id, &params, &[]).await?; - increment_counter!( - crate::metrics::METRIC_MYSQL_PREPARED_COUNT, - &[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())] - ); + crate::metrics::METRIC_MYSQL_PREPARED_COUNT + .with_label_values(&[query_ctx.get_db_string().as_str()]) + .inc(); return Ok(()); } @@ -258,16 +259,11 @@ impl AsyncMysqlShim for MysqlInstanceShi w: QueryResultWriter<'a, W>, ) -> Result<()> { let query_ctx = 
self.session.new_query_context(); - let _timer = timer!( - crate::metrics::METRIC_MYSQL_QUERY_TIMER, - &[ - ( - crate::metrics::METRIC_MYSQL_SUBPROTOCOL_LABEL, - crate::metrics::METRIC_MYSQL_BINQUERY.to_string() - ), - (crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string()) - ] - ); + let db = query_ctx.get_db_string(); + let _timer = crate::metrics::METRIC_MYSQL_QUERY_TIMER + .with_label_values(&[crate::metrics::METRIC_MYSQL_BINQUERY, db.as_str()]) + .start_timer(); + let params: Vec = p.into_iter().collect(); let sql_plan = match self.plan(stmt_id) { None => { @@ -326,16 +322,10 @@ impl AsyncMysqlShim for MysqlInstanceShi writer: QueryResultWriter<'a, W>, ) -> Result<()> { let query_ctx = self.session.new_query_context(); - let _timer = timer!( - crate::metrics::METRIC_MYSQL_QUERY_TIMER, - &[ - ( - crate::metrics::METRIC_MYSQL_SUBPROTOCOL_LABEL, - crate::metrics::METRIC_MYSQL_TEXTQUERY.to_string() - ), - (crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string()) - ] - ); + let db = query_ctx.get_db_string(); + let _timer = crate::metrics::METRIC_MYSQL_QUERY_TIMER + .with_label_values(&[crate::metrics::METRIC_MYSQL_TEXTQUERY, db.as_str()]) + .start_timer(); let outputs = self.do_query(query, query_ctx.clone()).await; writer::write_output(writer, query_ctx, outputs).await?; Ok(()) @@ -358,13 +348,9 @@ impl AsyncMysqlShim for MysqlInstanceShi if let Some(schema_validator) = &self.user_provider { if let Err(e) = schema_validator.authorize(catalog, schema, user_info).await { - increment_counter!( - crate::metrics::METRIC_AUTH_FAILURE, - &[( - crate::metrics::METRIC_CODE_LABEL, - format!("{}", e.status_code()) - )] - ); + METRIC_AUTH_FAILURE + .with_label_values(&[e.status_code().as_ref()]) + .inc(); return w .error( ErrorKind::ER_DBACCESS_DENIED_ERROR, diff --git a/src/servers/src/mysql/server.rs b/src/servers/src/mysql/server.rs index 0dd9af79dd..9f2855e21c 100644 --- a/src/servers/src/mysql/server.rs +++ b/src/servers/src/mysql/server.rs @@ -22,7 +22,6 @@ use 
common_runtime::Runtime; use common_telemetry::error; use common_telemetry::logging::{info, warn}; use futures::StreamExt; -use metrics::{decrement_gauge, increment_gauge}; use opensrv_mysql::{ plain_run_with_options, secure_run_with_options, AsyncMysqlIntermediary, IntermediaryOptions, }; @@ -160,13 +159,13 @@ impl MysqlServer { ) -> Result<()> { info!("MySQL connection coming from: {}", stream.peer_addr()?); let _handle = io_runtime.spawn(async move { - increment_gauge!(crate::metrics::METRIC_MYSQL_CONNECTIONS, 1.0); + crate::metrics::METRIC_MYSQL_CONNECTIONS.inc(); if let Err(e) = Self::do_handle(stream, spawn_ref, spawn_config).await { // TODO(LFC): Write this error to client as well, in MySQL text protocol. // Looks like we have to expose opensrv-mysql's `PacketWriter`? warn!(e; "Internal error occurred during query exec, server actively close the channel to let client try next time") } - decrement_gauge!(crate::metrics::METRIC_MYSQL_CONNECTIONS, 1.0); + crate::metrics::METRIC_MYSQL_CONNECTIONS.dec(); }); Ok(()) diff --git a/src/servers/src/mysql/writer.rs b/src/servers/src/mysql/writer.rs index 6e41f72851..a358df1c14 100644 --- a/src/servers/src/mysql/writer.rs +++ b/src/servers/src/mysql/writer.rs @@ -20,7 +20,6 @@ use common_recordbatch::{RecordBatch, SendableRecordBatchStream}; use datatypes::prelude::{ConcreteDataType, Value}; use datatypes::schema::SchemaRef; use futures::StreamExt; -use metrics::increment_counter; use opensrv_mysql::{ Column, ColumnFlags, ColumnType, ErrorKind, OkResponse, QueryResultWriter, RowWriter, }; @@ -209,10 +208,9 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { } async fn write_query_error(error: impl ErrorExt, w: QueryResultWriter<'a, W>) -> Result<()> { - increment_counter!( - METRIC_ERROR_COUNTER, - &[(METRIC_PROTOCOL_LABEL, METRIC_ERROR_COUNTER_LABEL_MYSQL)] - ); + METRIC_ERROR_COUNTER + .with_label_values(&[METRIC_ERROR_COUNTER_LABEL_MYSQL]) + .inc(); let kind = ErrorKind::ER_INTERNAL_ERROR; let error = 
error.output_msg(); diff --git a/src/servers/src/opentsdb/handler.rs b/src/servers/src/opentsdb/handler.rs index a12d54db61..55cebb210b 100644 --- a/src/servers/src/opentsdb/handler.rs +++ b/src/servers/src/opentsdb/handler.rs @@ -15,7 +15,6 @@ //! Modified from Tokio's mini-redis example. use common_error::ext::ErrorExt; -use common_telemetry::timer; use session::context::QueryContextBuilder; use tokio::io::{AsyncRead, AsyncWrite}; @@ -93,7 +92,8 @@ impl Handler { match DataPoint::try_create(&line) { Ok(data_point) => { - let _timer = timer!(crate::metrics::METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED); + let _timer = + crate::metrics::METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED.start_timer(); let result = self.query_handler.exec(vec![data_point], ctx.clone()).await; if let Err(e) = result { self.connection.write_line(e.output_msg()).await?; diff --git a/src/servers/src/postgres/auth_handler.rs b/src/servers/src/postgres/auth_handler.rs index 703e944c2d..c795f381b5 100644 --- a/src/servers/src/postgres/auth_handler.rs +++ b/src/servers/src/postgres/auth_handler.rs @@ -20,7 +20,6 @@ use async_trait::async_trait; use common_catalog::parse_catalog_and_schema_from_db_string; use common_error::ext::ErrorExt; use futures::{Sink, SinkExt}; -use metrics::increment_counter; use pgwire::api::auth::StartupHandler; use pgwire::api::{auth, ClientInfo, PgWireConnectionState}; use pgwire::error::{ErrorInfo, PgWireError, PgWireResult}; @@ -32,6 +31,7 @@ use snafu::IntoError; use super::PostgresServerHandler; use crate::error::{AuthSnafu, Result}; +use crate::metrics::METRIC_AUTH_FAILURE; use crate::query_handler::sql::ServerSqlQueryHandlerRef; pub(crate) struct PgLoginVerifier { @@ -102,13 +102,9 @@ impl PgLoginVerifier { .await { Err(e) => { - increment_counter!( - crate::metrics::METRIC_AUTH_FAILURE, - &[( - crate::metrics::METRIC_CODE_LABEL, - format!("{}", e.status_code()) - )] - ); + METRIC_AUTH_FAILURE + .with_label_values(&[e.status_code().as_ref()]) + .inc(); 
Err(AuthSnafu.into_error(e)) } Ok(user_info) => Ok(Some(user_info)), diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index b72256d7e2..e3ff0c6796 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -19,10 +19,8 @@ use common_error::ext::ErrorExt; use common_query::Output; use common_recordbatch::error::Result as RecordBatchResult; use common_recordbatch::RecordBatch; -use common_telemetry::timer; use datatypes::schema::SchemaRef; use futures::{future, stream, Stream, StreamExt}; -use metrics::increment_counter; use pgwire::api::portal::{Format, Portal}; use pgwire::api::query::{ExtendedQueryHandler, SimpleQueryHandler, StatementOrPortal}; use pgwire::api::results::{DataRowEncoder, DescribeResponse, QueryResponse, Response, Tag}; @@ -48,16 +46,10 @@ impl SimpleQueryHandler for PostgresServerHandler { C: ClientInfo + Unpin + Send + Sync, { let query_ctx = self.session.new_query_context(); - let _timer = timer!( - crate::metrics::METRIC_POSTGRES_QUERY_TIMER, - &[ - ( - crate::metrics::METRIC_POSTGRES_SUBPROTOCOL_LABEL, - crate::metrics::METRIC_POSTGRES_SIMPLE_QUERY.to_string() - ), - (crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string()) - ] - ); + let db = query_ctx.get_db_string(); + let _timer = crate::metrics::METRIC_POSTGRES_QUERY_TIMER + .with_label_values(&[crate::metrics::METRIC_POSTGRES_SIMPLE_QUERY, db.as_str()]) + .start_timer(); let outputs = self.query_handler.do_query(query, query_ctx).await; let mut results = Vec::with_capacity(outputs.len()); @@ -155,7 +147,7 @@ impl QueryParser for DefaultQueryParser { type Statement = SqlPlan; async fn parse_sql(&self, sql: &str, _types: &[Type]) -> PgWireResult { - increment_counter!(crate::metrics::METRIC_POSTGRES_PREPARED_COUNT); + crate::metrics::METRIC_POSTGRES_PREPARED_COUNT.inc(); let query_ctx = self.session.new_query_context(); let mut stmts = ParserContext::create_with_dialect(sql, &PostgreSqlDialect {}) .map_err(|e| 
PgWireError::ApiError(Box::new(e)))?; @@ -216,16 +208,11 @@ impl ExtendedQueryHandler for PostgresServerHandler { C: ClientInfo + Unpin + Send + Sync, { let query_ctx = self.session.new_query_context(); - let _timer = timer!( - crate::metrics::METRIC_POSTGRES_QUERY_TIMER, - &[ - ( - crate::metrics::METRIC_POSTGRES_SUBPROTOCOL_LABEL, - crate::metrics::METRIC_POSTGRES_EXTENDED_QUERY.to_string() - ), - (crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string()) - ] - ); + let db = query_ctx.get_db_string(); + let _timer = crate::metrics::METRIC_POSTGRES_QUERY_TIMER + .with_label_values(&[crate::metrics::METRIC_POSTGRES_EXTENDED_QUERY, db.as_str()]) + .start_timer(); + let sql_plan = portal.statement().statement(); let output = if let Some(plan) = &sql_plan.plan { diff --git a/src/servers/src/postgres/server.rs b/src/servers/src/postgres/server.rs index 56ed7390df..6a4d7a112d 100644 --- a/src/servers/src/postgres/server.rs +++ b/src/servers/src/postgres/server.rs @@ -22,7 +22,6 @@ use common_runtime::Runtime; use common_telemetry::logging::error; use common_telemetry::{debug, warn}; use futures::StreamExt; -use metrics::{decrement_gauge, increment_gauge}; use pgwire::tokio::process_socket; use tokio_rustls::TlsAcceptor; @@ -89,7 +88,7 @@ impl PostgresServer { }; let _handle = io_runtime.spawn(async move { - increment_gauge!(crate::metrics::METRIC_POSTGRES_CONNECTIONS, 1.0); + crate::metrics::METRIC_POSTGRES_CONNECTIONS.inc(); let pg_handler = Arc::new(handler_maker.make(addr)); let r = process_socket( io_stream, @@ -99,7 +98,7 @@ impl PostgresServer { pg_handler, ) .await; - decrement_gauge!(crate::metrics::METRIC_POSTGRES_CONNECTIONS, 1.0); + crate::metrics::METRIC_POSTGRES_CONNECTIONS.dec(); r }); } diff --git a/src/servers/tests/http/http_handler_test.rs b/src/servers/tests/http/http_handler_test.rs index 995e911aca..5b86593f42 100644 --- a/src/servers/tests/http/http_handler_test.rs +++ b/src/servers/tests/http/http_handler_test.rs @@ -17,10 +17,8 @@ use 
std::collections::HashMap; use axum::body::{Body, Bytes}; use axum::extract::{Json, Query, RawBody, State}; use axum::Form; -use common_telemetry::metric; use http_body::combinators::UnsyncBoxBody; use hyper::Response; -use metrics::counter; use servers::http::{ handler as http_handler, script as script_handler, ApiState, GreptimeOptionsConfigState, JsonOutput, @@ -154,11 +152,14 @@ async fn test_sql_form() { } } +lazy_static::lazy_static! { + static ref TEST_METRIC: prometheus::Counter = + prometheus::register_counter!("test_metrics", "test metrics").unwrap(); +} + #[tokio::test] async fn test_metrics() { - metric::init_default_metrics_recorder(); - - counter!("test_metrics", 1); + TEST_METRIC.inc(); let stats = MetricsHandler; let text = http_handler::metrics(axum::extract::State(stats), Query(HashMap::default())).await; assert!(text.contains("test_metrics counter")); diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 5b097c30ac..69a783a947 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -31,10 +31,10 @@ futures-util.workspace = true futures.workspace = true itertools.workspace = true lazy_static.workspace = true -metrics.workspace = true object-store = { workspace = true } parquet = { workspace = true, features = ["async"] } paste.workspace = true +prometheus.workspace = true prost.workspace = true regex = "1.5" serde.workspace = true diff --git a/src/storage/src/compaction/task.rs b/src/storage/src/compaction/task.rs index 43a236c3e5..49188d8a64 100644 --- a/src/storage/src/compaction/task.rs +++ b/src/storage/src/compaction/task.rs @@ -16,7 +16,7 @@ use std::collections::HashSet; use std::fmt::{Debug, Formatter}; use common_base::readable_size::ReadableSize; -use common_telemetry::{debug, error, info, timer}; +use common_telemetry::{debug, error, info}; use itertools::Itertools; use snafu::ResultExt; use store_api::logstore::LogStore; @@ -158,7 +158,7 @@ impl CompactionTaskImpl { #[async_trait::async_trait] impl 
CompactionTask for CompactionTaskImpl { async fn run(mut self) -> Result<()> { - let _timer = timer!(crate::metrics::COMPACT_ELAPSED); + let _timer = crate::metrics::COMPACT_ELAPSED.start_timer(); self.mark_files_compacting(true); let (output, mut compacted) = self.merge_ssts().await.map_err(|e| { @@ -256,7 +256,7 @@ impl CompactionOutput { let opts = WriteOptions { sst_write_buffer_size, }; - let _timer = timer!(crate::metrics::MERGE_ELAPSED); + let _timer = crate::metrics::MERGE_ELAPSED.start_timer(); let meta = sst_layer .write_sst(self.output_file_id, Source::Reader(reader), &opts) .await? diff --git a/src/storage/src/flush.rs b/src/storage/src/flush.rs index 05c8116954..44f6062472 100644 --- a/src/storage/src/flush.rs +++ b/src/storage/src/flush.rs @@ -18,8 +18,7 @@ mod scheduler; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use common_telemetry::{logging, timer}; -use metrics::counter; +use common_telemetry::logging; pub use picker::{FlushPicker, PickerConfig}; pub use scheduler::{ FlushHandle, FlushRegionRequest, FlushRequest, FlushScheduler, FlushSchedulerRef, @@ -237,7 +236,7 @@ pub struct FlushJob { impl FlushJob { /// Execute the flush job. 
async fn run(&mut self) -> Result<()> { - let _timer = timer!(FLUSH_ELAPSED); + let _timer = FLUSH_ELAPSED.start_timer(); let file_metas = self.write_memtables_to_layer().await?; if file_metas.is_empty() { @@ -299,7 +298,8 @@ impl FlushJob { .collect(); let flush_bytes = metas.iter().map(|f| f.file_size).sum(); - counter!(FLUSH_BYTES_TOTAL, flush_bytes); + + FLUSH_BYTES_TOTAL.inc_by(flush_bytes); let file_ids = metas.iter().map(|f| f.file_id).collect::>(); logging::info!("Successfully flush memtables, region:{region_id}, files: {file_ids:?}"); diff --git a/src/storage/src/flush/scheduler.rs b/src/storage/src/flush/scheduler.rs index 6c7c8bb5ac..a6fa575ce9 100644 --- a/src/storage/src/flush/scheduler.rs +++ b/src/storage/src/flush/scheduler.rs @@ -18,7 +18,6 @@ use std::time::Duration; use async_trait::async_trait; use common_runtime::{RepeatedTask, TaskFunction}; use common_telemetry::logging; -use metrics::increment_counter; use snafu::{ensure, ResultExt}; use store_api::logstore::LogStore; use store_api::storage::{RegionId, SequenceNumber}; @@ -320,7 +319,7 @@ async fn execute_flush_region( if let Err(e) = flush_job.run().await { logging::error!(e; "Failed to flush region {}", req.region_id()); - increment_counter!(FLUSH_ERRORS_TOTAL); + FLUSH_ERRORS_TOTAL.inc(); FlushRequest::Region { req, sender }.complete(Err(e)); } else { diff --git a/src/storage/src/memtable.rs b/src/storage/src/memtable.rs index 6a9b269e18..546c40f383 100644 --- a/src/storage/src/memtable.rs +++ b/src/storage/src/memtable.rs @@ -26,7 +26,6 @@ use api::v1::OpType; use common_time::range::TimestampRange; use common_time::Timestamp; use datatypes::vectors::VectorRef; -use metrics::{decrement_gauge, increment_gauge}; use store_api::storage::{consts, SequenceNumber}; use crate::error::Result; @@ -223,7 +222,7 @@ impl AllocTracker { /// Tracks `bytes` memory is allocated. 
pub(crate) fn on_allocate(&self, bytes: usize) { let _ = self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); - increment_gauge!(WRITE_BUFFER_BYTES, bytes as f64); + WRITE_BUFFER_BYTES.add(bytes as i64); if let Some(flush_strategy) = &self.flush_strategy { flush_strategy.reserve_mem(bytes); } @@ -258,7 +257,7 @@ impl Drop for AllocTracker { } let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed); - decrement_gauge!(WRITE_BUFFER_BYTES, bytes_allocated as f64); + WRITE_BUFFER_BYTES.sub(bytes_allocated as i64); // Memory tracked by this tracker is freed. if let Some(flush_strategy) = &self.flush_strategy { diff --git a/src/storage/src/memtable/inserter.rs b/src/storage/src/memtable/inserter.rs index 5192451942..b54b4897b3 100644 --- a/src/storage/src/memtable/inserter.rs +++ b/src/storage/src/memtable/inserter.rs @@ -42,7 +42,7 @@ impl Inserter { /// Won't do schema validation if not configured. Caller (mostly the `RegionWriter` should ensure the /// schemas of `memtable` are consistent with `payload`'s. pub fn insert_memtable(&mut self, payload: &Payload, memtable: &MemtableRef) -> Result<()> { - let _timer = common_telemetry::timer!(MEMTABLE_WRITE_ELAPSED); + let _timer = MEMTABLE_WRITE_ELAPSED.start_timer(); if payload.is_empty() { return Ok(()); diff --git a/src/storage/src/metrics.rs b/src/storage/src/metrics.rs index d05c3176e6..605e0b4927 100644 --- a/src/storage/src/metrics.rs +++ b/src/storage/src/metrics.rs @@ -14,33 +14,53 @@ //! storage metrics -/// Elapsed time of updating manifest when creating regions. -pub const CREATE_REGION_UPDATE_MANIFEST: &str = "storage.create_region.update_manifest"; -/// Counter of scheduled flush requests. -pub const FLUSH_REQUESTS_TOTAL: &str = "storage.flush.requests_total"; -/// Counter of scheduled failed flush jobs. -pub const FLUSH_ERRORS_TOTAL: &str = "storage.flush.errors_total"; -/// Elapsed time of a flush job. -pub const FLUSH_ELAPSED: &str = "storage.flush.elapsed"; -/// Counter of flushed bytes. 
-pub const FLUSH_BYTES_TOTAL: &str = "storage.flush.bytes_total"; +use lazy_static::lazy_static; +use prometheus::*; + /// Reason to flush. pub const FLUSH_REASON: &str = "reason"; -/// Gauge for open regions -pub const REGION_COUNT: &str = "storage.region_count"; -/// Timer for logstore write -pub const LOG_STORE_WRITE_ELAPSED: &str = "storage.logstore.write.elapsed"; -/// Elapsed time of a compact job. -pub const COMPACT_ELAPSED: &str = "storage.compact.elapsed"; -/// Elapsed time for merging SST files. -pub const MERGE_ELAPSED: &str = "storage.compaction.merge.elapsed"; -/// Global write buffer size in bytes. -pub const WRITE_BUFFER_BYTES: &str = "storage.write_buffer_bytes"; -/// Elapsed time of inserting memtable. -pub const MEMTABLE_WRITE_ELAPSED: &str = "storage.memtable.write.elapsed"; -/// Elapsed time of preprocessing write batch. -pub const PREPROCESS_ELAPSED: &str = "storage.write.preprocess.elapsed"; -/// Elapsed time for windowed scan -pub const WINDOW_SCAN_ELAPSED: &str = "query.scan.window_scan.elapsed"; -/// Rows per window during window scan -pub const WINDOW_SCAN_ROWS_PER_WINDOW: &str = "query.scan.window_scan.window_row_size"; + +lazy_static! { + /// Elapsed time of updating manifest when creating regions. + pub static ref CREATE_REGION_UPDATE_MANIFEST: Histogram = + register_histogram!("storage_create_region_update_manifest", "storage create region update manifest").unwrap(); + /// Counter of scheduled flush requests. + pub static ref FLUSH_REQUESTS_TOTAL: IntCounterVec = + register_int_counter_vec!("storage_flush_requests_total", "storage flush requests total", &[FLUSH_REASON]).unwrap(); + /// Counter of scheduled failed flush jobs. + pub static ref FLUSH_ERRORS_TOTAL: IntCounter = + register_int_counter!("storage_flush_errors_total", "storage flush errors total").unwrap(); + /// Elapsed time of a flush job.
+ pub static ref FLUSH_ELAPSED: Histogram = + register_histogram!("storage_flush_elapsed", "storage flush elapsed").unwrap(); + /// Counter of flushed bytes. + pub static ref FLUSH_BYTES_TOTAL: IntCounter = + register_int_counter!("storage_flush_bytes_total", "storage flush bytes total").unwrap(); + /// Gauge for open regions + pub static ref REGION_COUNT: IntGauge = + register_int_gauge!("storage_region_count", "storage region count").unwrap(); + /// Timer for logstore write + pub static ref LOG_STORE_WRITE_ELAPSED: Histogram = + register_histogram!("storage_logstore_write_elapsed", "storage logstore write elapsed").unwrap(); + /// Elapsed time of a compact job. + pub static ref COMPACT_ELAPSED: Histogram = + register_histogram!("storage_compact_elapsed", "storage compact elapsed").unwrap(); + /// Elapsed time for merging SST files. + pub static ref MERGE_ELAPSED: Histogram = + register_histogram!("storage_compaction_merge_elapsed", "storage compaction merge elapsed").unwrap(); + /// Global write buffer size in bytes. + pub static ref WRITE_BUFFER_BYTES: IntGauge = + register_int_gauge!("storage_write_buffer_bytes", "storage write buffer bytes").unwrap(); + /// Elapsed time of inserting memtable. + pub static ref MEMTABLE_WRITE_ELAPSED: Histogram = + register_histogram!("storage_memtable_write_elapsed", "storage memtable write elapsed").unwrap(); + /// Elapsed time of preprocessing write batch. 
+ pub static ref PREPROCESS_ELAPSED: Histogram = + register_histogram!("storage_write_preprocess_elapsed", "storage write preprocess elapsed").unwrap(); + /// Elapsed time for windowed scan + pub static ref WINDOW_SCAN_ELAPSED: Histogram = + register_histogram!("query_scan_window_scan_elapsed", "query scan window scan elapsed").unwrap(); + /// Rows per window during window scan + pub static ref WINDOW_SCAN_ROWS_PER_WINDOW: Histogram = + register_histogram!("query_scan_window_scan_window_row_size", "query scan window scan window row size").unwrap(); +} diff --git a/src/storage/src/read/windowed.rs b/src/storage/src/read/windowed.rs index d1faa65700..c9828ad629 100644 --- a/src/storage/src/read/windowed.rs +++ b/src/storage/src/read/windowed.rs @@ -16,10 +16,8 @@ use arrow::compute::SortOptions; use arrow::row::{RowConverter, SortField}; use arrow_array::{Array, ArrayRef}; use common_recordbatch::OrderOption; -use common_telemetry::timer; use datatypes::data_type::DataType; use datatypes::vectors::Helper; -use metrics::histogram; use snafu::ResultExt; use crate::error::{self, Result}; @@ -62,7 +60,7 @@ where R: BatchReader, { async fn next_batch(&mut self) -> Result> { - let _window_scan_elapsed = timer!(crate::metrics::WINDOW_SCAN_ELAPSED); + let _window_scan_elapsed = crate::metrics::WINDOW_SCAN_ELAPSED.start_timer(); let Some(mut reader) = self.readers.pop() else { return Ok(None); }; @@ -98,7 +96,7 @@ where .push(arrow::compute::concat(&columns).context(error::ConvertColumnsToRowsSnafu)?); } if let Some(v) = vectors_in_batch.get(0) { - histogram!(crate::metrics::WINDOW_SCAN_ROWS_PER_WINDOW, v.len() as f64); + crate::metrics::WINDOW_SCAN_ROWS_PER_WINDOW.observe(v.len() as f64); } let sorted = sort_by_rows(&self.schema, vectors_in_batch, &self.order_options)?; let vectors = sorted diff --git a/src/storage/src/region.rs b/src/storage/src/region.rs index c177ffb785..3106adb48f 100644 --- a/src/storage/src/region.rs +++ b/src/storage/src/region.rs @@ -25,7 +25,6 @@ 
use std::time::Duration; use async_trait::async_trait; use common_telemetry::{info, logging}; use common_time::util; -use metrics::{decrement_gauge, increment_gauge}; use snafu::ResultExt; use store_api::logstore::LogStore; use store_api::manifest::{ @@ -133,7 +132,7 @@ impl Region for RegionImpl { } async fn drop_region(&self) -> Result<()> { - decrement_gauge!(crate::metrics::REGION_COUNT, 1.0); + crate::metrics::REGION_COUNT.dec(); self.inner.drop_region().await } @@ -195,7 +194,7 @@ impl RegionImpl { // Try to persist region data to manifest, ensure the new region could be recovered from // the manifest. let manifest_version = { - let _timer = common_telemetry::timer!(crate::metrics::CREATE_REGION_UPDATE_MANIFEST); + let _timer = crate::metrics::CREATE_REGION_UPDATE_MANIFEST.start_timer(); store_config .manifest .update(RegionMetaActionList::with_action(RegionMetaAction::Change( @@ -218,7 +217,7 @@ impl RegionImpl { store_config.file_purger.clone(), ); let region = RegionImpl::new(version, store_config); - increment_gauge!(crate::metrics::REGION_COUNT, 1.0); + crate::metrics::REGION_COUNT.inc(); Ok(region) } @@ -368,7 +367,7 @@ impl RegionImpl { manifest: store_config.manifest, }); - increment_gauge!(crate::metrics::REGION_COUNT, 1.0); + crate::metrics::REGION_COUNT.inc(); Ok(Some(RegionImpl { inner })) } @@ -573,7 +572,7 @@ impl RegionImpl { } pub async fn close(&self, ctx: &CloseContext) -> Result<()> { - decrement_gauge!(crate::metrics::REGION_COUNT, 1.0); + crate::metrics::REGION_COUNT.dec(); self.inner.close(ctx).await } } diff --git a/src/storage/src/region/writer.rs b/src/storage/src/region/writer.rs index f78cf46e70..edd2e4cc46 100644 --- a/src/storage/src/region/writer.rs +++ b/src/storage/src/region/writer.rs @@ -18,7 +18,6 @@ use std::time::Duration; use common_base::readable_size::ReadableSize; use common_telemetry::logging; use futures::TryStreamExt; -use metrics::increment_counter; use snafu::{ensure, ResultExt}; use store_api::logstore::LogStore; 
use store_api::manifest::{Manifest, ManifestLogStorage, ManifestVersion, MetaAction}; @@ -39,7 +38,7 @@ use crate::manifest::action::{ }; use crate::memtable::{Inserter, MemtableBuilderRef, MemtableId, MemtableRef, MemtableVersion}; use crate::metadata::RegionMetadataRef; -use crate::metrics::{FLUSH_REASON, FLUSH_REQUESTS_TOTAL, PREPROCESS_ELAPSED}; +use crate::metrics::{FLUSH_REQUESTS_TOTAL, PREPROCESS_ELAPSED}; use crate::proto::wal::WalHeader; use crate::region::{ CompactContext, RecoveredMetadata, RecoveredMetadataMap, RegionManifest, SharedDataRef, @@ -761,7 +760,7 @@ impl WriterInner { &mut self, writer_ctx: &WriterContext<'_, S>, ) -> Result<()> { - let _timer = common_telemetry::timer!(PREPROCESS_ELAPSED); + let _timer = PREPROCESS_ELAPSED.start_timer(); let version_control = writer_ctx.version_control(); // Check whether memtable is full or flush should be triggered. We need to do this first since @@ -821,7 +820,9 @@ impl WriterInner { // Freeze all mutable memtables so we can flush them later. version_control.freeze_mutable(new_mutable); - increment_counter!(FLUSH_REQUESTS_TOTAL, FLUSH_REASON => reason.as_str()); + FLUSH_REQUESTS_TOTAL + .with_label_values(&[reason.as_str()]) + .inc(); if let Some(flush_handle) = self.flush_handle.take() { // Previous flush job is incomplete, wait util it is finished. 
diff --git a/src/storage/src/wal.rs b/src/storage/src/wal.rs index 92b721f26d..54629cb7f9 100644 --- a/src/storage/src/wal.rs +++ b/src/storage/src/wal.rs @@ -16,7 +16,6 @@ use std::pin::Pin; use std::sync::Arc; use common_error::ext::BoxedError; -use common_telemetry::timer; use futures::{stream, Stream, TryStreamExt}; use prost::Message; use snafu::{ensure, Location, ResultExt}; @@ -117,7 +116,7 @@ impl Wal { mut header: WalHeader, payload: Option<&Payload>, ) -> Result { - let _timer = timer!(crate::metrics::LOG_STORE_WRITE_ELAPSED); + let _timer = crate::metrics::LOG_STORE_WRITE_ELAPSED.start_timer(); if let Some(p) = payload { header.mutation_types = wal::gen_mutation_types(p); } diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index f75420d605..78a6984dc5 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -28,7 +28,7 @@ common-recordbatch = { workspace = true } common-runtime = { workspace = true } common-telemetry = { workspace = true } common-test-util = { workspace = true } -datanode = { workspace = true } +datanode = { workspace = true, features = ["testing"] } datatypes = { workspace = true } dotenv = "0.15" frontend = { workspace = true, features = ["testing"] } diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 7fbf7ea1d3..1b355d4877 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -495,7 +495,6 @@ pub async fn test_prom_http_api(store_type: StorageType) { pub async fn test_metrics_api(store_type: StorageType) { common_telemetry::init_default_ut_logging(); - common_telemetry::init_default_metrics_recorder(); let (app, mut guard) = setup_test_http_app(store_type, "metrics_api").await; let client = TestClient::new(app);