forked from neondatabase/neon
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(compute): Add some basic compute_ctl metrics (neondatabase#10504)
## Problem There are several parts of `compute_ctl` with very low visibility into errors: 1. DB migrations that run asynchronously in the background after compute start. 2. Requests made to the control plane (currently only `GetSpec`). 3. Requests made to the remote extensions server. ## Summary of changes Add new counters to quickly evaluate the number of errors across the fleet. Part of neondatabase/cloud#17590
- Loading branch information
1 parent
eee327b
commit 3f90e77
Showing
9 changed files
with
231 additions
and
63 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
use metrics::core::Collector; | ||
use metrics::proto::MetricFamily; | ||
use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; | ||
use once_cell::sync::Lazy; | ||
|
||
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| { | ||
register_uint_gauge_vec!( | ||
"compute_installed_extensions", | ||
"Number of databases where the version of extension is installed", | ||
&["extension_name", "version", "owned_by_superuser"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
/// Identifies a control plane request by its RPC name.
///
/// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH,
/// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec,
/// and it's fair to call it an 'RPC' (Remote Procedure Call).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CPlaneRequestRPC {
    /// The `GetSpec` RPC.
    GetSpec,
}

impl CPlaneRequestRPC {
    /// Returns the RPC name as a string slice.
    ///
    /// Every arm yields a string literal, so the result is `'static` and does not keep
    /// `self` borrowed; callers may hold on to it after the enum value is gone.
    pub fn as_str(&self) -> &'static str {
        // Exhaustive match without a catch-all: adding a variant forces a name to be chosen.
        match self {
            CPlaneRequestRPC::GetSpec => "GetSpec",
        }
    }
}
|
||
/// The string constant `"unknown"`.
/// NOTE(review): presumably the value for an `http_status` label when a failed request
/// has no HTTP status code to report — confirm against callers.
pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; | ||
|
||
pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_cplane_requests_total", | ||
"Total number of control plane requests made by compute_ctl", | ||
&["rpc"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub(crate) static CPLANE_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_cplane_requests_failed_total", | ||
"Total number of failed control plane requests made by compute_ctl", | ||
&["rpc", "http_status"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
/// Total number of failed database migrations. Per-compute, this is actually a boolean metric, | ||
/// either empty or with a single value (1, migration_id) because we stop at the first failure. | ||
/// Yet, the sum over the fleet will provide the total number of failures. | ||
pub(crate) static DB_MIGRATION_FAILED: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_db_migration_failed_total", | ||
"Total number of failed database migrations", | ||
&["migration_id"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_remote_ext_requests_total", | ||
"Total number of requests made by compute_ctl to download extensions from S3 proxy", | ||
// Do not use any labels like extension name yet. | ||
// We can add them later if needed. | ||
&[] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_remote_ext_requests_failed_total", | ||
"Total number of failed requests to S3 proxy", | ||
&["http_status"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub fn collect() -> Vec<MetricFamily> { | ||
let mut metrics = INSTALLED_EXTENSIONS.collect(); | ||
metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); | ||
metrics.extend(CPLANE_REQUESTS_FAILED.collect()); | ||
metrics.extend(DB_MIGRATION_FAILED.collect()); | ||
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); | ||
metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect()); | ||
metrics | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.