-
Notifications
You must be signed in to change notification settings - Fork 483
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(compute): Add some basic compute_ctl metrics (#10504)
## Problem

Several parts of `compute_ctl` have very low visibility into errors:

1. DB migrations that run asynchronously in the background after compute start.
2. Requests made to the control plane (currently only `GetSpec`).
3. Requests made to the remote extensions server.

## Summary of changes

Add new counters to quickly evaluate the number of errors across the fleet.

Part of neondatabase/cloud#17590
- Loading branch information
Showing
9 changed files
with
231 additions
and
63 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
use metrics::core::Collector; | ||
use metrics::proto::MetricFamily; | ||
use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; | ||
use once_cell::sync::Lazy; | ||
|
||
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| { | ||
register_uint_gauge_vec!( | ||
"compute_installed_extensions", | ||
"Number of databases where the version of extension is installed", | ||
&["extension_name", "version", "owned_by_superuser"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
/// Control plane operations, identified by their OpenAPI operation name.
///
/// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH,
/// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
/// And it's fair to call it an 'RPC' (Remote Procedure Call).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CPlaneRequestRPC {
    /// The `GetSpec` operation.
    GetSpec,
}

impl CPlaneRequestRPC {
    /// Returns the operation name of this RPC as a string literal.
    ///
    /// The result is `'static`, so it can outlive the enum value it was
    /// obtained from. Presumably this is the value used for the `rpc` metric
    /// label; confirm against the call sites.
    pub fn as_str(&self) -> &'static str {
        // Deliberately no catch-all arm: adding a variant must fail to compile
        // here until its name is provided.
        match self {
            Self::GetSpec => "GetSpec",
        }
    }
}
|
||
/// The literal string `"unknown"`. Presumably the `http_status` label value used when no HTTP status code is available; TODO confirm against callers. | ||
pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; | ||
|
||
pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_cplane_requests_total", | ||
"Total number of control plane requests made by compute_ctl", | ||
&["rpc"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub(crate) static CPLANE_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_cplane_requests_failed_total", | ||
"Total number of failed control plane requests made by compute_ctl", | ||
&["rpc", "http_status"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
/// Total number of failed database migrations. Per-compute, this is actually a boolean metric, | ||
/// either empty or with a single value (1, migration_id) because we stop at the first failure. | ||
/// Yet, the sum over the fleet will provide the total number of failures. | ||
pub(crate) static DB_MIGRATION_FAILED: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_db_migration_failed_total", | ||
"Total number of failed database migrations", | ||
&["migration_id"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_remote_ext_requests_total", | ||
"Total number of requests made by compute_ctl to download extensions from S3 proxy", | ||
// Do not use any labels like extension name yet. | ||
// We can add them later if needed. | ||
&[] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| { | ||
register_int_counter_vec!( | ||
"compute_ctl_remote_ext_requests_failed_total", | ||
"Total number of failed requests to S3 proxy", | ||
&["http_status"] | ||
) | ||
.expect("failed to define a metric") | ||
}); | ||
|
||
pub fn collect() -> Vec<MetricFamily> { | ||
let mut metrics = INSTALLED_EXTENSIONS.collect(); | ||
metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); | ||
metrics.extend(CPLANE_REQUESTS_FAILED.collect()); | ||
metrics.extend(DB_MIGRATION_FAILED.collect()); | ||
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); | ||
metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect()); | ||
metrics | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
d04d924
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
7565 tests run: 7203 passed, 0 failed, 362 skipped (full report)
Flaky tests (9)
Postgres 17
- test_compute_migrations_retry: release-arm64-with-lfc
- test_pgdata_import_smoke[None-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: debug-x86-64-without-lfc, release-arm64-with-lfc, release-arm64-without-lfc
- test_pgdata_import_smoke[8-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: debug-x86-64-without-lfc, release-arm64-without-lfc, release-arm64-with-lfc
- test_issue_5878[same_generation]: debug-x86-64-without-lfc
- test_scrubber_tenant_snapshot[4]: release-arm64-with-lfc

Code coverage* (full report)
- functions: 33.5% (8506 of 25418 functions)
- lines: 49.2% (71542 of 145394 lines)
* collected from Rust tests only
d04d924 at 2025-01-28T22:35:54.037Z :recycle: