Skip to content

Commit

Permalink
Add pprof heap endpoints to node api
Browse files Browse the repository at this point in the history
  • Loading branch information
jackkleeman committed Dec 9, 2024
1 parent f88d42b commit 9d701b6
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 19 deletions.
90 changes: 74 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,7 @@ tempfile = "3.6.0"
test-log = { version = "0.2.11", default-features = false, features = [
"trace",
] }
# tikv-jemallocator has not yet been released with musl target support, so we pin a main commit
tikv-jemallocator = { git = "https://github.com/restatedev/jemallocator", rev = "7c32f6e3d6ad5e4e492cc08d6bdb8307acf9afa0", default-features = false }
tikv-jemallocator = "0.6"
thiserror = "1.0"
tokio = { version = "1.40.0", default-features = false, features = [
"rt-multi-thread",
Expand Down
3 changes: 3 additions & 0 deletions crates/node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ tower = { workspace = true }
tower-http = { workspace = true, features = ["trace"] }
tracing = { workspace = true }

[target.'cfg(not(target_env = "msvc"))'.dependencies]
jemalloc_pprof = "0.6.0"

[dev-dependencies]
restate-test-util = { workspace = true }

Expand Down
1 change: 1 addition & 0 deletions crates/node/src/network_server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
mod handler;
mod metrics;
mod multiplex;
mod pprof;
mod prometheus_helpers;
mod service;
mod state;
Expand Down
82 changes: 82 additions & 0 deletions crates/node/src/network_server/pprof.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#[cfg(target_os = "linux")]
mod pprof {
    use http::StatusCode;

    /// Response used by every handler when `jemalloc_pprof::PROF_CTL` is `None`.
    fn profiling_not_enabled() -> (StatusCode, String) {
        (
            StatusCode::PRECONDITION_FAILED,
            String::from("Heap profiling not enabled: run with MALLOC_CONF=\"prof:true\"\n"),
        )
    }

    /// Turns any displayable error into a `500 Internal Server Error` response
    /// whose body is the error's `Display` text.
    fn internal_error<E: std::fmt::Display>(err: E) -> (StatusCode, String) {
        (StatusCode::INTERNAL_SERVER_ERROR, err.to_string())
    }

    /// Handler that dumps the current heap profile in pprof format.
    ///
    /// # Errors
    ///
    /// * `412 Precondition Failed` when the profiling controller is absent, or
    ///   when it is present but profiling has not been activated yet.
    /// * `500 Internal Server Error` when producing the dump fails.
    pub async fn heap() -> Result<impl axum::response::IntoResponse, (StatusCode, String)> {
        let ctl = jemalloc_pprof::PROF_CTL
            .as_ref()
            .ok_or_else(profiling_not_enabled)?;

        // The lock is held for the whole dump, exactly like the activation check.
        let mut guard = ctl.lock().await;
        if !guard.activated() {
            return Err((
                StatusCode::PRECONDITION_FAILED,
                String::from(
                    "Heap profiling not activated: first call /debug/pprof/heap/activate\n",
                ),
            ));
        }

        let profile = guard.dump_pprof().map_err(internal_error)?;
        Ok(profile)
    }

    /// Handler that activates heap profiling on the profiling controller.
    ///
    /// # Errors
    ///
    /// * `412 Precondition Failed` when the profiling controller is absent.
    /// * `500 Internal Server Error` when activation fails.
    pub async fn activate_heap() -> Result<(), (StatusCode, String)> {
        let ctl = jemalloc_pprof::PROF_CTL
            .as_ref()
            .ok_or_else(profiling_not_enabled)?;

        let mut guard = ctl.lock().await;
        guard.activate().map_err(internal_error)?;
        Ok(())
    }

    /// Handler that deactivates heap profiling on the profiling controller.
    ///
    /// # Errors
    ///
    /// * `412 Precondition Failed` when the profiling controller is absent.
    /// * `500 Internal Server Error` when deactivation fails.
    pub async fn deactivate_heap() -> Result<(), (StatusCode, String)> {
        let ctl = jemalloc_pprof::PROF_CTL
            .as_ref()
            .ok_or_else(profiling_not_enabled)?;

        let mut guard = ctl.lock().await;
        guard.deactivate().map_err(internal_error)?;
        Ok(())
    }
}

#[cfg(not(target_os = "linux"))]
mod pprof {
    use http::StatusCode;

    /// The single response every heap-profiling endpoint returns on this
    /// target: `412 Precondition Failed` with a fixed explanatory body.
    fn linux_only() -> (StatusCode, String) {
        (
            StatusCode::PRECONDITION_FAILED,
            String::from("Heap profiling is only available on Linux\n"),
        )
    }

    /// Stand-in for the heap dump handler; always reports that profiling is
    /// unavailable on this target.
    pub async fn heap() -> (StatusCode, String) {
        linux_only()
    }

    /// Stand-in for the activation handler; same response as [`heap`].
    pub async fn activate_heap() -> (StatusCode, String) {
        linux_only()
    }

    /// Stand-in for the deactivation handler; same response as [`heap`].
    pub async fn deactivate_heap() -> (StatusCode, String) {
        linux_only()
    }
}

pub use pprof::*;
5 changes: 4 additions & 1 deletion crates/node/src/network_server/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use restate_metadata_store::MetadataStoreClient;
use restate_storage_query_datafusion::context::QueryContext;
use restate_types::config::CommonOptions;

use super::pprof;
use crate::network_server::handler;
use crate::network_server::handler::cluster_ctrl::ClusterCtrlSvcHandler;
use crate::network_server::handler::node::NodeSvcHandler;
Expand All @@ -40,7 +41,6 @@ pub struct NetworkServer {
worker_deps: Option<WorkerDependencies>,
admin_deps: Option<AdminDependencies>,
}

impl NetworkServer {
pub fn new(
connection_manager: ConnectionManager,
Expand Down Expand Up @@ -74,6 +74,9 @@ impl NetworkServer {
// -- HTTP service (for prometheus et al.)
let router = axum::Router::new()
.route("/metrics", get(handler::render_metrics))
.route("/debug/pprof/heap", get(pprof::heap))
.route("/debug/pprof/heap/activate", get(pprof::activate_heap))
.route("/debug/pprof/heap/deactivate", get(pprof::deactivate_heap))
.with_state(shared_state)
.layer(TraceLayer::new_for_http().make_span_with(span_factory.clone()))
.fallback(handler_404);
Expand Down
6 changes: 6 additions & 0 deletions server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ use tikv_jemallocator::Jemalloc;
#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;

// On Linux, run jemalloc with profiling enabled (`prof:true`) but inactive
// (`prof_active:false`), so that there is no performance impact by default.
// If needed, profiling can be activated at runtime with
// :5122/debug/pprof/heap/activate, or by overriding this value with $MALLOC_CONF.
// NOTE(review): `lg_prof_sample:19` is presumably the log2 of the average number
// of bytes between allocation samples (2^19 = 512 KiB) — confirm against the
// jemalloc `opt.lg_prof_sample` documentation.
// The trailing `\0` NUL-terminates the byte string; presumably the exported
// `malloc_conf` symbol is read by jemalloc as a C string — verify that the
// symbol name matches the (possibly prefixed) name the linked jemalloc expects.
#[cfg(target_os = "linux")]
#[export_name = "malloc_conf"]
pub static MALLOC_CONF: &[u8] = b"prof:true,prof_active:false,lg_prof_sample:19\0";

#[derive(Debug, clap::Parser)]
#[command(author, version, about)]
struct RestateArguments {
Expand Down

0 comments on commit 9d701b6

Please sign in to comment.