Skip to main content

kipuka/routes/admin/
health.rs

1//! System health check endpoints for the admin API.
2//!
3//! Provides health probes for the overall system and individual
4//! subsystems (database, HSM, CA backends).  These endpoints are
5//! designed for monitoring systems (Kubernetes readiness probes,
6//! Prometheus health checks, etc.).
7
8use std::sync::Arc;
9
10use axum::Json;
11use axum::extract::State;
12use axum::http::StatusCode;
13use axum::response::{IntoResponse, Response};
14use serde::Serialize;
15
16use super::AdminAuth;
17use crate::state::AppState;
18
19/// Overall system health status.
20#[derive(Serialize)]
21pub struct SystemHealth {
22    /// Overall status: "healthy", "degraded", or "unhealthy".
23    pub status: String,
24
25    /// Server uptime in seconds.
26    pub uptime_secs: u64,
27
28    /// Database health.
29    pub database: SubsystemHealth,
30
31    /// HSM health (if configured).
32    pub hsm: Option<SubsystemHealth>,
33
34    /// Number of configured CAs.
35    pub ca_count: usize,
36
37    /// Number of healthy CAs.
38    pub healthy_ca_count: usize,
39
40    /// Server version.
41    pub version: String,
42}
43
44/// Health status of an individual subsystem.
45#[derive(Serialize)]
46pub struct SubsystemHealth {
47    /// Subsystem name.
48    pub name: String,
49
50    /// Status: "healthy", "degraded", or "unhealthy".
51    pub status: String,
52
53    /// Optional detail message.
54    pub detail: Option<String>,
55
56    /// Response latency in milliseconds (if measured).
57    pub latency_ms: Option<u64>,
58}
59
60/// `GET /admin/health` — Overall system health.
61///
62/// Returns the aggregate health of all subsystems.  The HTTP status
63/// code reflects the overall health:
64///
65/// - `200 OK` — all subsystems healthy
66/// - `503 Service Unavailable` — one or more critical subsystems unhealthy
67pub async fn get_health(_admin: AdminAuth, State(state): State<Arc<AppState>>) -> Response {
68    let uptime = state.startup_time.elapsed().as_secs();
69
70    // Check database health.
71    let db_health = check_database_health(&state).await;
72
73    // Check HSM health (if configured).
74    let hsm_health = if state.config.hsm.is_some() {
75        Some(check_hsm_health(&state).await)
76    } else {
77        None
78    };
79
80    // Count healthy CAs.
81    let ca_count = state.config.cas.len();
82    let healthy_ca_count = state
83        .ha_manager
84        .as_ref()
85        .map(|ha| {
86            ha.pool()
87                .status_snapshot()
88                .into_values()
89                .filter(|s| s.health.is_available())
90                .count()
91        })
92        .unwrap_or(ca_count);
93
94    // Determine overall status.
95    let overall_status =
96        if db_health.status == "unhealthy" || (healthy_ca_count == 0 && ca_count > 0) {
97            "unhealthy"
98        } else if db_health.status == "degraded" || healthy_ca_count < ca_count {
99            "degraded"
100        } else {
101            "healthy"
102        };
103
104    let health = SystemHealth {
105        status: overall_status.to_string(),
106        uptime_secs: uptime,
107        database: db_health,
108        hsm: hsm_health,
109        ca_count,
110        healthy_ca_count,
111        version: env!("CARGO_PKG_VERSION").to_string(),
112    };
113
114    let status_code = if overall_status == "unhealthy" {
115        StatusCode::SERVICE_UNAVAILABLE
116    } else {
117        StatusCode::OK
118    };
119
120    (status_code, Json(health)).into_response()
121}
122
123/// `GET /admin/health/db` — Database connectivity check.
124///
125/// Performs a lightweight query to verify database connectivity.
126pub async fn get_health_db(_admin: AdminAuth, State(state): State<Arc<AppState>>) -> Response {
127    let health = check_database_health(&state).await;
128
129    let status = if health.status == "healthy" {
130        StatusCode::OK
131    } else {
132        StatusCode::SERVICE_UNAVAILABLE
133    };
134
135    (status, Json(health)).into_response()
136}
137
138/// `GET /admin/health/hsm` — HSM connectivity check.
139///
140/// Verifies that the configured HSM is reachable and the PKCS#11
141/// session is active.
142pub async fn get_health_hsm(_admin: AdminAuth, State(state): State<Arc<AppState>>) -> Response {
143    if state.config.hsm.is_none() {
144        return (
145            StatusCode::OK,
146            Json(SubsystemHealth {
147                name: "hsm".to_string(),
148                status: "not_configured".to_string(),
149                detail: Some("no HSM is configured".to_string()),
150                latency_ms: None,
151            }),
152        )
153            .into_response();
154    }
155
156    let health = check_hsm_health(&state).await;
157    let status = if health.status == "healthy" {
158        StatusCode::OK
159    } else {
160        StatusCode::SERVICE_UNAVAILABLE
161    };
162
163    (status, Json(health)).into_response()
164}
165
166/// `GET /admin/health/ca` — CA backend health for all configured CAs.
167///
168/// Returns the health status of each configured CA backend from the
169/// HA subsystem.
170pub async fn get_health_ca(_admin: AdminAuth, State(state): State<Arc<AppState>>) -> Response {
171    let mut ca_health: Vec<serde_json::Value> = Vec::new();
172
173    for ca_config in &state.config.cas {
174        let (health, latency_ms) = state
175            .ha_manager
176            .as_ref()
177            .and_then(|ha| {
178                let ca_id_key = crate::ha::CaId(ca_config.id.clone());
179                ha.pool().status_snapshot().get(&ca_id_key).map(|s| {
180                    let h = format!("{:?}", s.health);
181                    let l = Some(s.latency_ema_ms as u64);
182                    (h, l)
183                })
184            })
185            .unwrap_or(("unknown".to_string(), None));
186
187        ca_health.push(serde_json::json!({
188            "ca_id": ca_config.id,
189            "health": health,
190            "latency_ms": latency_ms,
191            "hsm_backed": ca_config.is_hsm_backed(),
192        }));
193    }
194
195    (StatusCode::OK, Json(ca_health)).into_response()
196}
197
198// ── Internal health check implementations ────────────────────────────────────
199
200/// Check database connectivity with a lightweight query.
201async fn check_database_health(_state: &AppState) -> SubsystemHealth {
202    let start = std::time::Instant::now();
203
204    // TODO: Execute a lightweight query like `SELECT 1`.
205    //
206    // match sqlx::query("SELECT 1").execute(&state.db).await {
207    //     Ok(_) => SubsystemHealth { ... status: "healthy" ... },
208    //     Err(e) => SubsystemHealth { ... status: "unhealthy", detail: Some(e.to_string()) ... },
209    // }
210
211    let latency = start.elapsed().as_millis() as u64;
212
213    SubsystemHealth {
214        name: "database".to_string(),
215        status: "healthy".to_string(),
216        detail: None,
217        latency_ms: Some(latency),
218    }
219}
220
221/// Check HSM connectivity by attempting a session operation.
222async fn check_hsm_health(state: &AppState) -> SubsystemHealth {
223    let _ = state;
224
225    // TODO: Verify PKCS#11 session is active.
226    //
227    // match kipuka_hsm::ping_session(&state.hsm).await {
228    //     Ok(latency) => SubsystemHealth { ... status: "healthy" ... },
229    //     Err(e) => SubsystemHealth { ... status: "unhealthy" ... },
230    // }
231
232    SubsystemHealth {
233        name: "hsm".to_string(),
234        status: "healthy".to_string(),
235        detail: None,
236        latency_ms: None,
237    }
238}