From 6933b605d03ae7e8166bbb9826dd1eb914a9742e Mon Sep 17 00:00:00 2001
From: zhongtao
Date: Wed, 26 Jul 2023 17:07:15 +1400
Subject: [PATCH 01/11] fix stuck health check blocking container stop bugs

Signed-off-by: zhongtao
---
 src/daemon/modules/api/container_api.h        |  1 +
 .../container/health_check/health_check.c     | 57 +++++++++++++++++--
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/src/daemon/modules/api/container_api.h b/src/daemon/modules/api/container_api.h
index ed97633f..4c1dd29a 100644
--- a/src/daemon/modules/api/container_api.h
+++ b/src/daemon/modules/api/container_api.h
@@ -51,6 +51,7 @@ typedef struct health_check_manager {
     health_check_monitor_status_t monitor_status;
     // Used to wait for the health check minotor thread to close
     bool monitor_exist;
+    pthread_t monitor_tid;
 } health_check_manager_t;
 
 typedef struct _container_state_t_ {
diff --git a/src/daemon/modules/container/health_check/health_check.c b/src/daemon/modules/container/health_check/health_check.c
index e9dcbdb9..bd75382f 100644
--- a/src/daemon/modules/container/health_check/health_check.c
+++ b/src/daemon/modules/container/health_check/health_check.c
@@ -169,15 +169,61 @@ static bool get_monitor_exist_flag(health_check_manager_t *health)
 
+// Ask the health check monitor thread of cont to stop and wait until it is gone.
+// If the monitor still reports itself as existing once the retry budget is used up,
+// the thread is cancelled so that a stuck health check cannot block the caller forever.
 static void close_health_check_monitor(container_t *cont)
 {
+    int64_t timeout = 0;
+    /* wait 1 second to cancel monitor thread (2000 * 500 µs) */
+    int64_t retries = 2000;
+    int ret = -1;
+    bool cancel_failed = false;
+
     if (cont == NULL || cont->health_check == NULL) {
         return;
     }
+    pthread_t monitor_tid = cont->health_check->monitor_tid;
 
     set_monitor_stop_status(cont->health_check);
     // ensure that the monitor process exits
     while (get_monitor_exist_flag(cont->health_check)) {
         util_usleep_nointerupt(500);
+        timeout += 1;
+        if (timeout <= retries) {
+            continue;
+        }
+        if (monitor_tid <= 0) {
+            break;
+        }
+        DEBUG("Try to cancel monitor thread");
+        ret = pthread_cancel(monitor_tid);
+        if (ret != 0 && ret != ESRCH) {
+            // Do not fall back to pthread_kill(monitor_tid, SIGKILL): signal dispositions are
+            // process-wide, so SIGKILL would terminate the whole daemon, not only this thread.
+            ERROR("Failed to cancel monitor thread");
+            cancel_failed = true;
+        }
+        break;
     }
+
+    if (cancel_failed) {
+        // The thread could not be cancelled, so joining it might block forever.
+        // Detach it instead so that its resources are released whenever it exits.
+        if (pthread_detach(monitor_tid) != 0) {
+            ERROR("Failed to detach monitor thread");
+        }
+    } else if (monitor_tid > 0 && pthread_join(monitor_tid, NULL) != 0) {
+        ERROR("Failed to join monitor thread");
+    }
+
+    // monitor_tid = 0: it corresponds to the initialization of the health check thread when starting the container.
+    // At this time, the purpose is to stop the health check thread process before starting a new health check thread,
+    // and there is no need to set the health check status.
+    if (monitor_tid > 0) {
+        set_health_status(cont, UNHEALTHY);
+        set_monitor_exist_flag(cont->health_check, false);
+    }
+
+    cont->health_check->monitor_tid = 0;
 }
 
 // Called when the container is being stopped (whether because the health check is
@@ -228,6 +274,8 @@ static health_check_manager_t *health_check_manager_new()
 
     health_check->monitor_exist = false;
 
+    health_check->monitor_tid = 0;
+
     return health_check;
 cleanup:
     health_check_manager_free(health_check);
@@ -887,20 +935,17 @@ void container_update_health_monitor(const char *container_id)
 
     want_running = container_is_running(cont->state) && !container_is_paused(cont->state) && probe != HEALTH_NONE;
     if (want_running) {
-        pthread_t monitor_tid = { 0 };
         char *cid = util_strdup_s(container_id);
         // ensured that the health check monitor process is stopped
         close_health_check_monitor(cont);
         init_monitor_idle_status(cont->health_check);
-        if (pthread_create(&monitor_tid, NULL, health_check_monitor, (void *)cid)) {
+        if (pthread_create(&cont->health_check->monitor_tid, NULL, health_check_monitor, (void *)cid)) {
+            // the contents of monitor_tid are undefined after a failed pthread_create, never join it later
+            cont->health_check->monitor_tid = 0;
             free(cid);
             ERROR("Failed to create thread to monitor health check...");
             goto out;
         }
-        if (pthread_detach(monitor_tid)) {
-            ERROR("Failed to detach the health check monitor thread");
-            goto out;
-        }
     } else {
         close_health_check_monitor(cont);
     }
-- 
2.25.1