[Git][ghc/ghc][wip/fix-eventlog-flush-deadlock] rts: Fix a deadlock with eventlog flush interval and RTS shutdown
Matthew Pickering pushed to branch wip/fix-eventlog-flush-deadlock at Glasgow Haskell Compiler / GHC Commits: e886aae7 by Matthew Pickering at 2025-11-17T10:34:01+00:00 rts: Fix a deadlock with eventlog flush interval and RTS shutdown The ghc_ticker thread attempts to flush at the eventlog tick interval; this requires waiting to take all capabilities. At the same time, the main thread is shutting down, the scheduler is stopped, and then we wait for the ticker thread to finish. Therefore we are deadlocked. The solution is to create a mutex which is taken when the eventlog starts flushing. Then during shutdown we can block before exiting the scheduler, so the scheduler can't stop whilst we are in the process of flushing. Fixes #26573 - - - - - 3 changed files: - rts/RtsStartup.c - rts/eventlog/EventLog.c - rts/eventlog/EventLog.h Changes: ===================================== rts/RtsStartup.c ===================================== @@ -493,9 +493,18 @@ hs_exit_(bool wait_foreign) stopIOManager(); + // If a flush is in progress, we need to make sure the scheduler isn't stopped + // and likewise, if we are about to remove the scheduler, don't allow the flush + // to start. +#if defined(TRACING) && defined(THREADED_RTS) + ACQUIRE_LOCK(&eventlog_flush_mutex); +#endif /* stop all running tasks. This is also where we stop concurrent non-moving * collection if it's running */ exitScheduler(wait_foreign); +#if defined(TRACING) && defined(THREADED_RTS) + RELEASE_LOCK(&eventlog_flush_mutex); +#endif /* run C finalizers for all active weak pointers */ for (i = 0; i < getNumCapabilities(); i++) { ===================================== rts/eventlog/EventLog.c ===================================== @@ -133,6 +133,13 @@ static EventsBuf eventBuf; // an EventsBuf not associated with any Capability static Mutex eventBufMutex; // protected by this mutex #endif +#if defined(THREADED_RTS) +Mutex eventlog_flush_mutex; +// Mutex which is taken when an eventlog is being flushed. 
+// In particular, this mutex is taken during event shutdown to avoid races between +// the shutdown thread and timer thread (#26573) +#endif + // Event type typedef struct _EventType { EventTypeNum etNum; // Event Type number. @@ -394,6 +401,7 @@ initEventLogging(void) #if defined(THREADED_RTS) initMutex(&eventBufMutex); initMutex(&state_change_mutex); + initMutex(&eventlog_flush_mutex); #endif } @@ -491,13 +499,7 @@ endEventLogging(void) eventlog_enabled = false; - // Flush all events remaining in the buffers. - // - // N.B. Don't flush if shutting down: this was done in - // finishCapEventLogging and the capabilities have already been freed. - if (getSchedState() != SCHED_SHUTTING_DOWN) { - flushEventLog(NULL); - } + flushEventLog(NULL); ACQUIRE_LOCK(&eventBufMutex); @@ -1626,6 +1628,26 @@ void flushEventLog(Capability **cap USED_IF_THREADS) return; } + // This lock is also taken during shutdown, so that any flush can be finished + // before the shutdown procedure starts. + ACQUIRE_LOCK(&eventlog_flush_mutex); + + // N.B. Don't flush if shutting down: this was done in + // finishCapEventLogging and the capabilities have already been freed. + // This can also race against the shutdown if the flush is triggered by the + // ticker thread. (#26573) + + // Acquire the sched_mutex for the duration of the flush, since if the scheduler + // starts to shut down after we have checked the status, then stopAllCapabilitiesWith will + // block. + + // Also if the scheduler is shutting down, then the rts_shutdown will perform a final flush of + // all the buffers, so we don't also need to flush here. 
+ if (getSchedState() == SCHED_SHUTTING_DOWN) { + RELEASE_LOCK(&eventlog_flush_mutex); + return; + } + ACQUIRE_LOCK(&eventBufMutex); printAndClearEventBuf(&eventBuf); RELEASE_LOCK(&eventBufMutex); @@ -1639,6 +1661,7 @@ void flushEventLog(Capability **cap USED_IF_THREADS) flushLocalEventsBuf(getCapability(0)); #endif flushEventLogWriter(); + RELEASE_LOCK(&eventlog_flush_mutex); } #else ===================================== rts/eventlog/EventLog.h ===================================== @@ -19,6 +19,10 @@ extern bool eventlog_enabled; +#if defined(THREADED_RTS) +extern Mutex eventlog_flush_mutex; +#endif + void initEventLogging(void); void restartEventLogging(void); void finishCapEventLogging(void); View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e886aae77da3953ddd36fa5289fe3c72... -- View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e886aae77da3953ddd36fa5289fe3c72... You're receiving this email because of your account on gitlab.haskell.org.
participants (1)
-
Matthew Pickering (@mpickering)