Matthew Pickering pushed to branch wip/fix-eventlog-flush-deadlock at Glasgow Haskell Compiler / GHC

Commits:

3 changed files:

Changes:

  • rts/RtsStartup.c
    ... ... @@ -493,9 +493,18 @@ hs_exit_(bool wait_foreign)
    493 493
     
    
    494 494
         stopIOManager();
    
    495 495
     
    
    496
    +    // If a flush is in progress, we need to make sure the scheduler isn't stopped
    
    497
    +    // and likewise, if we are about to remove the scheduler, don't allow the flush
    
    498
    +    // to start.
    
    499
    +#if defined(TRACING) && defined(THREADED_RTS)
    
    500
    +    ACQUIRE_LOCK(&eventlog_flush_mutex);
    
    501
    +#endif
    
    496 502
         /* stop all running tasks. This is also where we stop concurrent non-moving
    
    497 503
          * collection if it's running */
    
    498 504
         exitScheduler(wait_foreign);
    
    505
    +#if defined(TRACING) && defined(THREADED_RTS)
    
    506
    +    RELEASE_LOCK(&eventlog_flush_mutex);
    
    507
    +#endif
    
    499 508
     
    
    500 509
         /* run C finalizers for all active weak pointers */
    
    501 510
         for (i = 0; i < getNumCapabilities(); i++) {
    

  • rts/eventlog/EventLog.c
    ... ... @@ -133,6 +133,13 @@ static EventsBuf eventBuf; // an EventsBuf not associated with any Capability
    133 133
     static Mutex eventBufMutex; // protected by this mutex
    
    134 134
     #endif
    
    135 135
     
    
    136
    +#if defined(THREADED_RTS)
    
    137
    +Mutex eventlog_flush_mutex;
    
    138
    +// Mutex which is taken when an eventlog is being flushed.
    
    139
    +// In particular, this mutex is taken during event shutdown to avoid races between
    
    140
    +// the shutdown thread and timer thread (#26573)
    
    141
    +#endif
    
    142
    +
    
    136 143
     // Event type
    
    137 144
     typedef struct _EventType {
    
    138 145
       EventTypeNum etNum;  // Event Type number.
    
    ... ... @@ -394,6 +401,7 @@ initEventLogging(void)
    394 401
     #if defined(THREADED_RTS)
    
    395 402
         initMutex(&eventBufMutex);
    
    396 403
         initMutex(&state_change_mutex);
    
    404
    +    initMutex(&eventlog_flush_mutex);
    
    397 405
     #endif
    
    398 406
     }
    
    399 407
     
    
    ... ... @@ -491,13 +499,7 @@ endEventLogging(void)
    491 499
     
    
    492 500
         eventlog_enabled = false;
    
    493 501
     
    
    494
    -    // Flush all events remaining in the buffers.
    
    495
    -    //
    
    496
    -    // N.B. Don't flush if shutting down: this was done in
    
    497
    -    // finishCapEventLogging and the capabilities have already been freed.
    
    498
    -    if (getSchedState() != SCHED_SHUTTING_DOWN) {
    
    499
    -        flushEventLog(NULL);
    
    500
    -    }
    
    502
    +    flushEventLog(NULL);
    
    501 503
     
    
    502 504
         ACQUIRE_LOCK(&eventBufMutex);
    
    503 505
     
    
    ... ... @@ -1626,6 +1628,26 @@ void flushEventLog(Capability **cap USED_IF_THREADS)
    1626 1628
             return;
    
    1627 1629
         }
    
    1628 1630
     
    
    1631
    +    // This lock is also taken during shutdown, so that any flush can be finished
    
    1632
    +    // before the shutdown procedure starts.
    
    1633
    +    ACQUIRE_LOCK(&eventlog_flush_mutex);
    
    1634
    +
    
    1635
    +    // N.B. Don't flush if shutting down: this was done in
    
    1636
    +    // finishCapEventLogging and the capabilities have already been freed.
    
    1637
    +    // This can also race against the shutdown if the flush is triggered by the
    
    1638
    +    // ticker thread. (#26573)
    
    1639
    +
    
    1640
    +    // Acquire the sched_mutex for the duration of the flush, since if the scheduler
    
    1641
    +    // starts to shut down after we have checked the status, then stopAllCapabilitiesWith will
    
    1642
    +    // block.
    
    1643
    +
    
    1644
    +    // Also if the scheduler is shutting down, then the rts_shutdown will perform a final flush of
    
    1645
    +    // all the buffers, so we don't also need to flush here.
    
    1646
    +    if (getSchedState() == SCHED_SHUTTING_DOWN) {
    
    1647
    +      RELEASE_LOCK(&eventlog_flush_mutex);
    
    1648
    +      return;
    
    1649
    +    }
    
    1650
    +
    
    1629 1651
         ACQUIRE_LOCK(&eventBufMutex);
    
    1630 1652
         printAndClearEventBuf(&eventBuf);
    
    1631 1653
         RELEASE_LOCK(&eventBufMutex);
    
    ... ... @@ -1639,6 +1661,7 @@ void flushEventLog(Capability **cap USED_IF_THREADS)
    1639 1661
         flushLocalEventsBuf(getCapability(0));
    
    1640 1662
     #endif
    
    1641 1663
         flushEventLogWriter();
    
    1664
    +    RELEASE_LOCK(&eventlog_flush_mutex);
    
    1642 1665
     }
    
    1643 1666
     
    
    1644 1667
     #else
    

  • rts/eventlog/EventLog.h
    ... ... @@ -19,6 +19,10 @@
    19 19
     
    
    20 20
     extern bool eventlog_enabled;
    
    21 21
     
    
    22
    +#if defined(THREADED_RTS)
    
    23
    +extern Mutex eventlog_flush_mutex;
    
    24
    +#endif
    
    25
    +
    
    22 26
     void initEventLogging(void);
    
    23 27
     void restartEventLogging(void);
    
    24 28
     void finishCapEventLogging(void);