[Git][ghc/ghc][master] 3 commits: Fix assert in Interpreter.c

27 Mar 2026


      Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
404b71c1 by Luite Stegeman at 2026-03-27T04:40:49-04:00
Fix assert in Interpreter.c

If we skip exactly the number of words on the stack we end up on
the first word in the next chunk.

- - - - -
a85bd503 by Luite Stegeman at 2026-03-27T04:40:49-04:00
Support arbitrary size unboxed tuples in bytecode

This stores the size (number of words on the stack) of the next
expected tuple in the TSO, ctoi_spill_size field, eliminating
the need of stg_ctoi_tN frames for each size.

Note: On 32 bit platform there is still a bytecode tuple size
limit of 255 words on the stack.

Fixes #26946

- - - - -
e2209031 by Luite Stegeman at 2026-03-27T04:40:49-04:00
Add specialized frames for small tuples

Small tuples are now returned more efficiently to the interpreter.
They use one less word of stack space and don't need manipulation
of the TSO anymore.

- - - - -


25 changed files:

- compiler/GHC/ByteCode/Asm.hs
- compiler/GHC/ByteCode/Instr.hs
- compiler/GHC/ByteCode/Types.hs
- compiler/GHC/StgToByteCode.hs
- rts/Apply.cmm
- rts/Continuation.c
- rts/ContinuationOps.cmm
- rts/Interpreter.c
- rts/Printer.c
- rts/RaiseAsync.c
- rts/RtsSymbols.c
- rts/Schedule.c
- rts/StgMiscClosures.cmm
- rts/Threads.c
- rts/Threads.h
- rts/include/rts/Bytecodes.h
- rts/include/rts/storage/TSO.h
- rts/include/stg/MiscClosures.h
- + testsuite/tests/bytecode/tuplestress/ByteCode.hs
- + testsuite/tests/bytecode/tuplestress/Common.hs-incl
- + testsuite/tests/bytecode/tuplestress/Obj.hs
- + testsuite/tests/bytecode/tuplestress/TupleStress.hs
- + testsuite/tests/bytecode/tuplestress/TupleStress.stdout
- + testsuite/tests/bytecode/tuplestress/all.T
- utils/deriveConstants/Main.hs


Changes:

=====================================
compiler/GHC/ByteCode/Asm.hs
=====================================
@@ -973,13 +973,16 @@ return_non_tuple V32 = error "return_non_tuple: vector"
 return_non_tuple V64 = error "return_non_tuple: vector"
 
 {-
-  we can only handle up to a fixed number of words on the stack,
-  because we need a stg_ctoi_tN stack frame for each size N. See
-  Note [unboxed tuple bytecodes and tuple_BCO].
+  The maximum number of words that can be spilled on the stack for
+  a tuple return. This is limited by the encoding of the stack
+  spill size in the call_info word (used by stg_ret_t):
 
-  If needed, you can support larger tuples by adding more in
-  Jumps.cmm, StgMiscClosures.cmm, Interpreter.c and MiscClosures.h and
-  raising this limit.
+    - On 32-bit platforms: 8-bit  (bits 24-31), max 255
+    - On 64-bit platforms: 40-bit (bits 24-63)
+
+  The stg_ctoi_t frame itself has no size limit since it reads the
+  spill count from the TSO's ctoi_tuple_spill_words field. See
+  Note [GHCi unboxed tuples stack spills] in StgMiscClosures.cmm.
 
   Note that the limit is the number of words passed on the stack.
   If the calling convention passes part of the tuple in registers, the
@@ -987,8 +990,10 @@ return_non_tuple V64 = error "return_non_tuple: vector"
   take multiple words on the stack (for example Double# on a 32 bit
   platform).
  -}
-maxTupleReturnNativeStackSize :: WordOff
-maxTupleReturnNativeStackSize = 62
+maxTupleReturnNativeStackSize :: Platform -> WordOff
+maxTupleReturnNativeStackSize platform = case platformWordSize platform of
+  PW4 -> 255
+  PW8 -> 1099511627775
 
 {-
   Construct the call_info word that stg_ctoi_t, stg_ret_t and stg_primcall
@@ -997,9 +1002,10 @@ maxTupleReturnNativeStackSize = 62
 
   See Note [GHCi and native call registers] for more information.
  -}
-mkNativeCallInfoSig :: Platform -> NativeCallInfo -> Word32
+mkNativeCallInfoSig :: Platform -> NativeCallInfo -> Word64
 mkNativeCallInfoSig platform NativeCallInfo{..}
-  | nativeCallType == NativeTupleReturn && nativeCallStackSpillSize > maxTupleReturnNativeStackSize
+  | nativeCallType == NativeTupleReturn
+  && nativeCallStackSpillSize > maxTupleReturnNativeStackSize platform
   = pprPanic "mkNativeCallInfoSig: tuple too big for the bytecode compiler"
              (ppr nativeCallStackSpillSize <+> text "stack words." <+>
               text "Use -fobject-code to get around this limit"
@@ -1008,8 +1014,9 @@ mkNativeCallInfoSig platform NativeCallInfo{..}
   = -- 24 bits for register bitmap
     assertPpr (length argRegs <= 24) (text "too many registers for bitmap:" <+> ppr (length argRegs))
 
-    -- 8 bits for continuation offset (only for NativeTupleReturn)
-    assertPpr (cont_offset < 255) (text "continuation offset too large:" <+> ppr cont_offset)
+    -- continuation offset must fit in available bits above the bitmap
+    assertPpr (cont_offset <= fromIntegral (maxTupleReturnNativeStackSize platform))
+              (text "continuation offset too large:" <+> ppr cont_offset)
 
     -- all regs accounted for
     assertPpr (all (`elem` (map fst argRegs)) (regSetToList nativeCallRegs))
@@ -1023,12 +1030,12 @@ mkNativeCallInfoSig platform NativeCallInfo{..}
 
     foldl' reg_bit 0 argRegs .|. (cont_offset `shiftL` 24)
   where
-    cont_offset :: Word32
+    cont_offset :: Word64
     cont_offset
       | nativeCallType == NativeTupleReturn = fromIntegral nativeCallStackSpillSize
       | otherwise                           = 0 -- there is no continuation for primcalls
 
-    reg_bit :: Word32 -> (GlobalReg, Int) -> Word32
+    reg_bit :: Word64 -> (GlobalReg, Int) -> Word64
     reg_bit x (r, n)
       | r `elemRegSet` nativeCallRegs = x .|. 1 `shiftL` n
       | otherwise                     = x


=====================================
compiler/GHC/ByteCode/Instr.hs
=====================================
@@ -488,11 +488,12 @@ bciStackUse PUSH_BCO{}            = 1
 bciStackUse (PUSH_ALTS bco _)     = 2 {- profiling only, restore CCCS -} +
                                     3 + protoBCOStackUse bco
 bciStackUse (PUSH_ALTS_TUPLE bco info _) =
-   -- (tuple_bco, call_info word, cont_bco, stg_ctoi_t)
-   -- tuple
-   -- (call_info, tuple_bco, stg_ret_t)
+   -- ctoi frame: small (4 words) or generic (5 words, with old_spill)
+   -- + tuple data + ret_t frame (3 words)
    1 {- profiling only -} +
-   7 + fromIntegral (nativeCallSize info) + protoBCOStackUse bco
+   ctoi_frame + 3 + fromIntegral (nativeCallSize info) + protoBCOStackUse bco
+   where ctoi_frame | nativeCallStackSpillSize info <= mAX_SMALL_TUPLE_CTOI = 4
+                    | otherwise                                             = 5
 bciStackUse (PUSH_PAD8)           = 1  -- overapproximation
 bciStackUse (PUSH_PAD16)          = 1  -- overapproximation
 bciStackUse (PUSH_PAD32)          = 1  -- overapproximation on 64bit arch


=====================================
compiler/GHC/ByteCode/Types.hs
=====================================
@@ -6,6 +6,8 @@
 --  (c) The University of Glasgow 2002-2006
 --
 
+#include "Bytecodes.h"
+
 -- | Bytecode assembler types
 module GHC.ByteCode.Types
   ( CompiledByteCode(..), seqCompiledByteCode
@@ -13,6 +15,7 @@ module GHC.ByteCode.Types
   , FFIInfo(..)
   , RegBitmap(..)
   , NativeCallType(..), NativeCallInfo(..), voidTupleReturnInfo, voidPrimCallInfo
+  , mAX_SMALL_TUPLE_CTOI
   , ByteOff(..), WordOff(..), HalfWord(..)
   , UnlinkedBCO(..), BCOPtr(..), BCONPtr(..)
   , ItblEnv, ItblPtr(..)
@@ -160,6 +163,12 @@ voidTupleReturnInfo = NativeCallInfo NativeTupleReturn 0 emptyRegSet 0
 voidPrimCallInfo :: NativeCallInfo
 voidPrimCallInfo = NativeCallInfo NativePrimCall 0 emptyRegSet 0
 
+-- | Maximum nativeCallStackSpillSize for which we use a small
+-- stg_ctoi_tN frame (no old_spill slot, no TSO access) instead of
+-- the generic stg_ctoi_t frame.
+mAX_SMALL_TUPLE_CTOI :: WordOff
+mAX_SMALL_TUPLE_CTOI = MAX_SMALL_TUPLE_CTOI
+
 type ItblEnv = NameEnv (Name, ItblPtr)
 type AddrEnv = NameEnv (Name, AddrPtr)
         -- We need the Name in the range so we know which


=====================================
compiler/GHC/StgToByteCode.hs
=====================================
@@ -1128,7 +1128,7 @@ doCase d s p scrut bndr alts
         -- 'Simple' tuples with at most one non-void component,
         -- like (# Word# #) or (# Int#, State# RealWorld #) do not have a
         -- tuple return frame. This is because (# foo #) and (# foo, Void# #)
-        -- have the same runtime rep. We have more efficient specialized
+        -- have the same runtime rep. We have more efficient small
         -- return frames for the situations with one non-void element.
 
         non_void_arg_reps = typeArgReps platform bndr_ty
@@ -1146,10 +1146,19 @@ doCase d s p scrut bndr alts
         -- When an alt is entered, it assumes the returned value is
         -- on top of the itbl; see Note [Return convention for non-tuple values]
         -- for details.
+        -- Whether this tuple return uses a small stg_ctoi_tN frame
+        -- (no old_spill slot, no TSO access) instead of the generic
+        -- stg_ctoi_t frame.
+        small_tuple_frame :: Bool
+        small_tuple_frame =
+          ubx_tuple_frame && nativeCallStackSpillSize call_info <= mAX_SMALL_TUPLE_CTOI
+
         ctoi_frame_header_w :: WordOff
         ctoi_frame_header_w
-          | ubx_tuple_frame =
+          | small_tuple_frame =
               if profiling then 5 else 4
+          | ubx_tuple_frame =
+              if profiling then 6 else 5
           | otherwise = 2
 
         -- The size of the ret_*_info frame header, whose frame returns the
@@ -1293,10 +1302,16 @@ doCase d s p scrut bndr alts
         -- case-of-case expressions, which is the only time we can be compiling a
         -- case expression with s /= 0.
 
-        -- unboxed tuples get two more words, the second is a pointer (tuple_bco)
+        -- unboxed tuples get extra words in the ctoi frame after the
+        -- info pointer and cont_BCO:
+        --   call_info, tuple_BCO, [old_spill], [CCCS]
+        -- tuple_BCO at position 1 is a pointer.
+        -- Small frames (stg_ctoi_tN) omit the old_spill slot.
         (extra_pointers, extra_slots)
-           | ubx_tuple_frame && profiling = ([1], 3) -- call_info, tuple_BCO, CCCS
-           | ubx_tuple_frame              = ([1], 2) -- call_info, tuple_BCO
+           | small_tuple_frame && profiling = ([1], 3) -- call_info, tuple_BCO, CCCS
+           | small_tuple_frame              = ([1], 2) -- call_info, tuple_BCO
+           | ubx_tuple_frame && profiling = ([1], 4) -- call_info, tuple_BCO, old_spill, CCCS
+           | ubx_tuple_frame              = ([1], 3) -- call_info, tuple_BCO, old_spill
            | otherwise                    = ([], 0)
 
         bitmap_size :: WordOff
@@ -1535,14 +1550,12 @@ for the call and and a stack offset. The layout is as follows:
                list is active. Bit 1 for the
                second register in the list and so on.
 
-  - bit 24-31: Unsigned byte indicating the stack offset
+  - bit 24+:   Unsigned value indicating the stack offset
                of the continuation in words. For tuple returns
                this is the number of words returned on the
                stack. For primcalls this field is unused, since
                we don't jump to a continuation.
 
-The upper 32 bits on 64 bit platforms are currently unused.
-
 If a register is smaller than a word on the stack (for example a
 single precision float on a 64 bit system), then the stack slot
 is padded to a whole word.
@@ -1551,8 +1564,8 @@ is padded to a whole word.
 
     If a tuple is returned in three registers and an additional two
     words on the stack, then three bits in the register bitmap
-    (bits 0-23) would be set. And bit 24-31 would be
-    00000010 (two in binary).
+    (bits 0-23) would be set. And the stack offset (bits 24+) would
+    encode the value two.
 
     The values on the stack before a call to POP_ARG_REGS would
     be as follows:
@@ -1580,7 +1593,7 @@ is padded to a whole word.
 
     At this point all the arguments are in place and we are ready
     to jump to the continuation, the location (offset from Sp) of
-    which is found by inspecting the value of bits 24-31. In this
+    which is found by inspecting the value of bits 24+. In this
     case the offset is two words.
 
 On x86_64, the double precision (Dn) and single precision
@@ -1734,9 +1747,11 @@ Note [unboxed tuple bytecodes and tuple_BCO]
      * tuple_BCO: see below
 
   The interpreter pushes these onto the stack when the PUSH_ALTS_TUPLE
-  instruction is executed, followed by stg_ctoi_tN_info, with N depending
-  on the number of stack words used by the tuple in the GHC native calling
-  convention. N is derived from call_info.
+  instruction is executed, followed by stg_ctoi_t_info. It also saves
+  the old ctoi_tuple_spill_words value from the TSO in the frame and sets
+  the TSO field to the number of stack words used by the tuple in the
+  GHC native calling convention. This spill count is derived from
+  call_info.
 
   For example if we expect a tuple with three words on the stack, the stack
   looks as follows after PUSH_ALTS_TUPLE:
@@ -1747,12 +1762,13 @@ Note [unboxed tuple bytecodes and tuple_BCO]
       cont_free_var_2
       ...
       cont_free_var_n
+      old_spill
       call_info
       tuple_BCO
       cont_BCO
-      stg_ctoi_t3_info <- Sp
+      stg_ctoi_t_info  <- Sp
 
-  If the tuple is returned by object code, stg_ctoi_t3 will deal with
+  If the tuple is returned by object code, stg_ctoi_t will deal with
   adjusting the stack pointer and converting the tuple to the bytecode
   calling convention. See Note [GHCi unboxed tuples stack spills] for more
   details.


=====================================
rts/Apply.cmm
=====================================
@@ -719,6 +719,8 @@ for:
     goto for;
   }
 
+  ccall restoreStackInvariants(CurrentTSO "ptr", Sp "ptr", Words);
+
   // Off we go!
   TICK_ENT_VIA_NODE();
 
@@ -776,6 +778,8 @@ for:
     goto for;
   }
 
+  ccall restoreStackInvariants(CurrentTSO "ptr", Sp "ptr", Words);
+
   // Off we go!
   TICK_ENT_VIA_NODE();
 


=====================================
rts/Continuation.c
=====================================
@@ -457,6 +457,11 @@ StgClosure *captureContinuationAndAbort(Capability *cap, StgTSO *tso, StgPromptT
       }
     }
 
+    // see Note [GHCi unboxed tuples stack spills]
+    if (info_ptr == &stg_ctoi_t_info) {
+      tso->ctoi_tuple_spill_words = frame[CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET];
+    }
+
     // Advance to the next frame.
     frame += stack_frame_sizeW((StgClosure *)frame);
   }


=====================================
rts/ContinuationOps.cmm
=====================================
@@ -200,6 +200,8 @@ stg_CONTINUATION_apply // explicit stack
   Sp_adj(-new_stack_words);
   prim %memcpy(Sp, p, WDS(new_stack_words), SIZEOF_W);
 
+  ccall restoreStackInvariants(CurrentTSO "ptr", Sp "ptr", new_stack_words);
+
   TICK_UNKNOWN_CALL();
   TICK_SLOW_CALL_fast_v();
 


=====================================
rts/Interpreter.c
=====================================
@@ -572,72 +572,6 @@ void interp_shutdown( void ){
 
 #endif
 
-const StgPtr ctoi_tuple_infos[] = {
-    (StgPtr) &stg_ctoi_t0_info,
-    (StgPtr) &stg_ctoi_t1_info,
-    (StgPtr) &stg_ctoi_t2_info,
-    (StgPtr) &stg_ctoi_t3_info,
-    (StgPtr) &stg_ctoi_t4_info,
-    (StgPtr) &stg_ctoi_t5_info,
-    (StgPtr) &stg_ctoi_t6_info,
-    (StgPtr) &stg_ctoi_t7_info,
-    (StgPtr) &stg_ctoi_t8_info,
-    (StgPtr) &stg_ctoi_t9_info,
-    (StgPtr) &stg_ctoi_t10_info,
-    (StgPtr) &stg_ctoi_t11_info,
-    (StgPtr) &stg_ctoi_t12_info,
-    (StgPtr) &stg_ctoi_t13_info,
-    (StgPtr) &stg_ctoi_t14_info,
-    (StgPtr) &stg_ctoi_t15_info,
-    (StgPtr) &stg_ctoi_t16_info,
-    (StgPtr) &stg_ctoi_t17_info,
-    (StgPtr) &stg_ctoi_t18_info,
-    (StgPtr) &stg_ctoi_t19_info,
-    (StgPtr) &stg_ctoi_t20_info,
-    (StgPtr) &stg_ctoi_t21_info,
-    (StgPtr) &stg_ctoi_t22_info,
-    (StgPtr) &stg_ctoi_t23_info,
-    (StgPtr) &stg_ctoi_t24_info,
-    (StgPtr) &stg_ctoi_t25_info,
-    (StgPtr) &stg_ctoi_t26_info,
-    (StgPtr) &stg_ctoi_t27_info,
-    (StgPtr) &stg_ctoi_t28_info,
-    (StgPtr) &stg_ctoi_t29_info,
-    (StgPtr) &stg_ctoi_t30_info,
-    (StgPtr) &stg_ctoi_t31_info,
-    (StgPtr) &stg_ctoi_t32_info,
-    (StgPtr) &stg_ctoi_t33_info,
-    (StgPtr) &stg_ctoi_t34_info,
-    (StgPtr) &stg_ctoi_t35_info,
-    (StgPtr) &stg_ctoi_t36_info,
-    (StgPtr) &stg_ctoi_t37_info,
-    (StgPtr) &stg_ctoi_t38_info,
-    (StgPtr) &stg_ctoi_t39_info,
-    (StgPtr) &stg_ctoi_t40_info,
-    (StgPtr) &stg_ctoi_t41_info,
-    (StgPtr) &stg_ctoi_t42_info,
-    (StgPtr) &stg_ctoi_t43_info,
-    (StgPtr) &stg_ctoi_t44_info,
-    (StgPtr) &stg_ctoi_t45_info,
-    (StgPtr) &stg_ctoi_t46_info,
-    (StgPtr) &stg_ctoi_t47_info,
-    (StgPtr) &stg_ctoi_t48_info,
-    (StgPtr) &stg_ctoi_t49_info,
-    (StgPtr) &stg_ctoi_t50_info,
-    (StgPtr) &stg_ctoi_t51_info,
-    (StgPtr) &stg_ctoi_t52_info,
-    (StgPtr) &stg_ctoi_t53_info,
-    (StgPtr) &stg_ctoi_t54_info,
-    (StgPtr) &stg_ctoi_t55_info,
-    (StgPtr) &stg_ctoi_t56_info,
-    (StgPtr) &stg_ctoi_t57_info,
-    (StgPtr) &stg_ctoi_t58_info,
-    (StgPtr) &stg_ctoi_t59_info,
-    (StgPtr) &stg_ctoi_t60_info,
-    (StgPtr) &stg_ctoi_t61_info,
-    (StgPtr) &stg_ctoi_t62_info,
-};
-
 #if defined(PROFILING)
 
 //
@@ -710,7 +644,7 @@ slow_spw(void *Sp, StgStack *cur_stack, StgWord offset_words){
 
       // How many words were on the stack
       stackWords = (StgWord *)frame - (StgWord *) Sp;
-      ASSERT(offset_words > stackWords);
+      ASSERT(offset_words >= stackWords);
 
       // Recursive, in the very unlikely case we have to traverse two
       // stack chunks.
@@ -1317,10 +1251,12 @@ do_return_nonpointer:
                  things on the stack. Therefore we store the CCCS inside the
                  stg_ctoi_t frame.
 
-                 If we have a tuple being returned, the stack looks like this:
+                 If we have a tuple being returned, the stack looks like this
+                 for the generic stg_ctoi_t frame:
 
                      ...
-                     <CCCS>           <- to restore, Sp offset <next frame + 4 words>
+                     <CCCS>           <- to restore, Sp offset <next frame + 5 words>
+                     old_spill
                      tuple_BCO
                      tuple_info
                      cont_BCO
@@ -1331,13 +1267,31 @@ do_return_nonpointer:
                      tuple_info
                      tuple_BCO
                      stg_ret_t        <- Sp
+
+                 Small frames (stg_ctoi_tN) omit the old_spill slot,
+                 so CCCS is at offset <next frame + 4 words>.
                */
 
               if(SpW(0) == (W_)&stg_ret_t_info) {
-                  cap->r.rCCCS = (CostCentreStack*)ReadSpW(offset + 4);
+                  StgWord cccs_offset =
+                      (ReadSpW(offset) == (W_)&stg_ctoi_t_info) ? 5 : 4;
+                  cap->r.rCCCS = (CostCentreStack*)ReadSpW(offset + cccs_offset);
               }
 #endif
 
+              /* When returning a tuple to a generic stg_ctoi_t frame
+                 (as opposed to a small stg_ctoi_tN frame), restore
+                 tso->ctoi_tuple_spill_words from the frame's old_spill
+                 slot.
+
+                 See Note [GHCi unboxed tuples stack spills] in
+                 StgMiscClosures.cmm. */
+              if(SpW(0) == (W_)&stg_ret_t_info
+                 && ReadSpW(offset) == (W_)&stg_ctoi_t_info) {
+                  cap->r.rCurrentTSO->ctoi_tuple_spill_words =
+                      ReadSpW(offset + CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET);
+              }
+
               /* Keep the ret frame and the ctoi frame for run_BCO.
                * See Note [Stack layout when entering run_BCO] */
               goto run_BCO;
@@ -2332,22 +2286,47 @@ run_BCO:
             W_ o_bco = BCO_GET_LARGE_ARG;
             W_ tuple_info = (W_)BCO_LIT(BCO_GET_LARGE_ARG);
             W_ o_tuple_bco = BCO_GET_LARGE_ARG;
+            int tuple_stack_words = tuple_info >> 24;
 
 #if defined(PROFILING)
             SpW(-1) = (W_)cap->r.rCCCS;
             Sp_subW(1);
 #endif
 
-            SpW(-1) = BCO_PTR(o_tuple_bco);
-            SpW(-2) = tuple_info;
-            SpW(-3) = BCO_PTR(o_bco);
-            int tuple_stack_words = (tuple_info >> 24) & 0xff;
-            if (tuple_stack_words > 62) {
-                barf("unsupported tuple size %d", tuple_stack_words);
+            /* See Note [GHCi unboxed tuples stack spills] in
+               StgMiscClosures.cmm */
+            if (tuple_stack_words <= MAX_SMALL_TUPLE_CTOI) {
+                /* Use a small info table that encodes the spill
+                   count statically, avoiding access to
+                   TSO->ctoi_tuple_spill_words entirely.
+                   The frame is one word smaller than stg_ctoi_t
+                   (no old_spill slot). */
+                static const StgInfoTable *const ctoi_t_small[] = {
+                    &stg_ctoi_t0_info, &stg_ctoi_t1_info,
+                    &stg_ctoi_t2_info, &stg_ctoi_t3_info,
+                    &stg_ctoi_t4_info, &stg_ctoi_t5_info,
+                    &stg_ctoi_t6_info, &stg_ctoi_t7_info,
+                    &stg_ctoi_t8_info
+                };
+                _Static_assert(sizeof(ctoi_t_small) / sizeof(ctoi_t_small[0])
+                    == MAX_SMALL_TUPLE_CTOI + 1,
+                    "ctoi_t_small must have MAX_SMALL_TUPLE_CTOI + 1 entries");
+                SpW(-1) = BCO_PTR(o_tuple_bco);
+                SpW(-2) = tuple_info;
+                SpW(-3) = BCO_PTR(o_bco);
+                SpW(-4) = (W_)ctoi_t_small[tuple_stack_words];
+                Sp_subW(4);
+            } else {
+                /* Generic path: save/restore ctoi_tuple_spill_words
+                   via the TSO */
+                SpW(-1) = cap->r.rCurrentTSO->ctoi_tuple_spill_words;
+                SpW(-2) = BCO_PTR(o_tuple_bco);
+                SpW(-3) = tuple_info;
+                SpW(-4) = BCO_PTR(o_bco);
+                SpW(-5) = (W_)&stg_ctoi_t_info;
+                Sp_subW(5);
+                cap->r.rCurrentTSO->ctoi_tuple_spill_words = tuple_stack_words;
             }
-            W_ ctoi_t_offset = (W_) ctoi_tuple_infos[tuple_stack_words];
-            SpW(-4) = ctoi_t_offset;
-            Sp_subW(4);
             NEXT_INSTRUCTION;
         }
 


=====================================
rts/Printer.c
=====================================
@@ -705,6 +705,8 @@ printStackChunk( StgPtr sp, StgPtr spBottom )
                 debugBelch("stg_apply_interp_info" );
             } else if (c == (StgWord)&stg_ret_t_info) {
                 debugBelch("stg_ret_t_info" );
+            } else if (c == (StgWord)&stg_ctoi_t_info) {
+                debugBelch("stg_ctoi_t_info" );
             } else if (c == (StgWord)&stg_ctoi_t0_info) {
                 debugBelch("stg_ctoi_t0_info" );
             } else if (c == (StgWord)&stg_ctoi_t1_info) {
@@ -723,8 +725,6 @@ printStackChunk( StgPtr sp, StgPtr spBottom )
                 debugBelch("stg_ctoi_t7_info" );
             } else if (c == (StgWord)&stg_ctoi_t8_info) {
                 debugBelch("stg_ctoi_t8_info" );
-            /* there are more stg_ctoi_tN_info frames,
-               but we don't print them all */
             } else {
                 debugBelch("RET_BCO");
             }


=====================================
rts/RaiseAsync.c
=====================================
@@ -1074,6 +1074,11 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
                 tso->flags |= TSO_BLOCKEX;
                 tso->flags &= ~TSO_INTERRUPTIBLE;
             }
+            // see Note [GHCi unboxed tuples stack spills] in
+            // StgMiscClosures.cmm
+            if (*frame == (W_)&stg_ctoi_t_info) {
+                tso->ctoi_tuple_spill_words = frame[CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET];
+            }
             break;
         }
 


=====================================
rts/RtsSymbols.c
=====================================
@@ -473,7 +473,16 @@ extern char **environ;
       SymI_HasDataProto(stg_ret_d_info)                                     \
       SymI_HasDataProto(stg_ret_l_info)                                     \
       SymI_HasDataProto(stg_ret_t_info)                                     \
-      SymI_HasDataProto(stg_ctoi_t)                                         \
+      SymI_HasDataProto(stg_ctoi_t_info)                                    \
+      SymI_HasDataProto(stg_ctoi_t0_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t1_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t2_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t3_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t4_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t5_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t6_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t7_info)                                   \
+      SymI_HasDataProto(stg_ctoi_t8_info)                                   \
       SymI_HasDataProto(stg_primcall_info)                                  \
       SymI_HasDataProto(stg_gc_prim_p)                                      \
       SymI_HasDataProto(stg_gc_prim_pp)                                     \


=====================================
rts/Schedule.c
=====================================
@@ -3110,6 +3110,11 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
                 tso->flags |= TSO_BLOCKEX;
                 tso->flags &= ~TSO_INTERRUPTIBLE;
             }
+            // see Note [GHCi unboxed tuples stack spills] in
+            // StgMiscClosures.cmm
+            if (*p == (StgWord)&stg_ctoi_t_info) {
+                tso->ctoi_tuple_spill_words = p[CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET]; // restore old_spill
+            }
             p = next;
             continue;
         }


=====================================
rts/StgMiscClosures.cmm
=====================================
@@ -230,25 +230,22 @@ INFO_TABLE_RET( stg_ctoi_V, RET_BCO )
        spilled_2
        spilled_3   <- Sp
 
-   This makes it difficult to write a procedure that can handle tuples of
-   any size.
+   stg_ctoi_t reads the number of spilled words from the
+   ctoi_tuple_spill_words field in the TSO to skip over the spilled data
+   on the stack. This field is set by the interpreter when pushing
+   the stg_ctoi_t frame (bci_PUSH_ALTS_T instruction). The old
+   value of the TSO field is saved in the frame itself, to handle
+   nested tuple returns correctly.
 
-   To get around this, we use a Cmm procedure that adjusts the stack pointer
-   to skip over the tuple:
-
-       ...
-       stg_ctoi_t3  (advances Sp by 3 words, then calls stg_ctoi_t)
-       spilled_1
-       spilled_2
-       spilled_3    <- Sp
-
-   When stg_ctoi_t is called, the stack looks like:
+   When stg_ctoi_t has adjusted Sp and read the frame, the stack
+   looks like:
 
        ...
+       old_spill
        tuple_BCO
        tuple_info
        cont_BCO     (continuation in bytecode)
-       stg_ctoi_t3  <- Sp
+       stg_ctoi_t   <- Sp
        spilled_1
        spilled_2
        spilled_3
@@ -258,10 +255,11 @@ INFO_TABLE_RET( stg_ctoi_V, RET_BCO )
    stack looks as follows:
 
        ...
+       old_spill
        tuple_BCO
        tuple_info
        cont_BCO
-       stg_ctoi_t3
+       stg_ctoi_t
        spilled_1
        spilled_2
        spilled_3
@@ -279,108 +277,52 @@ INFO_TABLE_RET( stg_ctoi_V, RET_BCO )
 
    At this point we can safely jump to the interpreter.
 
- */
+   We maintain the following invariants around the spill info:
 
-#define MK_STG_CTOI_T(N) INFO_TABLE_RET( \
-  stg_ctoi_t ## N, RET_BCO ) \
-  { Sp_adj(N); jump stg_ctoi_t SCALAR_ARG_REGS; }
-
-MK_STG_CTOI_T(0)
-MK_STG_CTOI_T(1)
-MK_STG_CTOI_T(2)
-MK_STG_CTOI_T(3)
-MK_STG_CTOI_T(4)
-MK_STG_CTOI_T(5)
-MK_STG_CTOI_T(6)
-MK_STG_CTOI_T(7)
-MK_STG_CTOI_T(8)
-MK_STG_CTOI_T(9)
-
-MK_STG_CTOI_T(10)
-MK_STG_CTOI_T(11)
-MK_STG_CTOI_T(12)
-MK_STG_CTOI_T(13)
-MK_STG_CTOI_T(14)
-MK_STG_CTOI_T(15)
-MK_STG_CTOI_T(16)
-MK_STG_CTOI_T(17)
-MK_STG_CTOI_T(18)
-MK_STG_CTOI_T(19)
-
-MK_STG_CTOI_T(20)
-MK_STG_CTOI_T(21)
-MK_STG_CTOI_T(22)
-MK_STG_CTOI_T(23)
-MK_STG_CTOI_T(24)
-MK_STG_CTOI_T(25)
-MK_STG_CTOI_T(26)
-MK_STG_CTOI_T(27)
-MK_STG_CTOI_T(28)
-MK_STG_CTOI_T(29)
-
-MK_STG_CTOI_T(30)
-MK_STG_CTOI_T(31)
-MK_STG_CTOI_T(32)
-MK_STG_CTOI_T(33)
-MK_STG_CTOI_T(34)
-MK_STG_CTOI_T(35)
-MK_STG_CTOI_T(36)
-MK_STG_CTOI_T(37)
-MK_STG_CTOI_T(38)
-MK_STG_CTOI_T(39)
-
-MK_STG_CTOI_T(40)
-MK_STG_CTOI_T(41)
-MK_STG_CTOI_T(42)
-MK_STG_CTOI_T(43)
-MK_STG_CTOI_T(44)
-MK_STG_CTOI_T(45)
-MK_STG_CTOI_T(46)
-MK_STG_CTOI_T(47)
-MK_STG_CTOI_T(48)
-MK_STG_CTOI_T(49)
-
-MK_STG_CTOI_T(50)
-MK_STG_CTOI_T(51)
-MK_STG_CTOI_T(52)
-MK_STG_CTOI_T(53)
-MK_STG_CTOI_T(54)
-MK_STG_CTOI_T(55)
-MK_STG_CTOI_T(56)
-MK_STG_CTOI_T(57)
-MK_STG_CTOI_T(58)
-MK_STG_CTOI_T(59)
-
-MK_STG_CTOI_T(60)
-MK_STG_CTOI_T(61)
-MK_STG_CTOI_T(62)
+     - tso->ctoi_tuple_spill_words == (frame[CTOI_TUPLE_INFO_OFFSET] >> 24)
+       where frame is the topmost stg_ctoi_t frame on the tso's stack.
+     - for each stg_ctoi_t frame, ctoi_t_frame[CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET]
+       is equal to previous_ctoi_t_frame[CTOI_TUPLE_INFO_OFFSET] >> 24
+
+   This affects unwinding/capturing/restoring the stack for exceptions
+   and continuations.
+ */
 
 /*
   Convert a tuple return value to be used in bytecode.
 
   See Note [GHCi and native call registers] for information on how
   values are moved between the stack and registers.
+
+  See Note [GHCi unboxed tuples stack spills] for the stack layout.
  */
 
-stg_ctoi_t
-    /* explicit stack */
+INFO_TABLE_RET( stg_ctoi_t, RET_BCO )
 {
-
-    W_ tuple_info, tuple_stack;
+    W_ tuple_spill, tuple_info;
     P_ tuple_BCO;
+    W_ old_spill;
+
+    /* read number of spilled stack words from the TSO */
+    tuple_spill = StgTSO_ctoi_tuple_spill_words(CurrentTSO);
+
+    /* skip over tuple data on the stack */
+    Sp = Sp + WDS(tuple_spill);
 
     tuple_info = Sp(2); /* tuple information word */
     tuple_BCO  = Sp(3); /* bytecode object that returns the tuple in
                            the interpreter */
+    old_spill  = Sp(4); /* saved ctoi_tuple_spill_words from TSO */
 
 #if defined(PROFILING)
-    CCCS = Sp(4);
+    CCCS = Sp(5);
 #endif
 
-    /* number of words spilled on stack */
-    tuple_stack  = (tuple_info >> 24) & 0xff;
+    /* restore old spill count in the TSO */
+    StgTSO_ctoi_tuple_spill_words(CurrentTSO) = old_spill;
 
-    Sp = Sp - WDS(tuple_stack);
+    /* move Sp back down to include spilled data */
+    Sp = Sp - WDS(tuple_spill);
 
     PUSH_SCALAR_ARG_REGS(tuple_info);
 
@@ -393,6 +335,59 @@ stg_ctoi_t
     jump stg_yield_to_interpreter [];
 }
 
+/*
+  Small versions of stg_ctoi_t for small spill counts (0..MAX_SMALL_TUPLE_CTOI
+  words).
+
+  These avoid accessing TSO->ctoi_tuple_spill_words entirely, since the
+  spill count is known statically from the info table.
+
+  The frame layout is one word smaller than stg_ctoi_t, omitting
+  the old_spill slot:
+
+      CCCS               (profiling only)
+      tuple_BCO
+      tuple_info
+      cont_BCO
+      stg_ctoi_tN_info   (N = spill count, words)
+
+  Exception unwinding code and restoreStackInvariants only match
+  stg_ctoi_t_info, so these frames are correctly skipped.
+
+  See Note [GHCi unboxed tuples stack spills] for the general design.
+ */
+
+#if defined(PROFILING)
+#define CTOI_TN_RESTORE_CCS  CCCS = Sp(4);
+#else
+#define CTOI_TN_RESTORE_CCS
+#endif
+
+#define STG_CTOI_TN_BODY(n)                       \
+    W_ tuple_info;                                 \
+    P_ tuple_BCO;                                  \
+    Sp = Sp + WDS(n);                              \
+    tuple_info = Sp(2);                            \
+    tuple_BCO  = Sp(3);                            \
+    CTOI_TN_RESTORE_CCS                            \
+    Sp = Sp - WDS(n);                              \
+    PUSH_SCALAR_ARG_REGS(tuple_info);              \
+    Sp_adj(-3);                                    \
+    Sp(2) = tuple_info;                            \
+    Sp(1) = tuple_BCO;                             \
+    Sp(0) = stg_ret_t_info;                        \
+    jump stg_yield_to_interpreter [];
+
+INFO_TABLE_RET( stg_ctoi_t0, RET_BCO ) { STG_CTOI_TN_BODY(0) }
+INFO_TABLE_RET( stg_ctoi_t1, RET_BCO ) { STG_CTOI_TN_BODY(1) }
+INFO_TABLE_RET( stg_ctoi_t2, RET_BCO ) { STG_CTOI_TN_BODY(2) }
+INFO_TABLE_RET( stg_ctoi_t3, RET_BCO ) { STG_CTOI_TN_BODY(3) }
+INFO_TABLE_RET( stg_ctoi_t4, RET_BCO ) { STG_CTOI_TN_BODY(4) }
+INFO_TABLE_RET( stg_ctoi_t5, RET_BCO ) { STG_CTOI_TN_BODY(5) }
+INFO_TABLE_RET( stg_ctoi_t6, RET_BCO ) { STG_CTOI_TN_BODY(6) }
+INFO_TABLE_RET( stg_ctoi_t7, RET_BCO ) { STG_CTOI_TN_BODY(7) }
+INFO_TABLE_RET( stg_ctoi_t8, RET_BCO ) { STG_CTOI_TN_BODY(8) }
+
 INFO_TABLE_RET( stg_ret_t, RET_BCO )
 {
     W_ tuple_info, tuple_stack;
@@ -401,7 +396,7 @@ INFO_TABLE_RET( stg_ret_t, RET_BCO )
     Sp_adj(3);
 
     /* number of words spilled on stack */
-    tuple_stack  = (tuple_info >> 24) & 0xff;
+    tuple_stack  = tuple_info >> 24;
 
     POP_SCALAR_ARG_REGS(tuple_info);
 


=====================================
rts/Threads.c
=====================================
@@ -114,6 +114,8 @@ createThread(Capability *cap, W_ size)
 
     ASSIGN_Int64((W_*)&(tso->alloc_limit), 0);
 
+    tso->ctoi_tuple_spill_words = 0;
+
     tso->trec = NO_TREC;
     tso->label = NULL;
 
@@ -1053,3 +1055,38 @@ printThreadQueue(StgTSO *t)
 }
 
 #endif /* DEBUG */
+
+/*
+ * restoreStackInvariants: restore stack invariants
+ *
+ * This should be called after restoring a captured stack from
+ * sp .. sp + words
+ */
+void
+restoreStackInvariants(StgTSO *tso, StgPtr sp, StgWord words)
+{
+    StgPtr end = sp + words;
+    StgPtr frame = sp;
+
+    /*
+       Restore ctoi_tuple_spill_words invariants after adding stack:
+
+         - set the saved value in the last stg_ctoi_t frame to the current
+              tso->ctoi_tuple_spill_words
+         - set tso->ctoi_tuple_spill_words to the value in the first stg_ctoi_t frame
+
+       See Note [GHCi unboxed tuples stack spills] in StgMiscClosures.cmm.
+     */
+     StgPtr first_ctoi_frame = NULL, last_ctoi_frame = NULL;
+     while (frame < end) {
+        if (*(StgWord*)frame == (StgWord)&stg_ctoi_t_info) {
+            if(first_ctoi_frame == NULL) first_ctoi_frame = frame;
+            last_ctoi_frame = frame;
+        }
+        frame += stack_frame_sizeW((StgClosure *)frame);
+    }
+    if(last_ctoi_frame != NULL) {
+        last_ctoi_frame[CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET] = tso->ctoi_tuple_spill_words;
+        tso->ctoi_tuple_spill_words = first_ctoi_frame[CTOI_TUPLE_INFO_OFFSET] >> 24;
+    }
+}


=====================================
rts/Threads.h
=====================================
@@ -40,6 +40,10 @@ StgBool isThreadBound (StgTSO* tso);
 void threadStackOverflow  (Capability *cap, StgTSO *tso);
 W_   threadStackUnderflow (Capability *cap, StgTSO *tso);
 
+#define CTOI_OLD_TUPLE_SPILL_WORDS_OFFSET 4
+#define CTOI_TUPLE_INFO_OFFSET 2
+void restoreStackInvariants(StgTSO *tso, StgPtr sp, StgWord words);
+
 #if defined(DEBUG)
 void printThreadBlockage (StgTSO *tso);
 void printThreadStatus (StgTSO *t);


=====================================
rts/include/rts/Bytecodes.h
=====================================
@@ -232,4 +232,10 @@
    cases. */
 #define INTERP_STACK_CHECK_THRESH 50
 
+/* Maximum nativeCallStackSpillSize for which we use a small stg_ctoi_tN
+   frame (no old_spill slot, no TSO access) instead of the generic
+   stg_ctoi_t frame.  Must match the stg_ctoi_tN definitions in
+   StgMiscClosures.cmm. */
+#define MAX_SMALL_TUPLE_CTOI    8
+
 /*-------------------------------------------------------------------------*/


=====================================
rts/include/rts/storage/TSO.h
=====================================
@@ -186,6 +186,15 @@ typedef struct StgTSO_ {
      */
     StgWord32  tot_stack_size;
 
+    /*
+     * The number of stack words spilled by the current stg_ctoi_t
+     * frame. This is used by stg_ctoi_t to handle tuple returns from compiled
+     * to interpreted code.
+     *
+     * See Note [GHCi unboxed tuples stack spills] in StgMiscClosures.cmm
+     */
+    StgWord    ctoi_tuple_spill_words;
+
 #if defined(TICKY_TICKY)
     /* TICKY-specific stuff would go here. */
 #endif


=====================================
rts/include/stg/MiscClosures.h
=====================================
@@ -96,7 +96,7 @@ RTS_RET(stg_ctoi_D1);
 RTS_RET(stg_ctoi_L1);
 RTS_RET(stg_ctoi_V);
 
-RTS_FUN_DECL(stg_ctoi_t);
+RTS_RET(stg_ctoi_t);
 RTS_RET(stg_ctoi_t0);
 RTS_RET(stg_ctoi_t1);
 RTS_RET(stg_ctoi_t2);
@@ -106,66 +106,6 @@ RTS_RET(stg_ctoi_t5);
 RTS_RET(stg_ctoi_t6);
 RTS_RET(stg_ctoi_t7);
 RTS_RET(stg_ctoi_t8);
-RTS_RET(stg_ctoi_t9);
-
-RTS_RET(stg_ctoi_t10);
-RTS_RET(stg_ctoi_t11);
-RTS_RET(stg_ctoi_t12);
-RTS_RET(stg_ctoi_t13);
-RTS_RET(stg_ctoi_t14);
-RTS_RET(stg_ctoi_t15);
-RTS_RET(stg_ctoi_t16);
-RTS_RET(stg_ctoi_t17);
-RTS_RET(stg_ctoi_t18);
-RTS_RET(stg_ctoi_t19);
-
-RTS_RET(stg_ctoi_t20);
-RTS_RET(stg_ctoi_t21);
-RTS_RET(stg_ctoi_t22);
-RTS_RET(stg_ctoi_t23);
-RTS_RET(stg_ctoi_t24);
-RTS_RET(stg_ctoi_t25);
-RTS_RET(stg_ctoi_t26);
-RTS_RET(stg_ctoi_t27);
-RTS_RET(stg_ctoi_t28);
-RTS_RET(stg_ctoi_t29);
-
-RTS_RET(stg_ctoi_t30);
-RTS_RET(stg_ctoi_t31);
-RTS_RET(stg_ctoi_t32);
-RTS_RET(stg_ctoi_t33);
-RTS_RET(stg_ctoi_t34);
-RTS_RET(stg_ctoi_t35);
-RTS_RET(stg_ctoi_t36);
-RTS_RET(stg_ctoi_t37);
-RTS_RET(stg_ctoi_t38);
-RTS_RET(stg_ctoi_t39);
-
-RTS_RET(stg_ctoi_t40);
-RTS_RET(stg_ctoi_t41);
-RTS_RET(stg_ctoi_t42);
-RTS_RET(stg_ctoi_t43);
-RTS_RET(stg_ctoi_t44);
-RTS_RET(stg_ctoi_t45);
-RTS_RET(stg_ctoi_t46);
-RTS_RET(stg_ctoi_t47);
-RTS_RET(stg_ctoi_t48);
-RTS_RET(stg_ctoi_t49);
-
-RTS_RET(stg_ctoi_t50);
-RTS_RET(stg_ctoi_t51);
-RTS_RET(stg_ctoi_t52);
-RTS_RET(stg_ctoi_t53);
-RTS_RET(stg_ctoi_t54);
-RTS_RET(stg_ctoi_t55);
-RTS_RET(stg_ctoi_t56);
-RTS_RET(stg_ctoi_t57);
-RTS_RET(stg_ctoi_t58);
-RTS_RET(stg_ctoi_t59);
-
-RTS_RET(stg_ctoi_t60);
-RTS_RET(stg_ctoi_t61);
-RTS_RET(stg_ctoi_t62);
 
 RTS_RET(stg_primcall);
 RTS_RET(stg_apply_interp);


=====================================
testsuite/tests/bytecode/tuplestress/ByteCode.hs
=====================================
@@ -0,0 +1,9 @@
+{-# LANGUAGE CPP, UnboxedTuples, MagicHash #-}
+{-# OPTIONS_GHC -fbyte-code #-}
+
+module ByteCode where
+
+import GHC.Exts
+import GHC.Word
+
+#include "Common.hs-incl"


=====================================
testsuite/tests/bytecode/tuplestress/Common.hs-incl
=====================================
@@ -0,0 +1,492 @@
+-- Stress test definitions for unboxed tuples in the bytecode interpreter.
+--
+-- See Note [Unboxed tuple stress test] for an overview.
+
+-- Note [Unboxed tuple stress test]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- This test exercises the marshalling of unboxed tuples between
+-- native code and the bytecode interpreter. It systematically tests
+-- various tuple sizes around key boundaries (register capacity,
+-- small frame limit), different element types (pointers, Int#,
+-- Double#, Float#, Word64#, sub-word types), mixed type combinations,
+-- and void components.
+--
+-- For each tuple type, a NOINLINE producer and consumer are defined.
+-- The main test calls each through all four combinations of
+-- bytecode/native producer x bytecode/native consumer.
+--
+-- Key boundaries on x86_64:
+--   - 6 vanilla registers for pointers and non-pointer words
+--   - 6 float/double registers
+--   - small_tuple_frame: nativeCallStackSpillSize <= mAX_SMALL_TUPLE_CTOI
+--   - generic stg_ctoi_t frame for larger spills
+
+-- ============================================================
+-- Pure pointer tuples
+-- ============================================================
+
+{-# NOINLINE p2 #-}
+p2 :: a -> a -> (# a, a #)
+p2 x1 x2 = (# x1, x2 #)
+
+{-# NOINLINE p2_a #-}
+p2_a :: (a -> a -> (# a, a #)) -> a -> a -> (a, a)
+p2_a f x1 x2 = case f x1 x2 of (# y1, y2 #) -> (y1, y2)
+
+{-# NOINLINE p7 #-}
+p7 :: a -> a -> a -> a -> a -> a -> a
+   -> (# a, a, a, a, a, a, a #)
+p7 x1 x2 x3 x4 x5 x6 x7 =
+    (# x1, x2, x3, x4, x5, x6, x7 #)
+
+{-# NOINLINE p7_a #-}
+p7_a :: (a -> a -> a -> a -> a -> a -> a
+     -> (# a, a, a, a, a, a, a #))
+     -> a -> a -> a -> a -> a -> a -> a
+     -> (a, a, a, a, a, a, a)
+p7_a f x1 x2 x3 x4 x5 x6 x7 =
+    case f x1 x2 x3 x4 x5 x6 x7 of
+      (# y1, y2, y3, y4, y5, y6, y7 #) ->
+        (y1, y2, y3, y4, y5, y6, y7)
+
+-- ============================================================
+-- Pure Int# tuples
+-- ============================================================
+
+{-# NOINLINE n2 #-}
+n2 :: Int -> Int -> (# Int#, Int# #)
+n2 (I# x1) (I# x2) = (# x1, x2 #)
+
+{-# NOINLINE n2_a #-}
+n2_a :: (Int -> Int -> (# Int#, Int# #)) -> Int -> Int -> (Int, Int)
+n2_a f x1 x2 = case f x1 x2 of (# y1, y2 #) -> (I# y1, I# y2)
+
+{-# NOINLINE n7 #-}
+n7 :: Int -> Int -> Int -> Int -> Int -> Int -> Int
+   -> (# Int#, Int#, Int#, Int#, Int#, Int#, Int# #)
+n7 (I# x1) (I# x2) (I# x3) (I# x4) (I# x5) (I# x6) (I# x7) =
+    (# x1, x2, x3, x4, x5, x6, x7 #)
+
+{-# NOINLINE n7_a #-}
+n7_a :: (Int -> Int -> Int -> Int -> Int -> Int -> Int
+     -> (# Int#, Int#, Int#, Int#, Int#, Int#, Int# #))
+     -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+     -> (Int, Int, Int, Int, Int, Int, Int)
+n7_a f x1 x2 x3 x4 x5 x6 x7 =
+    case f x1 x2 x3 x4 x5 x6 x7 of
+      (# y1, y2, y3, y4, y5, y6, y7 #) ->
+        (I# y1, I# y2, I# y3, I# y4, I# y5, I# y6, I# y7)
+
+type TN15 = Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> (# Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#
+            ,  Int#, Int#, Int#, Int#, Int#, Int#, Int# #)
+
+{-# NOINLINE n15 #-}
+n15 :: TN15
+n15 (I# x1) (I# x2) (I# x3) (I# x4) (I# x5) (I# x6) (I# x7) (I# x8)
+    (I# x9) (I# x10) (I# x11) (I# x12) (I# x13) (I# x14) (I# x15) =
+    (# x1, x2, x3, x4, x5, x6, x7, x8
+    ,  x9, x10, x11, x12, x13, x14, x15 #)
+
+{-# NOINLINE n15_a #-}
+n15_a :: TN15
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> ((Int,Int,Int,Int,Int),(Int,Int,Int,Int,Int),(Int,Int,Int,Int,Int))
+n15_a f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 =
+    case f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 of
+      (# y1, y2, y3, y4, y5, y6, y7, y8
+       , y9, y10, y11, y12, y13, y14, y15 #) ->
+        ( (I# y1, I# y2, I# y3, I# y4, I# y5)
+        , (I# y6, I# y7, I# y8, I# y9, I# y10)
+        , (I# y11, I# y12, I# y13, I# y14, I# y15) )
+
+-- ============================================================
+-- Pure Double# tuples
+-- ============================================================
+
+{-# NOINLINE d7 #-}
+d7 :: Double -> Double -> Double -> Double
+   -> Double -> Double -> Double
+   -> (# Double#, Double#, Double#, Double#
+      ,  Double#, Double#, Double# #)
+d7 (D# x1) (D# x2) (D# x3) (D# x4) (D# x5) (D# x6) (D# x7) =
+    (# x1, x2, x3, x4, x5, x6, x7 #)
+
+{-# NOINLINE d7_a #-}
+d7_a :: (Double -> Double -> Double -> Double
+     -> Double -> Double -> Double
+     -> (# Double#, Double#, Double#, Double#
+        ,  Double#, Double#, Double# #))
+     -> Double -> Double -> Double -> Double
+     -> Double -> Double -> Double
+     -> (Double, Double, Double, Double, Double, Double, Double)
+d7_a f x1 x2 x3 x4 x5 x6 x7 =
+    case f x1 x2 x3 x4 x5 x6 x7 of
+      (# y1, y2, y3, y4, y5, y6, y7 #) ->
+        (D# y1, D# y2, D# y3, D# y4, D# y5, D# y6, D# y7)
+
+-- ============================================================
+-- Pure Float# tuples
+-- ============================================================
+
+{-# NOINLINE fl7 #-}
+fl7 :: Float -> Float -> Float -> Float
+    -> Float -> Float -> Float
+    -> (# Float#, Float#, Float#, Float#
+       ,  Float#, Float#, Float# #)
+fl7 (F# x1) (F# x2) (F# x3) (F# x4) (F# x5) (F# x6) (F# x7) =
+    (# x1, x2, x3, x4, x5, x6, x7 #)
+
+{-# NOINLINE fl7_a #-}
+fl7_a :: (Float -> Float -> Float -> Float
+      -> Float -> Float -> Float
+      -> (# Float#, Float#, Float#, Float#
+         ,  Float#, Float#, Float# #))
+      -> Float -> Float -> Float -> Float
+      -> Float -> Float -> Float
+      -> (Float, Float, Float, Float, Float, Float, Float)
+fl7_a f x1 x2 x3 x4 x5 x6 x7 =
+    case f x1 x2 x3 x4 x5 x6 x7 of
+      (# y1, y2, y3, y4, y5, y6, y7 #) ->
+        (F# y1, F# y2, F# y3, F# y4, F# y5, F# y6, F# y7)
+
+-- ============================================================
+-- Pure Word64# tuples
+-- ============================================================
+
+{-# NOINLINE w7 #-}
+w7 :: Word64 -> Word64 -> Word64 -> Word64
+   -> Word64 -> Word64 -> Word64
+   -> (# Word64#, Word64#, Word64#, Word64#
+      ,  Word64#, Word64#, Word64# #)
+w7 (W64# x1) (W64# x2) (W64# x3) (W64# x4)
+   (W64# x5) (W64# x6) (W64# x7) =
+    (# x1, x2, x3, x4, x5, x6, x7 #)
+
+{-# NOINLINE w7_a #-}
+w7_a :: (Word64 -> Word64 -> Word64 -> Word64
+     -> Word64 -> Word64 -> Word64
+     -> (# Word64#, Word64#, Word64#, Word64#
+        ,  Word64#, Word64#, Word64# #))
+     -> Word64 -> Word64 -> Word64 -> Word64
+     -> Word64 -> Word64 -> Word64
+     -> (Word64, Word64, Word64, Word64, Word64, Word64, Word64)
+w7_a f x1 x2 x3 x4 x5 x6 x7 =
+    case f x1 x2 x3 x4 x5 x6 x7 of
+      (# y1, y2, y3, y4, y5, y6, y7 #) ->
+        (W64# y1, W64# y2, W64# y3, W64# y4,
+         W64# y5, W64# y6, W64# y7)
+
+-- ============================================================
+-- Mixed pointer + Int# tuples (interleaved)
+-- ============================================================
+
+-- 6 elements: 3 pointers + 3 Int#
+{-# NOINLINE mpi6 #-}
+mpi6 :: Int -> Int -> Int -> Int -> Int -> Int
+     -> (# Int, Int#, Int, Int#, Int, Int# #)
+mpi6 x1 (I# x2) x3 (I# x4) x5 (I# x6) =
+    (# x1, x2, x3, x4, x5, x6 #)
+
+{-# NOINLINE mpi6_a #-}
+mpi6_a :: (Int -> Int -> Int -> Int -> Int -> Int
+       -> (# Int, Int#, Int, Int#, Int, Int# #))
+       -> Int -> Int -> Int -> Int -> Int -> Int
+       -> (Int, Int, Int, Int, Int, Int)
+mpi6_a f x1 x2 x3 x4 x5 x6 =
+    case f x1 x2 x3 x4 x5 x6 of
+      (# y1, y2, y3, y4, y5, y6 #) ->
+        (y1, I# y2, y3, I# y4, y5, I# y6)
+
+-- ============================================================
+-- Mixed pointer + Double# tuples (interleaved)
+-- ============================================================
+
+-- 6 elements: 3 pointers + 3 Double#
+{-# NOINLINE mpd6 #-}
+mpd6 :: Int -> Double -> Int -> Double -> Int -> Double
+     -> (# Int, Double#, Int, Double#, Int, Double# #)
+mpd6 x1 (D# x2) x3 (D# x4) x5 (D# x6) =
+    (# x1, x2, x3, x4, x5, x6 #)
+
+{-# NOINLINE mpd6_a #-}
+mpd6_a :: (Int -> Double -> Int -> Double -> Int -> Double
+       -> (# Int, Double#, Int, Double#, Int, Double# #))
+       -> Int -> Double -> Int -> Double -> Int -> Double
+       -> (Int, Double, Int, Double, Int, Double)
+mpd6_a f x1 x2 x3 x4 x5 x6 =
+    case f x1 x2 x3 x4 x5 x6 of
+      (# y1, y2, y3, y4, y5, y6 #) ->
+        (y1, D# y2, y3, D# y4, y5, D# y6)
+
+-- ============================================================
+-- Mixed all types: pointer + Int# + Double# + Float#
+-- ============================================================
+
+-- 8 elements: 2 of each type, interleaved
+{-# NOINLINE mall8 #-}
+mall8 :: Int -> Int -> Double -> Float -> Int -> Int -> Double -> Float
+      -> (# Int, Int#, Double#, Float#, Int, Int#, Double#, Float# #)
+mall8 x1 (I# x2) (D# x3) (F# x4) x5 (I# x6) (D# x7) (F# x8) =
+    (# x1, x2, x3, x4, x5, x6, x7, x8 #)
+
+{-# NOINLINE mall8_a #-}
+mall8_a :: (Int -> Int -> Double -> Float -> Int -> Int -> Double -> Float
+        -> (# Int, Int#, Double#, Float#, Int, Int#, Double#, Float# #))
+        -> Int -> Int -> Double -> Float -> Int -> Int -> Double -> Float
+        -> (Int, Int, Double, Float, Int, Int, Double, Float)
+mall8_a f x1 x2 x3 x4 x5 x6 x7 x8 =
+    case f x1 x2 x3 x4 x5 x6 x7 x8 of
+      (# y1, y2, y3, y4, y5, y6, y7, y8 #) ->
+        (y1, I# y2, D# y3, F# y4, y5, I# y6, D# y7, F# y8)
+
+-- ============================================================
+-- Sub-word types: Word8#, Word16#, Word32#
+-- ============================================================
+
+{-# NOINLINE sub5 #-}
+sub5 :: Word8 -> Word16 -> Word32 -> Int -> Int
+     -> (# Word8#, Word16#, Word32#, Int#, Int #)
+sub5 (W8# x1) (W16# x2) (W32# x3) (I# x4) x5 =
+    (# x1, x2, x3, x4, x5 #)
+
+{-# NOINLINE sub5_a #-}
+sub5_a :: (Word8 -> Word16 -> Word32 -> Int -> Int
+       -> (# Word8#, Word16#, Word32#, Int#, Int #))
+       -> Word8 -> Word16 -> Word32 -> Int -> Int
+       -> (Word8, Word16, Word32, Int, Int)
+sub5_a f x1 x2 x3 x4 x5 =
+    case f x1 x2 x3 x4 x5 of
+      (# y1, y2, y3, y4, y5 #) ->
+        (W8# y1, W16# y2, W32# y3, I# y4, y5)
+
+-- ============================================================
+-- Void components: (# #) interleaved with real values
+-- ============================================================
+
+{-# NOINLINE vd6 #-}
+vd6 :: Int -> Int -> Int
+    -> (# Int, (# #), Int, (# #), Int#, (# #) #)
+vd6 x1 x2 (I# x3) = (# x1, (# #), x2, (# #), x3, (# #) #)
+
+{-# NOINLINE vd6_a #-}
+vd6_a :: (Int -> Int -> Int
+      -> (# Int, (# #), Int, (# #), Int#, (# #) #))
+      -> Int -> Int -> Int
+      -> (Int, Int, Int)
+vd6_a f x1 x2 x3 =
+    case f x1 x2 x3 of
+      (# y1, _, y3, _, y5, _ #) -> (y1, y3, I# y5)
+
+-- ============================================================
+-- Recursive step functions
+-- ============================================================
+
+-- 4-element mixed step: each element incremented by a different amount
+-- ptr: +1, Int#: +2, Double#: +0.5, Double#: +1.5
+{-# NOINLINE rec_step4 #-}
+rec_step4 :: Int -> Int -> Double -> Double
+          -> (# Int, Int#, Double#, Double# #)
+rec_step4 x1 (I# x2) (D# x3) (D# x4) =
+    (# x1 + 1, x2 +# 2#, x3 +## 0.5##, x4 +## 1.5## #)
+
+{-# NOINLINE rec_step4_a #-}
+rec_step4_a :: (Int -> Int -> Double -> Double
+            -> (# Int, Int#, Double#, Double# #))
+            -> Int -> Int -> Double -> Double
+            -> (Int, Int, Double, Double)
+rec_step4_a f x1 x2 x3 x4 =
+    case f x1 x2 x3 x4 of
+      (# y1, y2, y3, y4 #) -> (y1, I# y2, D# y3, D# y4)
+
+-- ============================================================
+-- Large tuples: boundary and stress sizes
+-- ============================================================
+
+-- 14 Int#: exactly stg_ctoi_t8 (last small frame, spill = 8 words on x86_64)
+type TN14 = Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> (# Int#, Int#, Int#, Int#, Int#, Int#, Int#
+            ,  Int#, Int#, Int#, Int#, Int#, Int#, Int# #)
+
+{-# NOINLINE n14 #-}
+n14 :: TN14
+n14 (I# x1) (I# x2) (I# x3) (I# x4) (I# x5) (I# x6) (I# x7)
+    (I# x8) (I# x9) (I# x10) (I# x11) (I# x12) (I# x13) (I# x14) =
+    (# x1, x2, x3, x4, x5, x6, x7
+    ,  x8, x9, x10, x11, x12, x13, x14 #)
+
+{-# NOINLINE n14_a #-}
+n14_a :: TN14
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> ((Int,Int,Int,Int,Int,Int,Int),(Int,Int,Int,Int,Int,Int,Int))
+n14_a f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 =
+    case f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 of
+      (# y1, y2, y3, y4, y5, y6, y7
+       , y8, y9, y10, y11, y12, y13, y14 #) ->
+        ( (I# y1, I# y2, I# y3, I# y4, I# y5, I# y6, I# y7)
+        , (I# y8, I# y9, I# y10, I# y11, I# y12, I# y13, I# y14) )
+
+-- 20 Int#: generic frame, large spill, all non-pointer
+type TN20 = Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> (# Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#
+            ,  Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int# #)
+
+{-# NOINLINE n20 #-}
+n20 :: TN20
+n20 (I# x1) (I# x2) (I# x3) (I# x4) (I# x5)
+    (I# x6) (I# x7) (I# x8) (I# x9) (I# x10)
+    (I# x11) (I# x12) (I# x13) (I# x14) (I# x15)
+    (I# x16) (I# x17) (I# x18) (I# x19) (I# x20) =
+    (# x1, x2, x3, x4, x5, x6, x7, x8, x9, x10
+    ,  x11, x12, x13, x14, x15, x16, x17, x18, x19, x20 #)
+
+{-# NOINLINE n20_a #-}
+n20_a :: TN20
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> ((Int,Int,Int,Int,Int),(Int,Int,Int,Int,Int)
+         ,(Int,Int,Int,Int,Int),(Int,Int,Int,Int,Int))
+n20_a f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
+        x11 x12 x13 x14 x15 x16 x17 x18 x19 x20 =
+    case f x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
+           x11 x12 x13 x14 x15 x16 x17 x18 x19 x20 of
+      (# y1, y2, y3, y4, y5, y6, y7, y8, y9, y10
+       , y11, y12, y13, y14, y15, y16, y17, y18, y19, y20 #) ->
+        ( (I# y1, I# y2, I# y3, I# y4, I# y5)
+        , (I# y6, I# y7, I# y8, I# y9, I# y10)
+        , (I# y11, I# y12, I# y13, I# y14, I# y15)
+        , (I# y16, I# y17, I# y18, I# y19, I# y20) )
+
+-- 32 Int#: very large generic frame, spill = 26 words
+type TN32 = Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+         -> (# Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#
+            ,  Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#
+            ,  Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int#
+            ,  Int#, Int#, Int#, Int#, Int#, Int#, Int#, Int# #)
+
+{-# NOINLINE n32 #-}
+n32 :: TN32
+n32 (I# x1)  (I# x2)  (I# x3)  (I# x4)  (I# x5)  (I# x6)  (I# x7)  (I# x8)
+    (I# x9)  (I# x10) (I# x11) (I# x12) (I# x13) (I# x14) (I# x15) (I# x16)
+    (I# x17) (I# x18) (I# x19) (I# x20) (I# x21) (I# x22) (I# x23) (I# x24)
+    (I# x25) (I# x26) (I# x27) (I# x28) (I# x29) (I# x30) (I# x31) (I# x32) =
+    (# x1,  x2,  x3,  x4,  x5,  x6,  x7,  x8
+    ,  x9,  x10, x11, x12, x13, x14, x15, x16
+    ,  x17, x18, x19, x20, x21, x22, x23, x24
+    ,  x25, x26, x27, x28, x29, x30, x31, x32 #)
+
+{-# NOINLINE n32_a #-}
+n32_a :: TN32
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int
+      -> ((Int,Int,Int,Int,Int,Int,Int,Int)
+         ,(Int,Int,Int,Int,Int,Int,Int,Int)
+         ,(Int,Int,Int,Int,Int,Int,Int,Int)
+         ,(Int,Int,Int,Int,Int,Int,Int,Int))
+n32_a f x1  x2  x3  x4  x5  x6  x7  x8
+        x9  x10 x11 x12 x13 x14 x15 x16
+        x17 x18 x19 x20 x21 x22 x23 x24
+        x25 x26 x27 x28 x29 x30 x31 x32 =
+    case f x1  x2  x3  x4  x5  x6  x7  x8
+           x9  x10 x11 x12 x13 x14 x15 x16
+           x17 x18 x19 x20 x21 x22 x23 x24
+           x25 x26 x27 x28 x29 x30 x31 x32 of
+      (# y1,  y2,  y3,  y4,  y5,  y6,  y7,  y8
+       , y9,  y10, y11, y12, y13, y14, y15, y16
+       , y17, y18, y19, y20, y21, y22, y23, y24
+       , y25, y26, y27, y28, y29, y30, y31, y32 #) ->
+        ( (I# y1,  I# y2,  I# y3,  I# y4,  I# y5,  I# y6,  I# y7,  I# y8)
+        , (I# y9,  I# y10, I# y11, I# y12, I# y13, I# y14, I# y15, I# y16)
+        , (I# y17, I# y18, I# y19, I# y20, I# y21, I# y22, I# y23, I# y24)
+        , (I# y25, I# y26, I# y27, I# y28, I# y29, I# y30, I# y31, I# y32) )
+
+-- 32 mixed: 8 groups of (Int, Int#, Double#, Float#), all register classes
+-- spill = 14 words (10 vanilla + 2 double + 2 float)
+type TMIX32 = Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> Int -> Int -> Double -> Float
+           -> (# Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float#
+              ,  Int, Int#, Double#, Float# #)
+
+{-# NOINLINE mix32 #-}
+mix32 :: TMIX32
+mix32 x1  (I# x2)  (D# x3)  (F# x4)
+      x5  (I# x6)  (D# x7)  (F# x8)
+      x9  (I# x10) (D# x11) (F# x12)
+      x13 (I# x14) (D# x15) (F# x16)
+      x17 (I# x18) (D# x19) (F# x20)
+      x21 (I# x22) (D# x23) (F# x24)
+      x25 (I# x26) (D# x27) (F# x28)
+      x29 (I# x30) (D# x31) (F# x32) =
+    (# x1,  x2,  x3,  x4
+    ,  x5,  x6,  x7,  x8
+    ,  x9,  x10, x11, x12
+    ,  x13, x14, x15, x16
+    ,  x17, x18, x19, x20
+    ,  x21, x22, x23, x24
+    ,  x25, x26, x27, x28
+    ,  x29, x30, x31, x32 #)
+
+{-# NOINLINE mix32_a #-}
+mix32_a :: TMIX32
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> Int -> Int -> Double -> Float
+        -> ((Int,Int,Double,Float)
+           ,(Int,Int,Double,Float)
+           ,(Int,Int,Double,Float)
+           ,(Int,Int,Double,Float)
+           ,(Int,Int,Double,Float)
+           ,(Int,Int,Double,Float)
+           ,(Int,Int,Double,Float)
+           ,(Int,Int,Double,Float))
+mix32_a f x1  x2  x3  x4  x5  x6  x7  x8
+          x9  x10 x11 x12 x13 x14 x15 x16
+          x17 x18 x19 x20 x21 x22 x23 x24
+          x25 x26 x27 x28 x29 x30 x31 x32 =
+    case f x1  x2  x3  x4  x5  x6  x7  x8
+           x9  x10 x11 x12 x13 x14 x15 x16
+           x17 x18 x19 x20 x21 x22 x23 x24
+           x25 x26 x27 x28 x29 x30 x31 x32 of
+      (# y1,  y2,  y3,  y4
+       , y5,  y6,  y7,  y8
+       , y9,  y10, y11, y12
+       , y13, y14, y15, y16
+       , y17, y18, y19, y20
+       , y21, y22, y23, y24
+       , y25, y26, y27, y28
+       , y29, y30, y31, y32 #) ->
+        ( (y1,  I# y2,  D# y3,  F# y4)
+        , (y5,  I# y6,  D# y7,  F# y8)
+        , (y9,  I# y10, D# y11, F# y12)
+        , (y13, I# y14, D# y15, F# y16)
+        , (y17, I# y18, D# y19, F# y20)
+        , (y21, I# y22, D# y23, F# y24)
+        , (y25, I# y26, D# y27, F# y28)
+        , (y29, I# y30, D# y31, F# y32) )


=====================================
testsuite/tests/bytecode/tuplestress/Obj.hs
=====================================
@@ -0,0 +1,11 @@
+{-# LANGUAGE CPP, UnboxedTuples, MagicHash #-}
+{-# OPTIONS_GHC -fobject-code #-}
+
+#include "MachDeps.h"
+
+module Obj where
+
+import GHC.Exts
+import GHC.Word
+
+#include "Common.hs-incl"


=====================================
testsuite/tests/bytecode/tuplestress/TupleStress.hs
=====================================
@@ -0,0 +1,631 @@
+{-# LANGUAGE UnboxedTuples, MagicHash #-}
+{-# OPTIONS_GHC -fbyte-code #-}
+
+{-
+  Stress test for unboxed tuples in the bytecode interpreter.
+
+  Tests various sized tuples with different element types,
+  focusing on converting tuples between native code and
+  interpreted code in all four combinations:
+    ByteCode producer x ByteCode consumer
+    ByteCode producer x Object consumer
+    Object producer x ByteCode consumer
+    Object producer x Object consumer
+
+  See Note [Unboxed tuple stress test] in Common.hs-incl.
+ -}
+
+module Main where
+
+import qualified Obj      as O
+import qualified ByteCode as B
+
+import GHC.Exts
+import GHC.Word
+import Control.Exception (try, evaluate, catch, SomeException)
+import Control.Concurrent
+import System.IO.Unsafe (unsafePerformIO)
+
+main :: IO ()
+main = do
+
+    -- ========================================================
+    -- Pure tuple tests: all 4 combinations (BB/BO/OB/OO)
+    -- ========================================================
+
+    testX "p7"
+          B.p7_a O.p7_a
+          B.p7   O.p7
+          (\f -> f (1::Int) 2 3 4 5 6 7)
+
+    testX "n2"
+          B.n2_a O.n2_a
+          B.n2   O.n2
+          (\f -> f 1 2)
+
+    testX "n7"
+          B.n7_a O.n7_a
+          B.n7   O.n7
+          (\f -> f 1 2 3 4 5 6 7)
+
+    testX "n15"
+          B.n15_a O.n15_a
+          B.n15   O.n15
+          (\f -> f 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+
+    testX "d7"
+          B.d7_a O.d7_a
+          B.d7   O.d7
+          (\f -> f 1.5 2.5 3.5 4.5 5.5 6.5 7.5)
+
+    testX "fl7"
+          B.fl7_a O.fl7_a
+          B.fl7   O.fl7
+          (\f -> f 1.25 2.25 3.25 4.25 5.25 6.25 7.25)
+
+    testX "w7"
+          B.w7_a O.w7_a
+          B.w7   O.w7
+          (\f -> f 100 200 300 400 500 600 700)
+
+    testX "mpi6"
+          B.mpi6_a O.mpi6_a
+          B.mpi6   O.mpi6
+          (\f -> f 1 2 3 4 5 6)
+
+    testX "mpd6"
+          B.mpd6_a O.mpd6_a
+          B.mpd6   O.mpd6
+          (\f -> f 1 1.5 2 2.5 3 3.5)
+
+    testX "mall8"
+          B.mall8_a O.mall8_a
+          B.mall8   O.mall8
+          (\f -> f 1 2 3.0 4.0 5 6 7.0 8.0)
+
+    testX "sub5"
+          B.sub5_a O.sub5_a
+          B.sub5   O.sub5
+          (\f -> f 42 1000 70000 99 100)
+
+    testX "vd6"
+          B.vd6_a O.vd6_a
+          B.vd6   O.vd6
+          (\f -> f 11 22 33)
+
+    -- 14 Int#: exactly at stg_ctoi_t8 boundary (last small frame)
+    testX "n14"
+          B.n14_a O.n14_a
+          B.n14   O.n14
+          (\f -> f 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+    -- 20 Int#: generic frame, all non-pointer
+    testX "n20"
+          B.n20_a O.n20_a
+          B.n20   O.n20
+          (\f -> f 1 2 3 4 5 6 7 8 9 10
+                   11 12 13 14 15 16 17 18 19 20)
+
+    -- 32 Int#: very large generic frame (spill = 26 words)
+    testX "n32"
+          B.n32_a O.n32_a
+          B.n32   O.n32
+          (\f -> f 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+                   17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    -- 32 mixed (ptr+Int#+Double#+Float#): all register classes (spill = 14)
+    testX "mix32"
+          B.mix32_a O.mix32_a
+          B.mix32   O.mix32
+          (\f -> f 1 2 3.0 4.0  5 6 7.0 8.0  9 10 11.0 12.0
+                   13 14 15.0 16.0  17 18 19.0 20.0  21 22 23.0 24.0
+                   25 26 27.0 28.0  29 30 31.0 32.0)
+
+    -- ========================================================
+    -- Loop tests: repeated calls to detect state corruption
+    -- ========================================================
+
+    -- Pointer 7-tuple loop, alternating B->O and O->B directions
+    let loop_p7_ok = and
+          [ (if even i then O.p7_a B.p7 else B.p7_a O.p7)
+            i (i+1) (i+2) (i+3) (i+4) (i+5) (i+6)
+            == (i, i+1, i+2, i+3, i+4, i+5, i+6)
+          | i <- [0 :: Int, 7 .. 700]
+          ]
+    putStrLn $ "loop_p7 " ++ show loop_p7_ok
+
+    -- Mixed ptr+Double# loop
+    let loop_mpd_ok = and
+          [ O.mpd6_a B.mpd6 i (fromIntegral i + 0.5)
+            (i+1) (fromIntegral (i+1) + 0.5)
+            (i+2) (fromIntegral (i+2) + 0.5)
+            == ( i, fromIntegral i + 0.5
+               , i+1, fromIntegral (i+1) + 0.5
+               , i+2, fromIntegral (i+2) + 0.5 )
+          | i <- [0 :: Int, 3 .. 300]
+          ]
+    putStrLn $ "loop_mpd " ++ show loop_mpd_ok
+
+    -- 32-element Int# loop: exercises very large generic frame
+    let loop_n32_ok = and
+          [ B.n32_a O.n32
+            i (i+1) (i+2) (i+3) (i+4) (i+5) (i+6) (i+7)
+            (i+8) (i+9) (i+10) (i+11) (i+12) (i+13) (i+14) (i+15)
+            (i+16) (i+17) (i+18) (i+19) (i+20) (i+21) (i+22) (i+23)
+            (i+24) (i+25) (i+26) (i+27) (i+28) (i+29) (i+30) (i+31)
+            == ( (i,i+1,i+2,i+3,i+4,i+5,i+6,i+7)
+               , (i+8,i+9,i+10,i+11,i+12,i+13,i+14,i+15)
+               , (i+16,i+17,i+18,i+19,i+20,i+21,i+22,i+23)
+               , (i+24,i+25,i+26,i+27,i+28,i+29,i+30,i+31) )
+          | i <- [0 :: Int, 32 .. 3200]
+          ]
+    putStrLn $ "loop_n32 " ++ show loop_n32_ok
+
+    -- ========================================================
+    -- Chain tests: output of one call feeds into the next
+    -- ========================================================
+
+    -- 7-tuple chain with arithmetic
+    let (c1,c2,c3,c4,c5,c6,c7) = O.p7_a B.p7 (10::Int) 20 30 40 50 60 70
+    putStrLn $ "chain_arith " ++ show
+      (B.p7_a O.p7 (c1+c7) (c2+c6) (c3+c5) c4 (c5+c3) (c6+c2) (c7+c1))
+
+    -- 100 alternating swaps across bytecode/native boundary
+    putStrLn $ "swap_stress " ++ show (swapStress (100 :: Int) (1 :: Int, 2))
+
+    -- ========================================================
+    -- Recursive tuple tests
+    -- ========================================================
+
+    -- 4-element mixed accumulation: 50 steps alternating B/O
+    -- rec_step4 (x1,x2,x3,x4) = (x1+1, x2+2, x3+0.5, x4+1.5)
+    -- After 50 steps from (0,0,0,0): (50, 100, 25.0, 75.0)
+    let recMixed x1 x2 x3 x4 0 = (x1, x2, x3, x4)
+        recMixed x1 x2 x3 x4 n
+          | even n    = let (a,b,c,d) = B.rec_step4_a O.rec_step4 x1 x2 x3 x4
+                        in recMixed a b c d (n-1)
+          | otherwise = let (a,b,c,d) = O.rec_step4_a B.rec_step4 x1 x2 x3 x4
+                        in recMixed a b c d (n-1)
+    putStrLn $ "rec_mixed " ++ show
+      (recMixed (0::Int) (0::Int) (0.0::Double) (0.0::Double) (50::Int))
+
+    -- Fibonacci via 2-tuples, 30 levels crossing boundaries at each level
+    let fibCross 0 = B.n2_a O.n2 0 1
+        fibCross 1 = O.n2_a B.n2 1 0
+        fibCross n =
+          let (a, b) = fibCross (n-1)
+          in if even n
+             then B.n2_a O.n2 (a+b) a
+             else O.n2_a B.n2 (a+b) a
+    putStrLn $ "fib_cross " ++ show (fst (fibCross (30::Int)))
+
+    -- ========================================================
+    -- Exception tests: verify stack state is restored
+    -- ========================================================
+
+    -- Exception in 7-element Int# tuple (small frame), B->O
+    do r <- tryEval (B.n7_a O.n7 (error "exc") 2 3 4 5 6 7)
+       let threw = case r of { Left _ -> True; Right _ -> False }
+       let ok = B.n7_a O.n7 1 2 3 4 5 6 7 == (1,2,3,4,5,6,7)
+       putStrLn $ "exc_n7_bo " ++ show (threw && ok)
+
+    -- Exception in 15-element Int# tuple (generic frame), B->O
+    do r <- tryEval (B.n15_a O.n15 (error "exc") 2 3 4 5 6 7 8
+                     9 10 11 12 13 14 15)
+       let threw = case r of { Left _ -> True; Right _ -> False }
+       let ok = B.n15_a O.n15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+                == ((1,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15))
+       putStrLn $ "exc_n15_bo " ++ show (threw && ok)
+
+    -- Exception in mixed ptr+Double# tuple, B->O
+    do r <- tryEval (B.mpd6_a O.mpd6 1 (error "exc") 2 2.5 3 3.5)
+       let threw = case r of { Left _ -> True; Right _ -> False }
+       let ok = B.mpd6_a O.mpd6 1 1.5 2 2.5 3 3.5 == (1,1.5,2,2.5,3,3.5)
+       putStrLn $ "exc_mpd_bo " ++ show (threw && ok)
+
+    -- Repeated exceptions: throw 50 times, then verify recovery
+    do let throwOnce = tryEval (B.n7_a O.n7 (error "exc") 2 3 4 5 6 7)
+       results <- sequence [throwOnce | _ <- [1..50::Int]]
+       let allThrew = all (\r -> case r of { Left _ -> True; Right _ -> False })
+                          results
+       let final = O.n7_a B.n7 10 20 30 40 50 60 70
+       putStrLn $ "exc_repeat " ++ show (allThrew && final == (10,20,30,40,50,60,70))
+
+    -- Exception at stg_ctoi_t8 boundary (14 Int#, last small frame)
+    do r <- tryEval (B.n14_a O.n14 (error "exc") 2 3 4 5 6 7
+                     8 9 10 11 12 13 14)
+       let threw = case r of { Left _ -> True; Right _ -> False }
+       let ok = B.n14_a O.n14 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+                == ((1,2,3,4,5,6,7),(8,9,10,11,12,13,14))
+       putStrLn $ "exc_n14_bo " ++ show (threw && ok)
+
+    -- Exception with 32-element Int# tuple (very large generic frame)
+    do r <- tryEval (B.n32_a O.n32 (error "exc") 2 3 4 5 6 7 8
+                     9 10 11 12 13 14 15 16
+                     17 18 19 20 21 22 23 24
+                     25 26 27 28 29 30 31 32)
+       let threw = case r of { Left _ -> True; Right _ -> False }
+       let ok = B.n32_a O.n32 1 2 3 4 5 6 7 8
+                9 10 11 12 13 14 15 16
+                17 18 19 20 21 22 23 24
+                25 26 27 28 29 30 31 32
+                == ((1,2,3,4,5,6,7,8),(9,10,11,12,13,14,15,16),
+                    (17,18,19,20,21,22,23,24),(25,26,27,28,29,30,31,32))
+       putStrLn $ "exc_n32_bo " ++ show (threw && ok)
+
+    -- ========================================================
+    -- Nested generic ctoi exception tests
+    -- ========================================================
+    -- Tests that exception unwinding correctly restores
+    -- ctoi_tuple_spill_words when passing through multiple
+    -- stg_ctoi_t frames.
+    -- See Note [GHCi unboxed tuples stack spills] in StgMiscClosures.cmm.
+
+    -- Exception through 2 nested generic ctoi frames (n15 inside n20).
+    do let l1 = case B.n15_a O.n15 (error "exc") 2 3 4 5 6 7 8
+                     9 10 11 12 13 14 15
+                of ((a,_,_,_,_),_,_) -> a
+       r <- tryEval (B.n20_a O.n20 l1 2 3 4 5 6 7 8 9 10
+                     11 12 13 14 15 16 17 18 19 20)
+       let threw = case r of { Left _ -> True; Right _ -> False }
+       let ok = B.n20_a O.n20 1 2 3 4 5 6 7 8 9 10
+                11 12 13 14 15 16 17 18 19 20
+                == ((1,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15),(16,17,18,19,20))
+       putStrLn $ "exc_nested_2gen " ++ show (threw && ok)
+
+    -- Exception caught between 2 generic ctoi frames.
+    -- A catch handler sits between ctoi(n20,spill=14) and ctoi(n15,spill=9).
+    -- The error in O.n15 unwinds through ctoi(n15), which must restore
+    -- ctoi_tuple_spill_words to the outer frame's spill count before
+    -- hitting the catch. If the restore is missing, ctoi(n20) reads the
+    -- wrong number of spill words and corrupts the stack.
+    do let inner_result :: Int
+           inner_result = unsafePerformIO $
+             catch (evaluate (case B.n15_a O.n15 (error "exc") 2 3 4 5 6 7 8
+                                   9 10 11 12 13 14 15
+                              of ((a,_,_,_,_),_,_) -> a))
+                   (const (return 99) :: SomeException -> IO Int)
+       result <- evaluate (B.n20_a O.n20 inner_result 2 3 4 5 6 7 8 9 10
+                           11 12 13 14 15 16 17 18 19 20)
+       putStrLn $ "exc_catch_between " ++ show
+         (result == ((99,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15),(16,17,18,19,20)))
+
+    -- ========================================================
+    -- Async exception / AP_STACK replay tests
+    -- ========================================================
+
+    apStackTest "async_n7" (42 :: Int)
+        (\b -> B.n7_a O.n7 b 2 3 4 5 6 7)
+        (42,2,3,4,5,6,7)
+
+    -- AP_STACK replayed in a third thread (not the killer, not the killed)
+    do entered <- newEmptyMVar
+       gate <- newEmptyMVar
+       resultVar <- newEmptyMVar
+       let thunk = B.n7_a O.n7
+               (unsafePerformIO $ do
+                   _ <- tryPutMVar entered ()
+                   takeMVar gate)
+               2 3 4 5 6 7
+       tid <- forkIO $ do
+           _ <- tryEval thunk
+           return ()
+       takeMVar entered
+       killThread tid
+       threadDelay 10000
+       putMVar gate 42
+       _ <- forkIO $ do
+           result <- evaluate thunk
+           putMVar resultVar result
+       result <- takeMVar resultVar
+       putStrLn $ "async_other " ++ show (result == (42,2,3,4,5,6,7))
+
+    -- AP_STACK at stg_ctoi_t8 boundary (14 Int#, last small frame)
+    apStackTest "async_n14" (42 :: Int)
+        (\b -> B.n14_a O.n14 b 2 3 4 5 6 7 8 9 10 11 12 13 14)
+        ((42,2,3,4,5,6,7),(8,9,10,11,12,13,14))
+
+    -- Nested async: interrupt the AP_STACK replay itself.
+    -- Round 1: blocks on arg1; Round 2: blocks on arg2; Round 3: completes
+    do entered1 <- newEmptyMVar
+       entered2 <- newEmptyMVar
+       gate1 <- newEmptyMVar
+       gate2 <- newEmptyMVar
+       let thunk = B.n7_a O.n7
+               (unsafePerformIO $ do
+                   _ <- tryPutMVar entered1 ()
+                   takeMVar gate1)
+               (unsafePerformIO $ do
+                   _ <- tryPutMVar entered2 ()
+                   takeMVar gate2)
+               3 4 5 6 7
+       tid1 <- forkIO $ do
+           _ <- tryEval thunk
+           return ()
+       takeMVar entered1
+       killThread tid1
+       threadDelay 10000
+       putMVar gate1 100
+       tid2 <- forkIO $ do
+           _ <- tryEval thunk
+           return ()
+       takeMVar entered2
+       killThread tid2
+       threadDelay 10000
+       putMVar gate2 200
+       result <- evaluate thunk
+       putStrLn $ "async_nested " ++ show (result == (100,200,3,4,5,6,7))
+
+    -- Async + sync exception combo: async replay, then sync throw, then normal
+    do entered <- newEmptyMVar
+       gate <- newEmptyMVar
+       let thunk = B.n7_a O.n7
+               (unsafePerformIO $ do
+                   _ <- tryPutMVar entered ()
+                   takeMVar gate)
+               2 3 4 5 6 7
+       tid <- forkIO $ do
+           _ <- tryEval thunk
+           return ()
+       takeMVar entered
+       killThread tid
+       threadDelay 10000
+       putMVar gate 42
+       rAsync <- evaluate thunk
+       rSync <- tryEval (B.n7_a O.n7 (error "sync") 2 3 4 5 6 7)
+       let syncThrew = case rSync of { Left _ -> True; Right _ -> False }
+       let rNormal = O.n7_a B.n7 10 20 30 40 50 60 70
+       putStrLn $ "async_exc_combo " ++ show
+         (rAsync == (42,2,3,4,5,6,7) && syncThrew &&
+          rNormal == (10,20,30,40,50,60,70))
+
+    -- Async loop: create, kill, and replay AP_STACKs 20 times
+    do let oneRound i = do
+               entered <- newEmptyMVar
+               gate <- newEmptyMVar
+               let thunk = B.n7_a O.n7
+                       (unsafePerformIO $ do
+                           _ <- tryPutMVar entered ()
+                           takeMVar gate)
+                       (i+1) (i+2) (i+3) (i+4) (i+5) (i+6)
+               tid <- forkIO $ do
+                   _ <- tryEval thunk
+                   return ()
+               takeMVar entered
+               killThread tid
+               threadDelay 5000
+               putMVar gate i
+               r <- tryEval thunk
+               return (isRight (i,i+1,i+2,i+3,i+4,i+5,i+6) r)
+       results <- mapM oneRound [1000 :: Int, 1001 .. 1019]
+       putStrLn $ "async_loop " ++ show (and results)
+
+    -- ========================================================
+    -- Multi-ctoi AP_STACK tests
+    -- ========================================================
+
+    -- 2 ctoi frames: B.n2_a->O.n2 inside B.n7_a->O.n7
+    apStackTest "async_2ctoi" (42 :: Int)
+        (\b -> let l1 = case B.n2_a O.n2 b 2 of (a, _) -> a
+               in B.n7_a O.n7 l1 20 30 40 50 60 70)
+        (42,20,30,40,50,60,70)
+
+    -- 3 ctoi frames with different sizes:
+    --   innermost: stg_ctoi_t0 (n2, spill=0)
+    --   middle:    stg_ctoi_t1 (n7, spill=1)
+    --   outermost: stg_ctoi_t  (n15, generic, spill=9)
+    apStackTest "async_3ctoi" (42 :: Int)
+        (\b -> let l1 = case B.n2_a O.n2 b 2 of (a, _) -> a
+                   l2 = case B.n7_a O.n7 l1 2 3 4 5 6 7 of (a,_,_,_,_,_,_) -> a
+               in B.n15_a O.n15 l2 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+        ((42,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15))
+
+    -- Nested async with multi-ctoi: 2 rounds of interruption,
+    -- each with different numbers of ctoi frames on the stack.
+    do entered1 <- newEmptyMVar
+       entered2 <- newEmptyMVar
+       gate1 <- newEmptyMVar
+       gate2 <- newEmptyMVar
+       let blocking1 = unsafePerformIO $ do
+               _ <- tryPutMVar entered1 ()
+               takeMVar gate1
+       let blocking2 = unsafePerformIO $ do
+               _ <- tryPutMVar entered2 ()
+               takeMVar gate2
+       let l1 = case B.n2_a O.n2 blocking1 2 of (a, _) -> a
+       let thunk = B.n7_a O.n7 l1 blocking2 3 4 5 6 7
+       tid1 <- forkIO $ do
+           _ <- tryEval thunk
+           return ()
+       takeMVar entered1
+       killThread tid1
+       threadDelay 10000
+       putMVar gate1 100
+       tid2 <- forkIO $ do
+           _ <- tryEval thunk
+           return ()
+       takeMVar entered2
+       killThread tid2
+       threadDelay 10000
+       putMVar gate2 200
+       result <- evaluate thunk
+       putStrLn $ "async_nested_ctoi " ++ show
+         (result == (100,200,3,4,5,6,7))
+
+    -- ========================================================
+    -- All-generic multi-ctoi AP_STACK tests (32+ element tuples)
+    -- ========================================================
+
+    -- Single 32-element generic ctoi frame (spill = 26)
+    apStackTest "async_n32" (42 :: Int)
+        (\b -> B.n32_a O.n32 b 2 3 4 5 6 7 8
+               9 10 11 12 13 14 15 16
+               17 18 19 20 21 22 23 24
+               25 26 27 28 29 30 31 32)
+        ((42,2,3,4,5,6,7,8),(9,10,11,12,13,14,15,16),
+         (17,18,19,20,21,22,23,24),(25,26,27,28,29,30,31,32))
+
+    -- 2 generic ctoi frames: n20 (spill=14) inside n32 (spill=26)
+    apStackTest "async_2gen32" (42 :: Int)
+        (\b -> let l1 = case B.n20_a O.n20 b 2 3 4 5 6 7 8 9 10
+                             11 12 13 14 15 16 17 18 19 20
+                        of ((a,_,_,_,_),_,_,_) -> a
+               in B.n32_a O.n32 l1 2 3 4 5 6 7 8
+                  9 10 11 12 13 14 15 16
+                  17 18 19 20 21 22 23 24
+                  25 26 27 28 29 30 31 32)
+        ((42,2,3,4,5,6,7,8),(9,10,11,12,13,14,15,16),
+         (17,18,19,20,21,22,23,24),(25,26,27,28,29,30,31,32))
+
+    -- 2 generic ctoi frames with mixed types: n15 (spill=9) inside mix32 (spill=14)
+    apStackTest "async_gen_mix" (42 :: Int)
+        (\b -> let l1 = case B.n15_a O.n15 b 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+                        of ((a,_,_,_,_),_,_) -> a
+               in B.mix32_a O.mix32
+                  1 l1 3.0 4.0  5 6 7.0 8.0  9 10 11.0 12.0
+                  13 14 15.0 16.0  17 18 19.0 20.0  21 22 23.0 24.0
+                  25 26 27.0 28.0  29 30 31.0 32.0)
+        ((1,42,3.0,4.0),(5,6,7.0,8.0),(9,10,11.0,12.0),
+         (13,14,15.0,16.0),(17,18,19.0,20.0),(21,22,23.0,24.0),
+         (25,26,27.0,28.0),(29,30,31.0,32.0))
+
+    -- ========================================================
+    -- AP_STACK replay with non-zero base TSO state
+    -- ========================================================
+    -- These tests replay AP_STACKs inside an outer generic ctoi frame,
+    -- so restoreStackInvariants must patch the saved old_spill in the
+    -- replayed segment to match the outer frame's spill count.
+    -- See Note [GHCi unboxed tuples stack spills] in StgMiscClosures.cmm.
+
+    -- AP_STACK with generic ctoi(n15, spill=9) replayed inside
+    -- ctoi(n20, spill=14). If restoreStackInvariants doesn't patch
+    -- n15's old_spill to 14, n15's return restores TSO to 0 (from the
+    -- killed thread's context), and ctoi(n20) reads 0 spill words
+    -- instead of 14 -> stack corruption.
+    do entered <- newEmptyMVar
+       gate <- newEmptyMVar
+       let innerThunk = B.n15_a O.n15
+               (unsafePerformIO $ do
+                   _ <- tryPutMVar entered ()
+                   takeMVar gate)
+               2 3 4 5 6 7 8 9 10 11 12 13 14 15
+       tid <- forkIO $ do
+           _ <- tryEval innerThunk
+           return ()
+       takeMVar entered
+       killThread tid
+       threadDelay 10000
+       putMVar gate 42
+       -- Force innerThunk (AP_STACK replay) inside generic ctoi(n20)
+       let l1 = case innerThunk of ((a,_,_,_,_),_,_) -> a
+       result <- evaluate (B.n20_a O.n20 l1 2 3 4 5 6 7 8 9 10
+                           11 12 13 14 15 16 17 18 19 20)
+       putStrLn $ "async_replay_base " ++ show
+         (result == ((42,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15),(16,17,18,19,20)))
+
+    -- AP_STACK with 2 generic ctoi frames (n15+n20) replayed inside
+    -- ctoi(n32, spill=26). restoreStackInvariants must patch the outermost
+    -- replayed frame's (n20) old_spill to 26.
+    do entered <- newEmptyMVar
+       gate <- newEmptyMVar
+       let blocking = unsafePerformIO $ do
+               _ <- tryPutMVar entered ()
+               takeMVar gate
+       let l1 = case B.n15_a O.n15 blocking 2 3 4 5 6 7 8
+                     9 10 11 12 13 14 15
+                of ((a,_,_,_,_),_,_) -> a
+       let innerThunk = B.n20_a O.n20 l1 2 3 4 5 6 7 8 9 10
+                        11 12 13 14 15 16 17 18 19 20
+       tid <- forkIO $ do
+           _ <- tryEval innerThunk
+           return ()
+       takeMVar entered
+       killThread tid
+       threadDelay 10000
+       putMVar gate 42
+       -- Force inside generic ctoi(n32, spill=26); replays 2 inner frames
+       let l2 = case innerThunk of ((a,_,_,_,_),_,_,_) -> a
+       result <- evaluate (B.n32_a O.n32 l2 2 3 4 5 6 7 8
+                           9 10 11 12 13 14 15 16
+                           17 18 19 20 21 22 23 24
+                           25 26 27 28 29 30 31 32)
+       putStrLn $ "async_replay_2inner " ++ show
+         (result == ((42,2,3,4,5,6,7,8),(9,10,11,12,13,14,15,16),
+                     (17,18,19,20,21,22,23,24),(25,26,27,28,29,30,31,32)))
+
+    -- AP_STACK replay inside ctoi(n20), where the replay triggers an
+    -- exception caught between the restored ctoi(n15) and outer ctoi(n20).
+    -- Tests restoreStackInvariants patching AND exception unwinding through
+    -- the patched frame: if n15's old_spill is wrong, the unwind restores
+    -- the wrong value, and ctoi(n20) reads the wrong spill count.
+    do entered <- newEmptyMVar
+       gate <- newEmptyMVar
+       let innerThunk = B.n15_a O.n15
+               (unsafePerformIO $ do
+                   _ <- tryPutMVar entered ()
+                   takeMVar gate)
+               2 3 4 5 6 7 8 9 10 11 12 13 14 15
+       tid <- forkIO $ do
+           _ <- tryEval innerThunk
+           return ()
+       takeMVar entered
+       killThread tid
+       threadDelay 10000
+       putMVar gate (error "exc")
+       -- Force inside ctoi(n20); replay throws, caught between frames
+       let l1 :: Int
+           l1 = unsafePerformIO $
+             catch (evaluate innerThunk >>= \r ->
+                      case r of ((a,_,_,_,_),_,_) -> return a)
+                   (const (return 99) :: SomeException -> IO Int)
+       result <- evaluate (B.n20_a O.n20 l1 2 3 4 5 6 7 8 9 10
+                           11 12 13 14 15 16 17 18 19 20)
+       putStrLn $ "async_replay_catch " ++ show
+         (result == ((99,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15),(16,17,18,19,20)))
+
+-- ========================================================
+-- Helpers
+-- ========================================================
+
+swapStress :: Int -> (Int, Int) -> (Int, Int)
+swapStress n (a, b)
+    | n <= 0    = (a, b)
+    | even n    = swapStress (n-1) (B.p2_a O.p2 b a)
+    | otherwise = swapStress (n-1) (O.p2_a B.p2 b a)
+
+testX :: (Eq a, Show a)
+      => String -> (p -> t) -> (p -> t) -> p -> p -> (t -> a) -> IO ()
+testX msg a1 a2 b1 b2 ap =
+    let (r:rs) = [ap (f g) | f <- [a1,a2], g <- [b1,b2]]
+    in putStrLn (msg ++ " " ++ show (all (==r) rs) ++ " " ++ show r)
+
+-- | Evaluate an expression and catch any exception.
+tryEval :: a -> IO (Either SomeException a)
+tryEval x = try (evaluate x)
+
+-- | Check that an Either SomeException result is Right with the expected value.
+isRight :: Eq a => a -> Either SomeException a -> Bool
+isRight expected (Right v) = v == expected
+isRight _        (Left _)  = False
+
+-- | Run an AP_STACK replay test. @mkThunk@ receives a blocking value (backed
+-- by an MVar) and should build a thunk that forces it during evaluation.
+-- The thunk is evaluated in a thread that gets killed (creating an AP_STACK),
+-- then the MVar is filled with @unblockVal@ and the AP_STACK is replayed.
+apStackTest :: Eq a => String -> b -> (b -> a) -> a -> IO ()
+apStackTest name unblockVal mkThunk expected = do
+    entered <- newEmptyMVar
+    gate <- newEmptyMVar
+    let blocking = unsafePerformIO $ do
+            _ <- tryPutMVar entered ()
+            takeMVar gate
+    let thunk = mkThunk blocking
+    tid <- forkIO $ do
+        _ <- tryEval thunk
+        return ()
+    takeMVar entered
+    killThread tid
+    threadDelay 10000
+    putMVar gate unblockVal
+    r <- tryEval thunk
+    putStrLn $ name ++ " " ++ show (isRight expected r)


=====================================
testsuite/tests/bytecode/tuplestress/TupleStress.stdout
=====================================
@@ -0,0 +1,46 @@
+p7 True (1,2,3,4,5,6,7)
+n2 True (1,2)
+n7 True (1,2,3,4,5,6,7)
+n15 True ((1,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15))
+d7 True (1.5,2.5,3.5,4.5,5.5,6.5,7.5)
+fl7 True (1.25,2.25,3.25,4.25,5.25,6.25,7.25)
+w7 True (100,200,300,400,500,600,700)
+mpi6 True (1,2,3,4,5,6)
+mpd6 True (1,1.5,2,2.5,3,3.5)
+mall8 True (1,2,3.0,4.0,5,6,7.0,8.0)
+sub5 True (42,1000,70000,99,100)
+vd6 True (11,22,33)
+n14 True ((1,2,3,4,5,6,7),(8,9,10,11,12,13,14))
+n20 True ((1,2,3,4,5),(6,7,8,9,10),(11,12,13,14,15),(16,17,18,19,20))
+n32 True ((1,2,3,4,5,6,7,8),(9,10,11,12,13,14,15,16),(17,18,19,20,21,22,23,24),(25,26,27,28,29,30,31,32))
+mix32 True ((1,2,3.0,4.0),(5,6,7.0,8.0),(9,10,11.0,12.0),(13,14,15.0,16.0),(17,18,19.0,20.0),(21,22,23.0,24.0),(25,26,27.0,28.0),(29,30,31.0,32.0))
+loop_p7 True
+loop_mpd True
+loop_n32 True
+chain_arith (80,80,80,40,80,80,80)
+swap_stress (1,2)
+rec_mixed (50,100,25.0,75.0)
+fib_cross 832040
+exc_n7_bo True
+exc_n15_bo True
+exc_mpd_bo True
+exc_repeat True
+exc_n14_bo True
+exc_n32_bo True
+exc_nested_2gen True
+exc_catch_between True
+async_n7 True
+async_other True
+async_n14 True
+async_nested True
+async_exc_combo True
+async_loop True
+async_2ctoi True
+async_3ctoi True
+async_nested_ctoi True
+async_n32 True
+async_2gen32 True
+async_gen_mix True
+async_replay_base True
+async_replay_2inner True
+async_replay_catch True


=====================================
testsuite/tests/bytecode/tuplestress/all.T
=====================================
@@ -0,0 +1,10 @@
+test('TupleStress',
+     [ extra_files(['Obj.hs', 'ByteCode.hs', 'Common.hs-incl']),
+       req_interp,
+       req_bco,
+       only_ways(ghci_ways),
+       extra_ways(ghci_ways),
+     ],
+     compile_and_run,
+     ['']
+    )


=====================================
utils/deriveConstants/Main.hs
=====================================
@@ -476,6 +476,7 @@ wanteds os = concat
           ,closureField  Both "StgTSO"      "alloc_limit"
           ,closureField_ Both "StgTSO_cccs" "StgTSO" "prof.cccs"
           ,closureField  Both "StgTSO"      "stackobj"
+          ,closureField  Both "StgTSO"      "ctoi_tuple_spill_words"
 
           ,closureField       Both "StgStack" "sp"
           ,closureFieldOffset Both "StgStack" "stack"



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/fe6e76c5e0155b9b34948957476b0bb...

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/fe6e76c5e0155b9b34948957476b0bb...
You're receiving this email because of your account on gitlab.haskell.org.

    

Marge Bot (＠marge-bot)

tags

participants (1)