GitLab

recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC

Commits:

4a5afb4e

by Alex Washburn at 2025-08-12T14:43:24-04:00

Correcting LLVM linking of Intel BMI intrinsics pdep{8,16} and pext{8,16}.

This patch fixes #26045.

The LLVM interface does not expose bindings to:

  - llvm.x86.bmi.pdep.8
  - llvm.x86.bmi.pdep.16
  - llvm.x86.bmi.pext.8
  - llvm.x86.bmi.pext.16

So calls are instead made to llvm.x86.bmi.{pdep,pext}.32 in these cases,
with pre/post-operation truncation to constrain the logical value range.

4 changed files:

compiler/GHC/CmmToLlvm/CodeGen.hs
+ testsuite/tests/llvm/should_run/T26065.hs
+ testsuite/tests/llvm/should_run/T26065.stdout
testsuite/tests/llvm/should_run/all.T

Changes:

compiler/GHC/CmmToLlvm/CodeGen.hs

@@ -240,12 +240,22 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args =
      genCallSimpleCast w op dst args
  genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
      genCallSimpleCast w op dst args
 -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
 -    genCallSimpleCast w op dst args
 -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
 -    genCallSimpleCast w op dst args
  genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
      genCallSimpleCast w op dst args
 +-- Check if the Intel BMI are enabled, and if the bit-width is less than 'W32'.
 +-- If so, we truncate to the 'W32' call the 32-bit intrinsic operation because
 +-- LLVM does not expose a call to 'PDep' and 'PExt' operations for bit-widths
 +-- of 'W8 and 'W16'.
 +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
 +    cfg <- getConfig
 +    if   llvmCgBmiVersion cfg >= Just BMI2
 +    then genCallMinimumTruncationCast W32 w op dst args
 +    else genCallSimpleCast w op dst args
 +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
 +    cfg <- getConfig
 +    if   llvmCgBmiVersion cfg >= Just BMI2
 +    then genCallMinimumTruncationCast W32 w op dst args
 +    else genCallSimpleCast w op dst args
  genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
      addrVar <- exprToVarW addr
@@ -641,8 +651,12 @@ genCallExtract _ _ _ _ =
  -- from i32 to i8 explicitly as LLVM is strict about types.
  genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
                    -> LlvmM StmtData
 -genCallSimpleCast specW op dst args = do
 -    let width   = widthToLlvmInt specW
 +genCallSimpleCast w = genCallMinimumTruncationCast w w
++
 +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
 +                             -> [CmmActual] -> LlvmM StmtData
 +genCallMinimumTruncationCast minW specW op dst args = do
 +    let width   = widthToLlvmInt $ max minW specW
          argsW   = const width <$> args
          dstType = cmmToLlvmType $ localRegType dst
          signage = cmmPrimOpRetValSignage op
@@ -945,17 +959,24 @@ cmmPrimOpFunctions mop = do
        W256 -> fsLit "llvm.cttz.i256"
        W512 -> fsLit "llvm.cttz.i512"
      MO_Pdep w
 +      -- If the Intel BMI are enabled, then we will be calling the intrinsic operation
 +      -- through the LLVM binding, unless however the bit-width is 'W8' or 'W16'.
 +      -- In these cases, we truncate to the 'W32' bit-width and /directly/ call the
 +      -- 32-bit BMI operation. This is necessary because the LLVM does not expose a
 +      -- call to the 'PDep' and 'PExt' operation for bit-diths of 'W8 and 'W16'.
 +      -- Hence the necessity to to call the BMI intrinsic operation directlky from
 +      -- outside the LLVM.
        | isBmi2Enabled -> case w of
 -          W8   -> fsLit "llvm.x86.bmi.pdep.8"
 -          W16  -> fsLit "llvm.x86.bmi.pdep.16"
 +          W8   -> fsLit "llvm.x86.bmi.pdep.32"
 +          W16  -> fsLit "llvm.x86.bmi.pdep.32"
            W32  -> fsLit "llvm.x86.bmi.pdep.32"
            W64  -> fsLit "llvm.x86.bmi.pdep.64"
            W128 -> fsLit "llvm.x86.bmi.pdep.128"
            W256 -> fsLit "llvm.x86.bmi.pdep.256"
            W512 -> fsLit "llvm.x86.bmi.pdep.512"
        | otherwise -> case w of
 -          W8   -> fsLit "hs_pdep8"
 -          W16  -> fsLit "hs_pdep16"
 +          W8   -> fsLit "hs_pdep32"
 +          W16  -> fsLit "hs_pdep32"
            W32  -> fsLit "hs_pdep32"
            W64  -> fsLit "hs_pdep64"
            W128 -> fsLit "hs_pdep128"
@@ -963,8 +984,10 @@ cmmPrimOpFunctions mop = do
            W512 -> fsLit "hs_pdep512"
      MO_Pext w
        | isBmi2Enabled -> case w of
 -          W8   -> fsLit "llvm.x86.bmi.pext.8"
 -          W16  -> fsLit "llvm.x86.bmi.pext.16"
 +          -- See the 'Mo_Pdep' commentary above as to why we call 'pext.32'
 +          -- instead of calling 'pext.8' or 'pext.16' operations.
 +          W8   -> fsLit "llvm.x86.bmi.pext.32"
 +          W16  -> fsLit "llvm.x86.bmi.pext.32"
            W32  -> fsLit "llvm.x86.bmi.pext.32"
            W64  -> fsLit "llvm.x86.bmi.pext.64"
            W128 -> fsLit "llvm.x86.bmi.pext.128"

testsuite/tests/llvm/should_run/T26065.hs

 +{-# LANGUAGE MagicHash #-}
 +import GHC.Exts
 +import GHC.Word
++
 +pdep8 :: Word8 -> Word8 -> Word8
 +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
 +{-# NOINLINE pdep8 #-}
++
 +pdep16 :: Word16 -> Word16 -> Word16
 +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
 +{-# NOINLINE pdep16 #-}
++
 +pdep32 :: Word32 -> Word32 -> Word32
 +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
 +{-# NOINLINE pdep32 #-}
++
 +pdep64 :: Word64 -> Word64 -> Word64
 +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
 +{-# NOINLINE pdep64 #-}
++
 +pext8 :: Word8 -> Word8 -> Word8
 +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
 +{-# NOINLINE pext8 #-}
++
 +pext16 :: Word16 -> Word16 -> Word16
 +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
 +{-# NOINLINE pext16 #-}
++
 +pext32 :: Word32 -> Word32 -> Word32
 +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
 +{-# NOINLINE pext32 #-}
++
 +pext64 :: Word64 -> Word64 -> Word64
 +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
 +{-# NOINLINE pext64 #-}
++
 +main :: IO ()
 +main = do
 +  putStr "pdep8:\t"  *> print (pdep8  7 3)
 +  putStr "pdep16:\t" *> print (pdep16 7 3)
 +  putStr "pdep32:\t" *> print (pdep32 7 3)
 +  putStr "pdep64:\t" *> print (pdep64 7 3)
 +  putStr "pext8:\t"  *> print (pext8  7 3)
 +  putStr "pext16:\t" *> print (pext16 7 3)
 +  putStr "pext32:\t" *> print (pext32 7 3)
 +  putStr "pext64:\t" *> print (pext64 7 3)

testsuite/tests/llvm/should_run/T26065.stdout

 +pdep8:	3
 +pdep16:	3
 +pdep32:	3
 +pdep64:	3
 +pext8:	3
 +pext16:	3
 +pext32:	3
 +pext64:	3

testsuite/tests/llvm/should_run/all.T

@@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a
  test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
    # T25730C.c contains Intel instrinsics, so only run this test on x86
  test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
 +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
 +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
 +                unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
 +                compile_and_run, ['-mbmi2'])
++