GitLab

recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC

Commits:

1db5d2a8

by Alex Washburn at 2025-08-25T10:32:53-04:00

Correcting LLVM linking of Intel BMI intrinsics pdep{8,16} and pext{8,16}.

This patch fixes #26045.

The LLVM interface does not expose bindings to:

  - llvm.x86.bmi.pdep.8
  - llvm.x86.bmi.pdep.16
  - llvm.x86.bmi.pext.8
  - llvm.x86.bmi.pext.16

So calls are instead made to llvm.x86.bmi.{pdep,pext}.32 in these cases,
with pre/post-operation truncation to constrain the logical value range.

4 changed files:

compiler/GHC/CmmToLlvm/CodeGen.hs
+ testsuite/tests/llvm/should_run/T26065.hs
+ testsuite/tests/llvm/should_run/T26065.stdout
testsuite/tests/llvm/should_run/all.T

Changes:

compiler/GHC/CmmToLlvm/CodeGen.hs

@@ -240,12 +240,25 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args =
      genCallSimpleCast w op dst args
  genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
      genCallSimpleCast w op dst args
 -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
 -    genCallSimpleCast w op dst args
 -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
 -    genCallSimpleCast w op dst args
  genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
      genCallSimpleCast w op dst args
 +{- Note [LLVM PDep/PExt intrinsics]
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +Since x86 PDep/PExt instructions only exist for 32/64 bit widths
 +we use the 32bit variant to compute the 8/16bit primops.
 +To do so we extend/truncate the argument/result around the
 +call.
 +-}
 +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
 +    cfg <- getConfig
 +    if  llvmCgBmiVersion cfg >= Just BMI2
 +        then genCallMinimumTruncationCast W32 w op dst args
 +        else genCallSimpleCast w op dst args
 +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
 +    cfg <- getConfig
 +    if  llvmCgBmiVersion cfg >= Just BMI2
 +        then genCallMinimumTruncationCast W32 w op dst args
 +        else genCallSimpleCast w op dst args
  genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
      addrVar <- exprToVarW addr
@@ -641,8 +654,15 @@ genCallExtract _ _ _ _ =
  -- from i32 to i8 explicitly as LLVM is strict about types.
  genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
                    -> LlvmM StmtData
 -genCallSimpleCast specW op dst args = do
 -    let width   = widthToLlvmInt specW
 +genCallSimpleCast w = genCallMinimumTruncationCast w w
++
 +-- Given the minimum machine bit-width to use and the logical bit-width of the
 +-- value range, perform a type-cast truncation and extension before and after the
 +-- specified operation, respectively.
 +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
 +                             -> [CmmActual] -> LlvmM StmtData
 +genCallMinimumTruncationCast minW specW op dst args = do
 +    let width   = widthToLlvmInt $ max minW specW
          argsW   = const width <$> args
          dstType = cmmToLlvmType $ localRegType dst
          signage = cmmPrimOpRetValSignage op
@@ -945,9 +965,10 @@ cmmPrimOpFunctions mop = do
        W256 -> fsLit "llvm.cttz.i256"
        W512 -> fsLit "llvm.cttz.i512"
      MO_Pdep w
 +      -- See Note [LLVM PDep/PExt intrinsics]
        | isBmi2Enabled -> case w of
 -          W8   -> fsLit "llvm.x86.bmi.pdep.8"
 -          W16  -> fsLit "llvm.x86.bmi.pdep.16"
 +          W8   -> fsLit "llvm.x86.bmi.pdep.32"
 +          W16  -> fsLit "llvm.x86.bmi.pdep.32"
            W32  -> fsLit "llvm.x86.bmi.pdep.32"
            W64  -> fsLit "llvm.x86.bmi.pdep.64"
            W128 -> fsLit "llvm.x86.bmi.pdep.128"
@@ -963,8 +984,9 @@ cmmPrimOpFunctions mop = do
            W512 -> fsLit "hs_pdep512"
      MO_Pext w
        | isBmi2Enabled -> case w of
 -          W8   -> fsLit "llvm.x86.bmi.pext.8"
 -          W16  -> fsLit "llvm.x86.bmi.pext.16"
 +          -- See Note [LLVM PDep/PExt intrinsics]
 +          W8   -> fsLit "llvm.x86.bmi.pext.32"
 +          W16  -> fsLit "llvm.x86.bmi.pext.32"
            W32  -> fsLit "llvm.x86.bmi.pext.32"
            W64  -> fsLit "llvm.x86.bmi.pext.64"
            W128 -> fsLit "llvm.x86.bmi.pext.128"

testsuite/tests/llvm/should_run/T26065.hs

 +{-# LANGUAGE MagicHash #-}
 +{-# LANGUAGE ScopedTypeVariables #-}
 +import Data.Char (toUpper)
 +import GHC.Exts
 +import GHC.Word
 +import Numeric (showHex)
++
 +pdep8 :: Word8 -> Word8 -> Word8
 +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
 +{-# NOINLINE pdep8 #-}
++
 +pdep16 :: Word16 -> Word16 -> Word16
 +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
 +{-# NOINLINE pdep16 #-}
++
 +pdep32 :: Word32 -> Word32 -> Word32
 +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
 +{-# NOINLINE pdep32 #-}
++
 +pdep64 :: Word64 -> Word64 -> Word64
 +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
 +{-# NOINLINE pdep64 #-}
++
 +pext8 :: Word8 -> Word8 -> Word8
 +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
 +{-# NOINLINE pext8 #-}
++
 +pext16 :: Word16 -> Word16 -> Word16
 +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
 +{-# NOINLINE pext16 #-}
++
 +pext32 :: Word32 -> Word32 -> Word32
 +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
 +{-# NOINLINE pext32 #-}
++
 +pext64 :: Word64 -> Word64 -> Word64
 +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
 +{-# NOINLINE pext64 #-}
++
 +valueSource :: Integral i => i
 +valueSource = fromInteger 0xA7F7A7F7A7F7A7F7
++
 +valueMask   :: Integral i => i
 +valueMask   = fromInteger 0x5555555555555555
++
 +printIntrinsicCall :: forall i. Integral i => String -> (i -> i -> i) -> IO ()
 +printIntrinsicCall label f =
 +  let op1 = valueSource
 +      op2 = valueMask
 +      pad s =
 +          let hex :: Integral a => a -> String
 +              hex = flip showHex ""
 +              str = toUpper <$> hex s
 +              len = length $ hex (maxBound :: Word64)
 +              n   = length str
 +          in  "0x" <> replicate (len - n) '0' <> str
 +  in  putStrLn $ unwords [ label, pad op1, pad op2, "=", pad (f op1 op2) ]
++
 +main :: IO ()
 +main = do
 +  printIntrinsicCall "pdep8 " pdep8
 +  printIntrinsicCall "pdep16" pdep16
 +  printIntrinsicCall "pdep32" pdep32
 +  printIntrinsicCall "pdep64" pdep64
 +  printIntrinsicCall "pext8 " pext8
 +  printIntrinsicCall "pext16" pext16
 +  printIntrinsicCall "pext32" pext32
 +  printIntrinsicCall "pext64" pext64

testsuite/tests/llvm/should_run/T26065.stdout

 +pdep8  0x00000000000000F7 0x0000000000000055 = 0x0000000000000015
 +pdep16 0x000000000000A7F7 0x0000000000005555 = 0x0000000000005515
 +pdep32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000044155515
 +pdep64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x4415551544155515
 +pext8  0x00000000000000F7 0x0000000000000055 = 0x000000000000000F
 +pext16 0x000000000000A7F7 0x0000000000005555 = 0x000000000000003F
 +pext32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000000003F3F
 +pext64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x000000003F3F3F3F

testsuite/tests/llvm/should_run/all.T

@@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a
  test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
    # T25730C.c contains Intel instrinsics, so only run this test on x86
  test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
 +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
 +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
 +                unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
 +                compile_and_run, ['-mbmi2'])
++