[Git][ghc/ghc][wip/fix-26065] Correcting LLVM linking of Intel BMI intrinsics pdep{8,16} and pext{8,16}.

12 Aug 2025


      recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC


Commits:
4a5afb4e by Alex Washburn at 2025-08-12T14:43:24-04:00
Correcting LLVM linking of Intel BMI intrinsics pdep{8,16} and pext{8,16}.

This patch fixes #26045.

The LLVM interface does not expose bindings to:

  - llvm.x86.bmi.pdep.8
  - llvm.x86.bmi.pdep.16
  - llvm.x86.bmi.pext.8
  - llvm.x86.bmi.pext.16

So calls are instead made to llvm.x86.bmi.{pdep,pext}.32 in these cases,
with pre/post-operation truncation to constrain the logical value range.

- - - - -


4 changed files:

- compiler/GHC/CmmToLlvm/CodeGen.hs
- + testsuite/tests/llvm/should_run/T26065.hs
- + testsuite/tests/llvm/should_run/T26065.stdout
- testsuite/tests/llvm/should_run/all.T


Changes:

=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -240,12 +240,22 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args =
     genCallSimpleCast w op dst args
 genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
     genCallSimpleCast w op dst args
-genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
-    genCallSimpleCast w op dst args
-genCall (PrimTarget op@(MO_Pext w)) [dst] args =
-    genCallSimpleCast w op dst args
 genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
     genCallSimpleCast w op dst args
+-- Check if the Intel BMI are enabled, and if the bit-width is less than 'W32'.
+-- If so, we truncate to the 'W32' call the 32-bit intrinsic operation because
+-- LLVM does not expose a call to 'PDep' and 'PExt' operations for bit-widths
+-- of 'W8 and 'W16'.
+genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
+    cfg <- getConfig
+    if   llvmCgBmiVersion cfg >= Just BMI2
+    then genCallMinimumTruncationCast W32 w op dst args
+    else genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
+    cfg <- getConfig
+    if   llvmCgBmiVersion cfg >= Just BMI2
+    then genCallMinimumTruncationCast W32 w op dst args
+    else genCallSimpleCast w op dst args
 
 genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
     addrVar <- exprToVarW addr
@@ -641,8 +651,12 @@ genCallExtract _ _ _ _ =
 -- from i32 to i8 explicitly as LLVM is strict about types.
 genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
                   -> LlvmM StmtData
-genCallSimpleCast specW op dst args = do
-    let width   = widthToLlvmInt specW
+genCallSimpleCast w = genCallMinimumTruncationCast w w
+
+genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
+                             -> [CmmActual] -> LlvmM StmtData
+genCallMinimumTruncationCast minW specW op dst args = do
+    let width   = widthToLlvmInt $ max minW specW
         argsW   = const width <$> args
         dstType = cmmToLlvmType $ localRegType dst
         signage = cmmPrimOpRetValSignage op
@@ -945,17 +959,24 @@ cmmPrimOpFunctions mop = do
       W256 -> fsLit "llvm.cttz.i256"
       W512 -> fsLit "llvm.cttz.i512"
     MO_Pdep w
+      -- If the Intel BMI are enabled, then we will be calling the intrinsic operation
+      -- through the LLVM binding, unless however the bit-width is 'W8' or 'W16'.
+      -- In these cases, we truncate to the 'W32' bit-width and /directly/ call the
+      -- 32-bit BMI operation. This is necessary because the LLVM does not expose a
+      -- call to the 'PDep' and 'PExt' operation for bit-diths of 'W8 and 'W16'.
+      -- Hence the necessity to to call the BMI intrinsic operation directlky from
+      -- outside the LLVM.
       | isBmi2Enabled -> case w of
-          W8   -> fsLit "llvm.x86.bmi.pdep.8"
-          W16  -> fsLit "llvm.x86.bmi.pdep.16"
+          W8   -> fsLit "llvm.x86.bmi.pdep.32"
+          W16  -> fsLit "llvm.x86.bmi.pdep.32"
           W32  -> fsLit "llvm.x86.bmi.pdep.32"
           W64  -> fsLit "llvm.x86.bmi.pdep.64"
           W128 -> fsLit "llvm.x86.bmi.pdep.128"
           W256 -> fsLit "llvm.x86.bmi.pdep.256"
           W512 -> fsLit "llvm.x86.bmi.pdep.512"
       | otherwise -> case w of
-          W8   -> fsLit "hs_pdep8"
-          W16  -> fsLit "hs_pdep16"
+          W8   -> fsLit "hs_pdep32"
+          W16  -> fsLit "hs_pdep32"
           W32  -> fsLit "hs_pdep32"
           W64  -> fsLit "hs_pdep64"
           W128 -> fsLit "hs_pdep128"
@@ -963,8 +984,10 @@ cmmPrimOpFunctions mop = do
           W512 -> fsLit "hs_pdep512"
     MO_Pext w
       | isBmi2Enabled -> case w of
-          W8   -> fsLit "llvm.x86.bmi.pext.8"
-          W16  -> fsLit "llvm.x86.bmi.pext.16"
+          -- See the 'Mo_Pdep' commentary above as to why we call 'pext.32'
+          -- instead of calling 'pext.8' or 'pext.16' operations.
+          W8   -> fsLit "llvm.x86.bmi.pext.32"
+          W16  -> fsLit "llvm.x86.bmi.pext.32"
           W32  -> fsLit "llvm.x86.bmi.pext.32"
           W64  -> fsLit "llvm.x86.bmi.pext.64"
           W128 -> fsLit "llvm.x86.bmi.pext.128"


=====================================
testsuite/tests/llvm/should_run/T26065.hs
=====================================
@@ -0,0 +1,46 @@
+{-# LANGUAGE MagicHash #-}
+import GHC.Exts
+import GHC.Word
+
+pdep8 :: Word8 -> Word8 -> Word8
+pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
+{-# NOINLINE pdep8 #-}
+
+pdep16 :: Word16 -> Word16 -> Word16
+pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
+{-# NOINLINE pdep16 #-}
+
+pdep32 :: Word32 -> Word32 -> Word32
+pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
+{-# NOINLINE pdep32 #-}
+
+pdep64 :: Word64 -> Word64 -> Word64
+pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
+{-# NOINLINE pdep64 #-}
+
+pext8 :: Word8 -> Word8 -> Word8
+pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
+{-# NOINLINE pext8 #-}
+
+pext16 :: Word16 -> Word16 -> Word16
+pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
+{-# NOINLINE pext16 #-}
+
+pext32 :: Word32 -> Word32 -> Word32
+pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
+{-# NOINLINE pext32 #-}
+
+pext64 :: Word64 -> Word64 -> Word64
+pext64 (W64# a) (W64# b) = W64# (pext64# a b)
+{-# NOINLINE pext64 #-}
+
+main :: IO ()
+main = do
+  putStr "pdep8:\t"  *> print (pdep8  7 3)
+  putStr "pdep16:\t" *> print (pdep16 7 3)
+  putStr "pdep32:\t" *> print (pdep32 7 3)
+  putStr "pdep64:\t" *> print (pdep64 7 3)
+  putStr "pext8:\t"  *> print (pext8  7 3)
+  putStr "pext16:\t" *> print (pext16 7 3)
+  putStr "pext32:\t" *> print (pext32 7 3)
+  putStr "pext64:\t" *> print (pext64 7 3)


=====================================
testsuite/tests/llvm/should_run/T26065.stdout
=====================================
@@ -0,0 +1,8 @@
+pdep8:	3
+pdep16:	3
+pdep32:	3
+pdep64:	3
+pext8:	3
+pext16:	3
+pext32:	3
+pext64:	3


=====================================
testsuite/tests/llvm/should_run/all.T
=====================================
@@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a
 test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
   # T25730C.c contains Intel instrinsics, so only run this test on x86
 test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
+# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
+test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
+                unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
+                compile_and_run, ['-mbmi2'])
+



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4a5afb4e1b4f8391d999c326ba0bf297...

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4a5afb4e1b4f8391d999c326ba0bf297...
You're receiving this email because of your account on gitlab.haskell.org.

    

recursion-ninja (＠recursion-ninja)

tags

participants (1)