[Git][ghc/ghc][wip/fix-26065] Correcting LLVM linking of Intel BMI intrinsics pdep{8,16} and pext{8,16}.

recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC Commits: 4a5afb4e by Alex Washburn at 2025-08-12T14:43:24-04:00 Correcting LLVM linking of Intel BMI intrinsics pdep{8,16} and pext{8,16}. This patch fixes #26045. The LLVM interface does not expose bindings to: - llvm.x86.bmi.pdep.8 - llvm.x86.bmi.pdep.16 - llvm.x86.bmi.pext.8 - llvm.x86.bmi.pext.16 So calls are instead made to llvm.x86.bmi.{pdep,pext}.32 in these cases, with pre/post-operation truncation to constrain the logical value range. - - - - - 4 changed files: - compiler/GHC/CmmToLlvm/CodeGen.hs - + testsuite/tests/llvm/should_run/T26065.hs - + testsuite/tests/llvm/should_run/T26065.stdout - testsuite/tests/llvm/should_run/all.T Changes: ===================================== compiler/GHC/CmmToLlvm/CodeGen.hs ===================================== @@ -240,12 +240,22 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args = genCallSimpleCast w op dst args genCall (PrimTarget op@(MO_BSwap w)) [dst] args = genCallSimpleCast w op dst args -genCall (PrimTarget op@(MO_Pdep w)) [dst] args = - genCallSimpleCast w op dst args -genCall (PrimTarget op@(MO_Pext w)) [dst] args = - genCallSimpleCast w op dst args genCall (PrimTarget op@(MO_PopCnt w)) [dst] args = genCallSimpleCast w op dst args +-- Check if the Intel BMI are enabled, and if the bit-width is less than 'W32'. +-- If so, we truncate to the 'W32' call the 32-bit intrinsic operation because +-- LLVM does not expose a call to 'PDep' and 'PExt' operations for bit-widths +-- of 'W8 and 'W16'. +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do + cfg <- getConfig + if llvmCgBmiVersion cfg >= Just BMI2 + then genCallMinimumTruncationCast W32 w op dst args + else genCallSimpleCast w op dst args +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do + cfg <- getConfig + if llvmCgBmiVersion cfg >= Just BMI2 + then genCallMinimumTruncationCast W32 w op dst args + else genCallSimpleCast w op dst args genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do addrVar <- exprToVarW addr @@ -641,8 +651,12 @@ genCallExtract _ _ _ _ = -- from i32 to i8 explicitly as LLVM is strict about types. genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual] -> LlvmM StmtData -genCallSimpleCast specW op dst args = do - let width = widthToLlvmInt specW +genCallSimpleCast w = genCallMinimumTruncationCast w w + +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal + -> [CmmActual] -> LlvmM StmtData +genCallMinimumTruncationCast minW specW op dst args = do + let width = widthToLlvmInt $ max minW specW argsW = const width <$> args dstType = cmmToLlvmType $ localRegType dst signage = cmmPrimOpRetValSignage op @@ -945,17 +959,24 @@ cmmPrimOpFunctions mop = do W256 -> fsLit "llvm.cttz.i256" W512 -> fsLit "llvm.cttz.i512" MO_Pdep w + -- If the Intel BMI are enabled, then we will be calling the intrinsic operation + -- through the LLVM binding, unless however the bit-width is 'W8' or 'W16'. + -- In these cases, we truncate to the 'W32' bit-width and /directly/ call the + -- 32-bit BMI operation. This is necessary because the LLVM does not expose a + -- call to the 'PDep' and 'PExt' operation for bit-diths of 'W8 and 'W16'. + -- Hence the necessity to to call the BMI intrinsic operation directlky from + -- outside the LLVM. | isBmi2Enabled -> case w of - W8 -> fsLit "llvm.x86.bmi.pdep.8" - W16 -> fsLit "llvm.x86.bmi.pdep.16" + W8 -> fsLit "llvm.x86.bmi.pdep.32" + W16 -> fsLit "llvm.x86.bmi.pdep.32" W32 -> fsLit "llvm.x86.bmi.pdep.32" W64 -> fsLit "llvm.x86.bmi.pdep.64" W128 -> fsLit "llvm.x86.bmi.pdep.128" W256 -> fsLit "llvm.x86.bmi.pdep.256" W512 -> fsLit "llvm.x86.bmi.pdep.512" | otherwise -> case w of - W8 -> fsLit "hs_pdep8" - W16 -> fsLit "hs_pdep16" + W8 -> fsLit "hs_pdep32" + W16 -> fsLit "hs_pdep32" W32 -> fsLit "hs_pdep32" W64 -> fsLit "hs_pdep64" W128 -> fsLit "hs_pdep128" @@ -963,8 +984,10 @@ cmmPrimOpFunctions mop = do W512 -> fsLit "hs_pdep512" MO_Pext w | isBmi2Enabled -> case w of - W8 -> fsLit "llvm.x86.bmi.pext.8" - W16 -> fsLit "llvm.x86.bmi.pext.16" + -- See the 'Mo_Pdep' commentary above as to why we call 'pext.32' + -- instead of calling 'pext.8' or 'pext.16' operations. + W8 -> fsLit "llvm.x86.bmi.pext.32" + W16 -> fsLit "llvm.x86.bmi.pext.32" W32 -> fsLit "llvm.x86.bmi.pext.32" W64 -> fsLit "llvm.x86.bmi.pext.64" W128 -> fsLit "llvm.x86.bmi.pext.128" ===================================== testsuite/tests/llvm/should_run/T26065.hs ===================================== @@ -0,0 +1,46 @@ +{-# LANGUAGE MagicHash #-} +import GHC.Exts +import GHC.Word + +pdep8 :: Word8 -> Word8 -> Word8 +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b))) +{-# NOINLINE pdep8 #-} + +pdep16 :: Word16 -> Word16 -> Word16 +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b))) +{-# NOINLINE pdep16 #-} + +pdep32 :: Word32 -> Word32 -> Word32 +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b))) +{-# NOINLINE pdep32 #-} + +pdep64 :: Word64 -> Word64 -> Word64 +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b) +{-# NOINLINE pdep64 #-} + +pext8 :: Word8 -> Word8 -> Word8 +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b))) +{-# NOINLINE pext8 #-} + +pext16 :: Word16 -> Word16 -> Word16 +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b))) +{-# NOINLINE pext16 #-} + +pext32 :: Word32 -> Word32 -> Word32 +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b))) +{-# NOINLINE pext32 #-} + +pext64 :: Word64 -> Word64 -> Word64 +pext64 (W64# a) (W64# b) = W64# (pext64# a b) +{-# NOINLINE pext64 #-} + +main :: IO () +main = do + putStr "pdep8:\t" *> print (pdep8 7 3) + putStr "pdep16:\t" *> print (pdep16 7 3) + putStr "pdep32:\t" *> print (pdep32 7 3) + putStr "pdep64:\t" *> print (pdep64 7 3) + putStr "pext8:\t" *> print (pext8 7 3) + putStr "pext16:\t" *> print (pext16 7 3) + putStr "pext32:\t" *> print (pext32 7 3) + putStr "pext64:\t" *> print (pext64 7 3) ===================================== testsuite/tests/llvm/should_run/T26065.stdout ===================================== @@ -0,0 +1,8 @@ +pdep8: 3 +pdep16: 3 +pdep32: 3 +pdep64: 3 +pext8: 3 +pext16: 3 +pext32: 3 +pext64: 3 ===================================== testsuite/tests/llvm/should_run/all.T ===================================== @@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c']) # T25730C.c contains Intel instrinsics, so only run this test on x86 test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, ['']) +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86 +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])), + unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)], + compile_and_run, ['-mbmi2']) + View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4a5afb4e1b4f8391d999c326ba0bf297... -- View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4a5afb4e1b4f8391d999c326ba0bf297... You're receiving this email because of your account on gitlab.haskell.org.
participants (1)
-
recursion-ninja (@recursion-ninja)