recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC
Commits:
-
4a5afb4e
by Alex Washburn at 2025-08-12T14:43:24-04:00
4 changed files:
- compiler/GHC/CmmToLlvm/CodeGen.hs
- + testsuite/tests/llvm/should_run/T26065.hs
- + testsuite/tests/llvm/should_run/T26065.stdout
- testsuite/tests/llvm/should_run/all.T
Changes:
... | ... | @@ -240,12 +240,22 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args = |
240 | 240 | genCallSimpleCast w op dst args
|
241 | 241 | genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
|
242 | 242 | genCallSimpleCast w op dst args
|
243 | -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
|
|
244 | - genCallSimpleCast w op dst args
|
|
245 | -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
|
|
246 | - genCallSimpleCast w op dst args
|
|
247 | 243 | genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
|
248 | 244 | genCallSimpleCast w op dst args
|
245 | +-- Check if the Intel BMI are enabled, and if the bit-width is less than 'W32'.
|
|
246 | +-- If so, we truncate to the 'W32' call the 32-bit intrinsic operation because
|
|
247 | +-- LLVM does not expose a call to 'PDep' and 'PExt' operations for bit-widths
|
|
248 | +-- of 'W8 and 'W16'.
|
|
249 | +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
|
|
250 | + cfg <- getConfig
|
|
251 | + if llvmCgBmiVersion cfg >= Just BMI2
|
|
252 | + then genCallMinimumTruncationCast W32 w op dst args
|
|
253 | + else genCallSimpleCast w op dst args
|
|
254 | +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
|
|
255 | + cfg <- getConfig
|
|
256 | + if llvmCgBmiVersion cfg >= Just BMI2
|
|
257 | + then genCallMinimumTruncationCast W32 w op dst args
|
|
258 | + else genCallSimpleCast w op dst args
|
|
249 | 259 | |
250 | 260 | genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
|
251 | 261 | addrVar <- exprToVarW addr
|
... | ... | @@ -641,8 +651,12 @@ genCallExtract _ _ _ _ = |
641 | 651 | -- from i32 to i8 explicitly as LLVM is strict about types.
|
642 | 652 | genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
|
643 | 653 | -> LlvmM StmtData
|
644 | -genCallSimpleCast specW op dst args = do
|
|
645 | - let width = widthToLlvmInt specW
|
|
654 | +genCallSimpleCast w = genCallMinimumTruncationCast w w
|
|
655 | + |
|
656 | +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
|
|
657 | + -> [CmmActual] -> LlvmM StmtData
|
|
658 | +genCallMinimumTruncationCast minW specW op dst args = do
|
|
659 | + let width = widthToLlvmInt $ max minW specW
|
|
646 | 660 | argsW = const width <$> args
|
647 | 661 | dstType = cmmToLlvmType $ localRegType dst
|
648 | 662 | signage = cmmPrimOpRetValSignage op
|
... | ... | @@ -945,17 +959,24 @@ cmmPrimOpFunctions mop = do |
945 | 959 | W256 -> fsLit "llvm.cttz.i256"
|
946 | 960 | W512 -> fsLit "llvm.cttz.i512"
|
947 | 961 | MO_Pdep w
|
962 | + -- If the Intel BMI are enabled, then we will be calling the intrinsic operation
|
|
963 | + -- through the LLVM binding, unless however the bit-width is 'W8' or 'W16'.
|
|
964 | + -- In these cases, we truncate to the 'W32' bit-width and /directly/ call the
|
|
965 | + -- 32-bit BMI operation. This is necessary because the LLVM does not expose a
|
|
966 | + -- call to the 'PDep' and 'PExt' operation for bit-diths of 'W8 and 'W16'.
|
|
967 | + -- Hence the necessity to to call the BMI intrinsic operation directlky from
|
|
968 | + -- outside the LLVM.
|
|
948 | 969 | | isBmi2Enabled -> case w of
|
949 | - W8 -> fsLit "llvm.x86.bmi.pdep.8"
|
|
950 | - W16 -> fsLit "llvm.x86.bmi.pdep.16"
|
|
970 | + W8 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
971 | + W16 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
951 | 972 | W32 -> fsLit "llvm.x86.bmi.pdep.32"
|
952 | 973 | W64 -> fsLit "llvm.x86.bmi.pdep.64"
|
953 | 974 | W128 -> fsLit "llvm.x86.bmi.pdep.128"
|
954 | 975 | W256 -> fsLit "llvm.x86.bmi.pdep.256"
|
955 | 976 | W512 -> fsLit "llvm.x86.bmi.pdep.512"
|
956 | 977 | | otherwise -> case w of
|
957 | - W8 -> fsLit "hs_pdep8"
|
|
958 | - W16 -> fsLit "hs_pdep16"
|
|
978 | + W8 -> fsLit "hs_pdep32"
|
|
979 | + W16 -> fsLit "hs_pdep32"
|
|
959 | 980 | W32 -> fsLit "hs_pdep32"
|
960 | 981 | W64 -> fsLit "hs_pdep64"
|
961 | 982 | W128 -> fsLit "hs_pdep128"
|
... | ... | @@ -963,8 +984,10 @@ cmmPrimOpFunctions mop = do |
963 | 984 | W512 -> fsLit "hs_pdep512"
|
964 | 985 | MO_Pext w
|
965 | 986 | | isBmi2Enabled -> case w of
|
966 | - W8 -> fsLit "llvm.x86.bmi.pext.8"
|
|
967 | - W16 -> fsLit "llvm.x86.bmi.pext.16"
|
|
987 | + -- See the 'Mo_Pdep' commentary above as to why we call 'pext.32'
|
|
988 | + -- instead of calling 'pext.8' or 'pext.16' operations.
|
|
989 | + W8 -> fsLit "llvm.x86.bmi.pext.32"
|
|
990 | + W16 -> fsLit "llvm.x86.bmi.pext.32"
|
|
968 | 991 | W32 -> fsLit "llvm.x86.bmi.pext.32"
|
969 | 992 | W64 -> fsLit "llvm.x86.bmi.pext.64"
|
970 | 993 | W128 -> fsLit "llvm.x86.bmi.pext.128"
|
1 | +{-# LANGUAGE MagicHash #-}
|
|
2 | +import GHC.Exts
|
|
3 | +import GHC.Word
|
|
4 | + |
|
5 | +pdep8 :: Word8 -> Word8 -> Word8
|
|
6 | +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
|
|
7 | +{-# NOINLINE pdep8 #-}
|
|
8 | + |
|
9 | +pdep16 :: Word16 -> Word16 -> Word16
|
|
10 | +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
|
|
11 | +{-# NOINLINE pdep16 #-}
|
|
12 | + |
|
13 | +pdep32 :: Word32 -> Word32 -> Word32
|
|
14 | +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
|
|
15 | +{-# NOINLINE pdep32 #-}
|
|
16 | + |
|
17 | +pdep64 :: Word64 -> Word64 -> Word64
|
|
18 | +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
|
|
19 | +{-# NOINLINE pdep64 #-}
|
|
20 | + |
|
21 | +pext8 :: Word8 -> Word8 -> Word8
|
|
22 | +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
|
|
23 | +{-# NOINLINE pext8 #-}
|
|
24 | + |
|
25 | +pext16 :: Word16 -> Word16 -> Word16
|
|
26 | +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
|
|
27 | +{-# NOINLINE pext16 #-}
|
|
28 | + |
|
29 | +pext32 :: Word32 -> Word32 -> Word32
|
|
30 | +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
|
|
31 | +{-# NOINLINE pext32 #-}
|
|
32 | + |
|
33 | +pext64 :: Word64 -> Word64 -> Word64
|
|
34 | +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
|
|
35 | +{-# NOINLINE pext64 #-}
|
|
36 | + |
|
37 | +main :: IO ()
|
|
38 | +main = do
|
|
39 | + putStr "pdep8:\t" *> print (pdep8 7 3)
|
|
40 | + putStr "pdep16:\t" *> print (pdep16 7 3)
|
|
41 | + putStr "pdep32:\t" *> print (pdep32 7 3)
|
|
42 | + putStr "pdep64:\t" *> print (pdep64 7 3)
|
|
43 | + putStr "pext8:\t" *> print (pext8 7 3)
|
|
44 | + putStr "pext16:\t" *> print (pext16 7 3)
|
|
45 | + putStr "pext32:\t" *> print (pext32 7 3)
|
|
46 | + putStr "pext64:\t" *> print (pext64 7 3) |
1 | +pdep8: 3
|
|
2 | +pdep16: 3
|
|
3 | +pdep32: 3
|
|
4 | +pdep64: 3
|
|
5 | +pext8: 3
|
|
6 | +pext16: 3
|
|
7 | +pext32: 3
|
|
8 | +pext64: 3 |
... | ... | @@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a |
18 | 18 | test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
|
19 | 19 | # T25730C.c contains Intel instrinsics, so only run this test on x86
|
20 | 20 | test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
|
21 | +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
|
|
22 | +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
|
|
23 | + unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
|
|
24 | + compile_and_run, ['-mbmi2'])
|
|
25 | + |