Andreas Klebinger pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC
Commits:
-
d97f54f2
by Alex Washburn at 2025-09-11T13:38:05+02:00
4 changed files:
- compiler/GHC/CmmToLlvm/CodeGen.hs
- + testsuite/tests/llvm/should_run/T26065.hs
- + testsuite/tests/llvm/should_run/T26065.stdout
- testsuite/tests/llvm/should_run/all.T
Changes:
... | ... | @@ -240,12 +240,25 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args = |
240 | 240 | genCallSimpleCast w op dst args
|
241 | 241 | genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
|
242 | 242 | genCallSimpleCast w op dst args
|
243 | -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
|
|
244 | - genCallSimpleCast w op dst args
|
|
245 | -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
|
|
246 | - genCallSimpleCast w op dst args
|
|
247 | 243 | genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
|
248 | 244 | genCallSimpleCast w op dst args
|
245 | +{- Note [LLVM PDep/PExt intrinsics]
|
|
246 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
247 | +Since x86 PDep/PExt instructions only exist for 32/64 bit widths
|
|
248 | +we use the 32bit variant to compute the 8/16bit primops.
|
|
249 | +To do so we extend/truncate the argument/result around the
|
|
250 | +call.
|
|
251 | +-}
|
|
252 | +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
|
|
253 | + cfg <- getConfig
|
|
254 | + if llvmCgBmiVersion cfg >= Just BMI2
|
|
255 | + then genCallMinimumTruncationCast W32 w op dst args
|
|
256 | + else genCallSimpleCast w op dst args
|
|
257 | +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
|
|
258 | + cfg <- getConfig
|
|
259 | + if llvmCgBmiVersion cfg >= Just BMI2
|
|
260 | + then genCallMinimumTruncationCast W32 w op dst args
|
|
261 | + else genCallSimpleCast w op dst args
|
|
249 | 262 | |
250 | 263 | genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
|
251 | 264 | addrVar <- exprToVarW addr
|
... | ... | @@ -641,8 +654,15 @@ genCallExtract _ _ _ _ = |
641 | 654 | -- from i32 to i8 explicitly as LLVM is strict about types.
|
642 | 655 | genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
|
643 | 656 | -> LlvmM StmtData
|
644 | -genCallSimpleCast specW op dst args = do
|
|
645 | - let width = widthToLlvmInt specW
|
|
657 | +genCallSimpleCast w = genCallMinimumTruncationCast w w
|
|
658 | + |
|
659 | +-- Given the minimum machine bit-width to use and the logical bit-width of the
|
|
660 | +-- value range, perform a type-cast truncation and extension before and after the
|
|
661 | +-- specified operation, respectively.
|
|
662 | +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
|
|
663 | + -> [CmmActual] -> LlvmM StmtData
|
|
664 | +genCallMinimumTruncationCast minW specW op dst args = do
|
|
665 | + let width = widthToLlvmInt $ max minW specW
|
|
646 | 666 | argsW = const width <$> args
|
647 | 667 | dstType = cmmToLlvmType $ localRegType dst
|
648 | 668 | signage = cmmPrimOpRetValSignage op
|
... | ... | @@ -945,9 +965,10 @@ cmmPrimOpFunctions mop = do |
945 | 965 | W256 -> fsLit "llvm.cttz.i256"
|
946 | 966 | W512 -> fsLit "llvm.cttz.i512"
|
947 | 967 | MO_Pdep w
|
968 | + -- See Note [LLVM PDep/PExt intrinsics]
|
|
948 | 969 | | isBmi2Enabled -> case w of
|
949 | - W8 -> fsLit "llvm.x86.bmi.pdep.8"
|
|
950 | - W16 -> fsLit "llvm.x86.bmi.pdep.16"
|
|
970 | + W8 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
971 | + W16 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
951 | 972 | W32 -> fsLit "llvm.x86.bmi.pdep.32"
|
952 | 973 | W64 -> fsLit "llvm.x86.bmi.pdep.64"
|
953 | 974 | W128 -> fsLit "llvm.x86.bmi.pdep.128"
|
... | ... | @@ -963,8 +984,9 @@ cmmPrimOpFunctions mop = do |
963 | 984 | W512 -> fsLit "hs_pdep512"
|
964 | 985 | MO_Pext w
|
965 | 986 | | isBmi2Enabled -> case w of
|
966 | - W8 -> fsLit "llvm.x86.bmi.pext.8"
|
|
967 | - W16 -> fsLit "llvm.x86.bmi.pext.16"
|
|
987 | + -- See Note [LLVM PDep/PExt intrinsics]
|
|
988 | + W8 -> fsLit "llvm.x86.bmi.pext.32"
|
|
989 | + W16 -> fsLit "llvm.x86.bmi.pext.32"
|
|
968 | 990 | W32 -> fsLit "llvm.x86.bmi.pext.32"
|
969 | 991 | W64 -> fsLit "llvm.x86.bmi.pext.64"
|
970 | 992 | W128 -> fsLit "llvm.x86.bmi.pext.128"
|
1 | +{-# LANGUAGE MagicHash #-}
|
|
2 | +{-# LANGUAGE ScopedTypeVariables #-}
|
|
3 | +import Data.Char (toUpper)
|
|
4 | +import GHC.Exts
|
|
5 | +import GHC.Word
|
|
6 | +import Numeric (showHex)
|
|
7 | + |
|
8 | +pdep8 :: Word8 -> Word8 -> Word8
|
|
9 | +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
|
|
10 | +{-# NOINLINE pdep8 #-}
|
|
11 | + |
|
12 | +pdep16 :: Word16 -> Word16 -> Word16
|
|
13 | +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
|
|
14 | +{-# NOINLINE pdep16 #-}
|
|
15 | + |
|
16 | +pdep32 :: Word32 -> Word32 -> Word32
|
|
17 | +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
|
|
18 | +{-# NOINLINE pdep32 #-}
|
|
19 | + |
|
20 | +pdep64 :: Word64 -> Word64 -> Word64
|
|
21 | +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
|
|
22 | +{-# NOINLINE pdep64 #-}
|
|
23 | + |
|
24 | +pext8 :: Word8 -> Word8 -> Word8
|
|
25 | +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
|
|
26 | +{-# NOINLINE pext8 #-}
|
|
27 | + |
|
28 | +pext16 :: Word16 -> Word16 -> Word16
|
|
29 | +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
|
|
30 | +{-# NOINLINE pext16 #-}
|
|
31 | + |
|
32 | +pext32 :: Word32 -> Word32 -> Word32
|
|
33 | +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
|
|
34 | +{-# NOINLINE pext32 #-}
|
|
35 | + |
|
36 | +pext64 :: Word64 -> Word64 -> Word64
|
|
37 | +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
|
|
38 | +{-# NOINLINE pext64 #-}
|
|
39 | + |
|
40 | +valueSource :: Integral i => i
|
|
41 | +valueSource = fromInteger 0xA7F7A7F7A7F7A7F7
|
|
42 | + |
|
43 | +valueMask :: Integral i => i
|
|
44 | +valueMask = fromInteger 0x5555555555555555
|
|
45 | + |
|
46 | +printIntrinsicCall :: forall i. Integral i => String -> (i -> i -> i) -> IO ()
|
|
47 | +printIntrinsicCall label f =
|
|
48 | + let op1 = valueSource
|
|
49 | + op2 = valueMask
|
|
50 | + pad s =
|
|
51 | + let hex :: Integral a => a -> String
|
|
52 | + hex = flip showHex ""
|
|
53 | + str = toUpper <$> hex s
|
|
54 | + len = length $ hex (maxBound :: Word64)
|
|
55 | + n = length str
|
|
56 | + in "0x" <> replicate (len - n) '0' <> str
|
|
57 | + in putStrLn $ unwords [ label, pad op1, pad op2, "=", pad (f op1 op2) ]
|
|
58 | + |
|
59 | +main :: IO ()
|
|
60 | +main = do
|
|
61 | + printIntrinsicCall "pdep8 " pdep8
|
|
62 | + printIntrinsicCall "pdep16" pdep16
|
|
63 | + printIntrinsicCall "pdep32" pdep32
|
|
64 | + printIntrinsicCall "pdep64" pdep64
|
|
65 | + printIntrinsicCall "pext8 " pext8
|
|
66 | + printIntrinsicCall "pext16" pext16
|
|
67 | + printIntrinsicCall "pext32" pext32
|
|
68 | + printIntrinsicCall "pext64" pext64 |
1 | +pdep8 0x00000000000000F7 0x0000000000000055 = 0x0000000000000015
|
|
2 | +pdep16 0x000000000000A7F7 0x0000000000005555 = 0x0000000000005515
|
|
3 | +pdep32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000044155515
|
|
4 | +pdep64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x4415551544155515
|
|
5 | +pext8 0x00000000000000F7 0x0000000000000055 = 0x000000000000000F
|
|
6 | +pext16 0x000000000000A7F7 0x0000000000005555 = 0x000000000000003F
|
|
7 | +pext32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000000003F3F
|
|
8 | +pext64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x000000003F3F3F3F |
... | ... | @@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a |
18 | 18 | test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
|
19 | 19 | # T25730C.c contains Intel instrinsics, so only run this test on x86
|
20 | 20 | test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
|
21 | +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
|
|
22 | +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
|
|
23 | + unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
|
|
24 | + compile_and_run, ['-mbmi2'])
|
|
25 | + |