recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC
Commits:
-
1db5d2a8
by Alex Washburn at 2025-08-25T10:32:53-04:00
4 changed files:
- compiler/GHC/CmmToLlvm/CodeGen.hs
- + testsuite/tests/llvm/should_run/T26065.hs
- + testsuite/tests/llvm/should_run/T26065.stdout
- testsuite/tests/llvm/should_run/all.T
Changes:
| ... | ... | @@ -240,12 +240,25 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args = |
| 240 | 240 | genCallSimpleCast w op dst args
|
| 241 | 241 | genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
|
| 242 | 242 | genCallSimpleCast w op dst args
|
| 243 | -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
|
|
| 244 | - genCallSimpleCast w op dst args
|
|
| 245 | -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
|
|
| 246 | - genCallSimpleCast w op dst args
|
|
| 247 | 243 | genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
|
| 248 | 244 | genCallSimpleCast w op dst args
|
| 245 | +{- Note [LLVM PDep/PExt intrinsics]
|
|
| 246 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
| 247 | +Since x86 PDep/PExt instructions only exist for 32/64 bit widths
|
|
| 248 | +we use the 32bit variant to compute the 8/16bit primops.
|
|
| 249 | +To do so we extend/truncate the argument/result around the
|
|
| 250 | +call.
|
|
| 251 | +-}
|
|
| 252 | +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
|
|
| 253 | + cfg <- getConfig
|
|
| 254 | + if llvmCgBmiVersion cfg >= Just BMI2
|
|
| 255 | + then genCallMinimumTruncationCast W32 w op dst args
|
|
| 256 | + else genCallSimpleCast w op dst args
|
|
| 257 | +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
|
|
| 258 | + cfg <- getConfig
|
|
| 259 | + if llvmCgBmiVersion cfg >= Just BMI2
|
|
| 260 | + then genCallMinimumTruncationCast W32 w op dst args
|
|
| 261 | + else genCallSimpleCast w op dst args
|
|
| 249 | 262 | |
| 250 | 263 | genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
|
| 251 | 264 | addrVar <- exprToVarW addr
|
| ... | ... | @@ -641,8 +654,15 @@ genCallExtract _ _ _ _ = |
| 641 | 654 | -- from i32 to i8 explicitly as LLVM is strict about types.
|
| 642 | 655 | genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
|
| 643 | 656 | -> LlvmM StmtData
|
| 644 | -genCallSimpleCast specW op dst args = do
|
|
| 645 | - let width = widthToLlvmInt specW
|
|
| 657 | +genCallSimpleCast w = genCallMinimumTruncationCast w w
|
|
| 658 | + |
|
| 659 | +-- Given the minimum machine bit-width to use and the logical bit-width of the
|
|
| 660 | +-- value range, perform a type-cast truncation and extension before and after the
|
|
| 661 | +-- specified operation, respectively.
|
|
| 662 | +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
|
|
| 663 | + -> [CmmActual] -> LlvmM StmtData
|
|
| 664 | +genCallMinimumTruncationCast minW specW op dst args = do
|
|
| 665 | + let width = widthToLlvmInt $ max minW specW
|
|
| 646 | 666 | argsW = const width <$> args
|
| 647 | 667 | dstType = cmmToLlvmType $ localRegType dst
|
| 648 | 668 | signage = cmmPrimOpRetValSignage op
|
| ... | ... | @@ -945,9 +965,10 @@ cmmPrimOpFunctions mop = do |
| 945 | 965 | W256 -> fsLit "llvm.cttz.i256"
|
| 946 | 966 | W512 -> fsLit "llvm.cttz.i512"
|
| 947 | 967 | MO_Pdep w
|
| 968 | + -- See Note [LLVM PDep/PExt intrinsics]
|
|
| 948 | 969 | | isBmi2Enabled -> case w of
|
| 949 | - W8 -> fsLit "llvm.x86.bmi.pdep.8"
|
|
| 950 | - W16 -> fsLit "llvm.x86.bmi.pdep.16"
|
|
| 970 | + W8 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
| 971 | + W16 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
| 951 | 972 | W32 -> fsLit "llvm.x86.bmi.pdep.32"
|
| 952 | 973 | W64 -> fsLit "llvm.x86.bmi.pdep.64"
|
| 953 | 974 | W128 -> fsLit "llvm.x86.bmi.pdep.128"
|
| ... | ... | @@ -963,8 +984,9 @@ cmmPrimOpFunctions mop = do |
| 963 | 984 | W512 -> fsLit "hs_pdep512"
|
| 964 | 985 | MO_Pext w
|
| 965 | 986 | | isBmi2Enabled -> case w of
|
| 966 | - W8 -> fsLit "llvm.x86.bmi.pext.8"
|
|
| 967 | - W16 -> fsLit "llvm.x86.bmi.pext.16"
|
|
| 987 | + -- See Note [LLVM PDep/PExt intrinsics]
|
|
| 988 | + W8 -> fsLit "llvm.x86.bmi.pext.32"
|
|
| 989 | + W16 -> fsLit "llvm.x86.bmi.pext.32"
|
|
| 968 | 990 | W32 -> fsLit "llvm.x86.bmi.pext.32"
|
| 969 | 991 | W64 -> fsLit "llvm.x86.bmi.pext.64"
|
| 970 | 992 | W128 -> fsLit "llvm.x86.bmi.pext.128"
|
| 1 | +{-# LANGUAGE MagicHash #-}
|
|
| 2 | +{-# LANGUAGE ScopedTypeVariables #-}
|
|
| 3 | +import Data.Char (toUpper)
|
|
| 4 | +import GHC.Exts
|
|
| 5 | +import GHC.Word
|
|
| 6 | +import Numeric (showHex)
|
|
| 7 | + |
|
| 8 | +pdep8 :: Word8 -> Word8 -> Word8
|
|
| 9 | +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
|
|
| 10 | +{-# NOINLINE pdep8 #-}
|
|
| 11 | + |
|
| 12 | +pdep16 :: Word16 -> Word16 -> Word16
|
|
| 13 | +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
|
|
| 14 | +{-# NOINLINE pdep16 #-}
|
|
| 15 | + |
|
| 16 | +pdep32 :: Word32 -> Word32 -> Word32
|
|
| 17 | +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
|
|
| 18 | +{-# NOINLINE pdep32 #-}
|
|
| 19 | + |
|
| 20 | +pdep64 :: Word64 -> Word64 -> Word64
|
|
| 21 | +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
|
|
| 22 | +{-# NOINLINE pdep64 #-}
|
|
| 23 | + |
|
| 24 | +pext8 :: Word8 -> Word8 -> Word8
|
|
| 25 | +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
|
|
| 26 | +{-# NOINLINE pext8 #-}
|
|
| 27 | + |
|
| 28 | +pext16 :: Word16 -> Word16 -> Word16
|
|
| 29 | +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
|
|
| 30 | +{-# NOINLINE pext16 #-}
|
|
| 31 | + |
|
| 32 | +pext32 :: Word32 -> Word32 -> Word32
|
|
| 33 | +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
|
|
| 34 | +{-# NOINLINE pext32 #-}
|
|
| 35 | + |
|
| 36 | +pext64 :: Word64 -> Word64 -> Word64
|
|
| 37 | +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
|
|
| 38 | +{-# NOINLINE pext64 #-}
|
|
| 39 | + |
|
| 40 | +valueSource :: Integral i => i
|
|
| 41 | +valueSource = fromInteger 0xA7F7A7F7A7F7A7F7
|
|
| 42 | + |
|
| 43 | +valueMask :: Integral i => i
|
|
| 44 | +valueMask = fromInteger 0x5555555555555555
|
|
| 45 | + |
|
| 46 | +printIntrinsicCall :: forall i. Integral i => String -> (i -> i -> i) -> IO ()
|
|
| 47 | +printIntrinsicCall label f =
|
|
| 48 | + let op1 = valueSource
|
|
| 49 | + op2 = valueMask
|
|
| 50 | + pad s =
|
|
| 51 | + let hex :: Integral a => a -> String
|
|
| 52 | + hex = flip showHex ""
|
|
| 53 | + str = toUpper <$> hex s
|
|
| 54 | + len = length $ hex (maxBound :: Word64)
|
|
| 55 | + n = length str
|
|
| 56 | + in "0x" <> replicate (len - n) '0' <> str
|
|
| 57 | + in putStrLn $ unwords [ label, pad op1, pad op2, "=", pad (f op1 op2) ]
|
|
| 58 | + |
|
| 59 | +main :: IO ()
|
|
| 60 | +main = do
|
|
| 61 | + printIntrinsicCall "pdep8 " pdep8
|
|
| 62 | + printIntrinsicCall "pdep16" pdep16
|
|
| 63 | + printIntrinsicCall "pdep32" pdep32
|
|
| 64 | + printIntrinsicCall "pdep64" pdep64
|
|
| 65 | + printIntrinsicCall "pext8 " pext8
|
|
| 66 | + printIntrinsicCall "pext16" pext16
|
|
| 67 | + printIntrinsicCall "pext32" pext32
|
|
| 68 | + printIntrinsicCall "pext64" pext64 |
| 1 | +pdep8 0x00000000000000F7 0x0000000000000055 = 0x0000000000000015
|
|
| 2 | +pdep16 0x000000000000A7F7 0x0000000000005555 = 0x0000000000005515
|
|
| 3 | +pdep32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000044155515
|
|
| 4 | +pdep64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x4415551544155515
|
|
| 5 | +pext8 0x00000000000000F7 0x0000000000000055 = 0x000000000000000F
|
|
| 6 | +pext16 0x000000000000A7F7 0x0000000000005555 = 0x000000000000003F
|
|
| 7 | +pext32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000000003F3F
|
|
| 8 | +pext64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x000000003F3F3F3F |
| ... | ... | @@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a |
| 18 | 18 | test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
|
| 19 | 19 | # T25730C.c contains Intel instrinsics, so only run this test on x86
|
| 20 | 20 | test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
|
| 21 | +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
|
|
| 22 | +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
|
|
| 23 | + unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
|
|
| 24 | + compile_and_run, ['-mbmi2'])
|
|
| 25 | + |