Andreas Klebinger pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC

Commits:

4 changed files:

Changes:

  • compiler/GHC/CmmToLlvm/CodeGen.hs
    ... ... @@ -240,12 +240,25 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args =
    240 240
         genCallSimpleCast w op dst args
    
    241 241
     genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
    
    242 242
         genCallSimpleCast w op dst args
    
    243
    -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
    
    244
    -    genCallSimpleCast w op dst args
    
    245
    -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
    
    246
    -    genCallSimpleCast w op dst args
    
    247 243
     genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
    
    248 244
         genCallSimpleCast w op dst args
    
    245
    +{- Note [LLVM PDep/PExt intrinsics]
    
    246
    +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    247
    +Since x86 PDep/PExt instructions only exist for 32/64 bit widths
    
    248
    +we use the 32bit variant to compute the 8/16bit primops.
    
    249
    +To do so we extend/truncate the argument/result around the
    
    250
    +call.
    
    251
    +-}
    
    252
    +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
    
    253
    +    cfg <- getConfig
    
    254
    +    if  llvmCgBmiVersion cfg >= Just BMI2
    
    255
    +        then genCallMinimumTruncationCast W32 w op dst args
    
    256
    +        else genCallSimpleCast w op dst args
    
    257
    +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
    
    258
    +    cfg <- getConfig
    
    259
    +    if  llvmCgBmiVersion cfg >= Just BMI2
    
    260
    +        then genCallMinimumTruncationCast W32 w op dst args
    
    261
    +        else genCallSimpleCast w op dst args
    
    249 262
     
    
    250 263
     genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
    
    251 264
         addrVar <- exprToVarW addr
    
    ... ... @@ -641,8 +654,15 @@ genCallExtract _ _ _ _ =
    641 654
     -- from i32 to i8 explicitly as LLVM is strict about types.
    
    642 655
     genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
    
    643 656
                       -> LlvmM StmtData
    
    644
    -genCallSimpleCast specW op dst args = do
    
    645
    -    let width   = widthToLlvmInt specW
    
    657
    +genCallSimpleCast w = genCallMinimumTruncationCast w w
    
    658
    +
    
    659
    +-- Given the minimum machine bit-width to use and the logical bit-width of the
    
    660
    +-- value range, perform a type-cast truncation and extension before and after the
    
    661
    +-- specified operation, respectively.
    
    662
    +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
    
    663
    +                             -> [CmmActual] -> LlvmM StmtData
    
    664
    +genCallMinimumTruncationCast minW specW op dst args = do
    
    665
    +    let width   = widthToLlvmInt $ max minW specW
    
    646 666
             argsW   = const width <$> args
    
    647 667
             dstType = cmmToLlvmType $ localRegType dst
    
    648 668
             signage = cmmPrimOpRetValSignage op
    
    ... ... @@ -945,9 +965,10 @@ cmmPrimOpFunctions mop = do
    945 965
           W256 -> fsLit "llvm.cttz.i256"
    
    946 966
           W512 -> fsLit "llvm.cttz.i512"
    
    947 967
         MO_Pdep w
    
    968
    +      -- See Note [LLVM PDep/PExt intrinsics]
    
    948 969
           | isBmi2Enabled -> case w of
    
    949
    -          W8   -> fsLit "llvm.x86.bmi.pdep.8"
    
    950
    -          W16  -> fsLit "llvm.x86.bmi.pdep.16"
    
    970
    +          W8   -> fsLit "llvm.x86.bmi.pdep.32"
    
    971
    +          W16  -> fsLit "llvm.x86.bmi.pdep.32"
    
    951 972
               W32  -> fsLit "llvm.x86.bmi.pdep.32"
    
    952 973
               W64  -> fsLit "llvm.x86.bmi.pdep.64"
    
    953 974
               W128 -> fsLit "llvm.x86.bmi.pdep.128"
    
    ... ... @@ -963,8 +984,9 @@ cmmPrimOpFunctions mop = do
    963 984
               W512 -> fsLit "hs_pdep512"
    
    964 985
         MO_Pext w
    
    965 986
           | isBmi2Enabled -> case w of
    
    966
    -          W8   -> fsLit "llvm.x86.bmi.pext.8"
    
    967
    -          W16  -> fsLit "llvm.x86.bmi.pext.16"
    
    987
    +          -- See Note [LLVM PDep/PExt intrinsics]
    
    988
    +          W8   -> fsLit "llvm.x86.bmi.pext.32"
    
    989
    +          W16  -> fsLit "llvm.x86.bmi.pext.32"
    
    968 990
               W32  -> fsLit "llvm.x86.bmi.pext.32"
    
    969 991
               W64  -> fsLit "llvm.x86.bmi.pext.64"
    
    970 992
               W128 -> fsLit "llvm.x86.bmi.pext.128"
    

  • testsuite/tests/llvm/should_run/T26065.hs
    1
    +{-# LANGUAGE MagicHash #-}
    
    2
    +{-# LANGUAGE ScopedTypeVariables #-}
    
    3
    +import Data.Char (toUpper)
    
    4
    +import GHC.Exts
    
    5
    +import GHC.Word
    
    6
    +import Numeric (showHex)
    
    7
    +
    
    8
    +pdep8 :: Word8 -> Word8 -> Word8
    
    9
    +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
    
    10
    +{-# NOINLINE pdep8 #-}
    
    11
    +
    
    12
    +pdep16 :: Word16 -> Word16 -> Word16
    
    13
    +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
    
    14
    +{-# NOINLINE pdep16 #-}
    
    15
    +
    
    16
    +pdep32 :: Word32 -> Word32 -> Word32
    
    17
    +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
    
    18
    +{-# NOINLINE pdep32 #-}
    
    19
    +
    
    20
    +pdep64 :: Word64 -> Word64 -> Word64
    
    21
    +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
    
    22
    +{-# NOINLINE pdep64 #-}
    
    23
    +
    
    24
    +pext8 :: Word8 -> Word8 -> Word8
    
    25
    +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
    
    26
    +{-# NOINLINE pext8 #-}
    
    27
    +
    
    28
    +pext16 :: Word16 -> Word16 -> Word16
    
    29
    +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
    
    30
    +{-# NOINLINE pext16 #-}
    
    31
    +
    
    32
    +pext32 :: Word32 -> Word32 -> Word32
    
    33
    +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
    
    34
    +{-# NOINLINE pext32 #-}
    
    35
    +
    
    36
    +pext64 :: Word64 -> Word64 -> Word64
    
    37
    +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
    
    38
    +{-# NOINLINE pext64 #-}
    
    39
    +
    
    40
    +valueSource :: Integral i => i
    
    41
    +valueSource = fromInteger 0xA7F7A7F7A7F7A7F7
    
    42
    +
    
    43
    +valueMask   :: Integral i => i
    
    44
    +valueMask   = fromInteger 0x5555555555555555
    
    45
    +
    
    46
    +printIntrinsicCall :: forall i. Integral i => String -> (i -> i -> i) -> IO ()
    
    47
    +printIntrinsicCall label f =
    
    48
    +  let op1 = valueSource
    
    49
    +      op2 = valueMask
    
    50
    +      pad s =
    
    51
    +          let hex :: Integral a => a -> String
    
    52
    +              hex = flip showHex ""
    
    53
    +              str = toUpper <$> hex s
    
    54
    +              len = length $ hex (maxBound :: Word64)
    
    55
    +              n   = length str
    
    56
    +          in  "0x" <> replicate (len - n) '0' <> str
    
    57
    +  in  putStrLn $ unwords [ label, pad op1, pad op2, "=", pad (f op1 op2) ]
    
    58
    +
    
    59
    +main :: IO ()
    
    60
    +main = do
    
    61
    +  printIntrinsicCall "pdep8 " pdep8
    
    62
    +  printIntrinsicCall "pdep16" pdep16
    
    63
    +  printIntrinsicCall "pdep32" pdep32
    
    64
    +  printIntrinsicCall "pdep64" pdep64
    
    65
    +  printIntrinsicCall "pext8 " pext8
    
    66
    +  printIntrinsicCall "pext16" pext16
    
    67
    +  printIntrinsicCall "pext32" pext32
    
    68
    +  printIntrinsicCall "pext64" pext64

  • testsuite/tests/llvm/should_run/T26065.stdout
    1
    +pdep8  0x00000000000000F7 0x0000000000000055 = 0x0000000000000015
    
    2
    +pdep16 0x000000000000A7F7 0x0000000000005555 = 0x0000000000005515
    
    3
    +pdep32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000044155515
    
    4
    +pdep64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x4415551544155515
    
    5
    +pext8  0x00000000000000F7 0x0000000000000055 = 0x000000000000000F
    
    6
    +pext16 0x000000000000A7F7 0x0000000000005555 = 0x000000000000003F
    
    7
    +pext32 0x00000000A7F7A7F7 0x0000000055555555 = 0x0000000000003F3F
    
    8
    +pext64 0xA7F7A7F7A7F7A7F7 0x5555555555555555 = 0x000000003F3F3F3F

  • testsuite/tests/llvm/should_run/all.T
    ... ... @@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a
    18 18
     test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
    
    19 19
       # T25730C.c contains Intel instrinsics, so only run this test on x86
    
    20 20
     test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
    
    21
    +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
    
    22
    +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
    
    23
    +                unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
    
    24
    +                compile_and_run, ['-mbmi2'])
    
    25
    +