recursion-ninja pushed to branch wip/fix-26065 at Glasgow Haskell Compiler / GHC

Commits:

4 changed files:

Changes:

  • compiler/GHC/CmmToLlvm/CodeGen.hs
    ... ... @@ -240,12 +240,22 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args =
    240 240
         genCallSimpleCast w op dst args
    
    241 241
     genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
    
    242 242
         genCallSimpleCast w op dst args
    
    243
    -genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
    
    244
    -    genCallSimpleCast w op dst args
    
    245
    -genCall (PrimTarget op@(MO_Pext w)) [dst] args =
    
    246
    -    genCallSimpleCast w op dst args
    
    247 243
     genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
    
    248 244
         genCallSimpleCast w op dst args
    
    245
    +-- Check if the Intel BMI are enabled, and if the bit-width is less than 'W32'.
    
    246
    +-- If so, we truncate to the 'W32' call the 32-bit intrinsic operation because
    
    247
    +-- LLVM does not expose a call to 'PDep' and 'PExt' operations for bit-widths
    
    248
    +-- of 'W8 and 'W16'.
    
    249
    +genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
    
    250
    +    cfg <- getConfig
    
    251
    +    if   llvmCgBmiVersion cfg >= Just BMI2
    
    252
    +    then genCallMinimumTruncationCast W32 w op dst args
    
    253
    +    else genCallSimpleCast w op dst args
    
    254
    +genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
    
    255
    +    cfg <- getConfig
    
    256
    +    if   llvmCgBmiVersion cfg >= Just BMI2
    
    257
    +    then genCallMinimumTruncationCast W32 w op dst args
    
    258
    +    else genCallSimpleCast w op dst args
    
    249 259
     
    
    250 260
     genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
    
    251 261
         addrVar <- exprToVarW addr
    
    ... ... @@ -641,8 +651,12 @@ genCallExtract _ _ _ _ =
    641 651
     -- from i32 to i8 explicitly as LLVM is strict about types.
    
    642 652
     genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
    
    643 653
                       -> LlvmM StmtData
    
    644
    -genCallSimpleCast specW op dst args = do
    
    645
    -    let width   = widthToLlvmInt specW
    
    654
    +genCallSimpleCast w = genCallMinimumTruncationCast w w
    
    655
    +
    
    656
    +genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
    
    657
    +                             -> [CmmActual] -> LlvmM StmtData
    
    658
    +genCallMinimumTruncationCast minW specW op dst args = do
    
    659
    +    let width   = widthToLlvmInt $ max minW specW
    
    646 660
             argsW   = const width <$> args
    
    647 661
             dstType = cmmToLlvmType $ localRegType dst
    
    648 662
             signage = cmmPrimOpRetValSignage op
    
    ... ... @@ -945,17 +959,24 @@ cmmPrimOpFunctions mop = do
    945 959
           W256 -> fsLit "llvm.cttz.i256"
    
    946 960
           W512 -> fsLit "llvm.cttz.i512"
    
    947 961
         MO_Pdep w
    
    962
    +      -- If the Intel BMI are enabled, then we will be calling the intrinsic operation
    
    963
    +      -- through the LLVM binding, unless however the bit-width is 'W8' or 'W16'.
    
    964
    +      -- In these cases, we truncate to the 'W32' bit-width and /directly/ call the
    
    965
    +      -- 32-bit BMI operation. This is necessary because the LLVM does not expose a
    
    966
    +      -- call to the 'PDep' and 'PExt' operation for bit-diths of 'W8 and 'W16'.
    
    967
    +      -- Hence the necessity to to call the BMI intrinsic operation directlky from
    
    968
    +      -- outside the LLVM.
    
    948 969
           | isBmi2Enabled -> case w of
    
    949
    -          W8   -> fsLit "llvm.x86.bmi.pdep.8"
    
    950
    -          W16  -> fsLit "llvm.x86.bmi.pdep.16"
    
    970
    +          W8   -> fsLit "llvm.x86.bmi.pdep.32"
    
    971
    +          W16  -> fsLit "llvm.x86.bmi.pdep.32"
    
    951 972
               W32  -> fsLit "llvm.x86.bmi.pdep.32"
    
    952 973
               W64  -> fsLit "llvm.x86.bmi.pdep.64"
    
    953 974
               W128 -> fsLit "llvm.x86.bmi.pdep.128"
    
    954 975
               W256 -> fsLit "llvm.x86.bmi.pdep.256"
    
    955 976
               W512 -> fsLit "llvm.x86.bmi.pdep.512"
    
    956 977
           | otherwise -> case w of
    
    957
    -          W8   -> fsLit "hs_pdep8"
    
    958
    -          W16  -> fsLit "hs_pdep16"
    
    978
    +          W8   -> fsLit "hs_pdep32"
    
    979
    +          W16  -> fsLit "hs_pdep32"
    
    959 980
               W32  -> fsLit "hs_pdep32"
    
    960 981
               W64  -> fsLit "hs_pdep64"
    
    961 982
               W128 -> fsLit "hs_pdep128"
    
    ... ... @@ -963,8 +984,10 @@ cmmPrimOpFunctions mop = do
    963 984
               W512 -> fsLit "hs_pdep512"
    
    964 985
         MO_Pext w
    
    965 986
           | isBmi2Enabled -> case w of
    
    966
    -          W8   -> fsLit "llvm.x86.bmi.pext.8"
    
    967
    -          W16  -> fsLit "llvm.x86.bmi.pext.16"
    
    987
    +          -- See the 'Mo_Pdep' commentary above as to why we call 'pext.32'
    
    988
    +          -- instead of calling 'pext.8' or 'pext.16' operations.
    
    989
    +          W8   -> fsLit "llvm.x86.bmi.pext.32"
    
    990
    +          W16  -> fsLit "llvm.x86.bmi.pext.32"
    
    968 991
               W32  -> fsLit "llvm.x86.bmi.pext.32"
    
    969 992
               W64  -> fsLit "llvm.x86.bmi.pext.64"
    
    970 993
               W128 -> fsLit "llvm.x86.bmi.pext.128"
    

  • testsuite/tests/llvm/should_run/T26065.hs
    1
    +{-# LANGUAGE MagicHash #-}
    
    2
    +import GHC.Exts
    
    3
    +import GHC.Word
    
    4
    +
    
    5
    +pdep8 :: Word8 -> Word8 -> Word8
    
    6
    +pdep8 (W8# a) (W8# b) = W8# (wordToWord8# (pdep8# (word8ToWord# a) (word8ToWord# b)))
    
    7
    +{-# NOINLINE pdep8 #-}
    
    8
    +
    
    9
    +pdep16 :: Word16 -> Word16 -> Word16
    
    10
    +pdep16 (W16# a) (W16# b) = W16# (wordToWord16# (pdep16# (word16ToWord# a) (word16ToWord# b)))
    
    11
    +{-# NOINLINE pdep16 #-}
    
    12
    +
    
    13
    +pdep32 :: Word32 -> Word32 -> Word32
    
    14
    +pdep32 (W32# a) (W32# b) = W32# (wordToWord32# (pdep32# (word32ToWord# a) (word32ToWord# b)))
    
    15
    +{-# NOINLINE pdep32 #-}
    
    16
    +
    
    17
    +pdep64 :: Word64 -> Word64 -> Word64
    
    18
    +pdep64 (W64# a) (W64# b) = W64# (pdep64# a b)
    
    19
    +{-# NOINLINE pdep64 #-}
    
    20
    +
    
    21
    +pext8 :: Word8 -> Word8 -> Word8
    
    22
    +pext8 (W8# a) (W8# b) = W8# (wordToWord8# (pext8# (word8ToWord# a) (word8ToWord# b)))
    
    23
    +{-# NOINLINE pext8 #-}
    
    24
    +
    
    25
    +pext16 :: Word16 -> Word16 -> Word16
    
    26
    +pext16 (W16# a) (W16# b) = W16# (wordToWord16# (pext16# (word16ToWord# a) (word16ToWord# b)))
    
    27
    +{-# NOINLINE pext16 #-}
    
    28
    +
    
    29
    +pext32 :: Word32 -> Word32 -> Word32
    
    30
    +pext32 (W32# a) (W32# b) = W32# (wordToWord32# (pext32# (word32ToWord# a) (word32ToWord# b)))
    
    31
    +{-# NOINLINE pext32 #-}
    
    32
    +
    
    33
    +pext64 :: Word64 -> Word64 -> Word64
    
    34
    +pext64 (W64# a) (W64# b) = W64# (pext64# a b)
    
    35
    +{-# NOINLINE pext64 #-}
    
    36
    +
    
    37
    +main :: IO ()
    
    38
    +main = do
    
    39
    +  putStr "pdep8:\t"  *> print (pdep8  7 3)
    
    40
    +  putStr "pdep16:\t" *> print (pdep16 7 3)
    
    41
    +  putStr "pdep32:\t" *> print (pdep32 7 3)
    
    42
    +  putStr "pdep64:\t" *> print (pdep64 7 3)
    
    43
    +  putStr "pext8:\t"  *> print (pext8  7 3)
    
    44
    +  putStr "pext16:\t" *> print (pext16 7 3)
    
    45
    +  putStr "pext32:\t" *> print (pext32 7 3)
    
    46
    +  putStr "pext64:\t" *> print (pext64 7 3)

  • testsuite/tests/llvm/should_run/T26065.stdout
    1
    +pdep8:	3
    
    2
    +pdep16:	3
    
    3
    +pdep32:	3
    
    4
    +pdep64:	3
    
    5
    +pext8:	3
    
    6
    +pext16:	3
    
    7
    +pext32:	3
    
    8
    +pext64:	3

  • testsuite/tests/llvm/should_run/all.T
    ... ... @@ -18,3 +18,8 @@ test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a
    18 18
     test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
    
    19 19
       # T25730C.c contains Intel instrinsics, so only run this test on x86
    
    20 20
     test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
    
    21
    +# T26065.c tests LLVM linking of Intel instrinsics, so only run this test on x86
    
    22
    +test('T26065', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"])),
    
    23
    +                unless((arch('x86_64') or arch('i386')) and have_cpu_feature('bmi2'),skip)],
    
    24
    +                compile_and_run, ['-mbmi2'])
    
    25
    +