| ... |
... |
@@ -240,12 +240,22 @@ genCall (PrimTarget op@(MO_BRev w)) [dst] args = |
|
240
|
240
|
genCallSimpleCast w op dst args
|
|
241
|
241
|
genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
|
|
242
|
242
|
genCallSimpleCast w op dst args
|
|
243
|
|
-genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
|
|
244
|
|
- genCallSimpleCast w op dst args
|
|
245
|
|
-genCall (PrimTarget op@(MO_Pext w)) [dst] args =
|
|
246
|
|
- genCallSimpleCast w op dst args
|
|
247
|
243
|
genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
|
|
248
|
244
|
genCallSimpleCast w op dst args
|
|
|
245
|
+-- Check whether the Intel BMI2 instructions are enabled and the bit-width is less than 'W32'.
|
|
|
246
|
+-- If so, we truncate to 'W32' and call the 32-bit intrinsic operation because
|
|
|
247
|
+-- LLVM does not expose a call to 'PDep' and 'PExt' operations for bit-widths
|
|
|
248
|
+-- of 'W8' and 'W16'.
|
|
|
249
|
+genCall (PrimTarget op@(MO_Pdep w)) [dst] args = do
|
|
|
250
|
+ cfg <- getConfig
|
|
|
251
|
+ if llvmCgBmiVersion cfg >= Just BMI2
|
|
|
252
|
+ then genCallMinimumTruncationCast W32 w op dst args
|
|
|
253
|
+ else genCallSimpleCast w op dst args
|
|
|
254
|
+genCall (PrimTarget op@(MO_Pext w)) [dst] args = do
|
|
|
255
|
+ cfg <- getConfig
|
|
|
256
|
+ if llvmCgBmiVersion cfg >= Just BMI2
|
|
|
257
|
+ then genCallMinimumTruncationCast W32 w op dst args
|
|
|
258
|
+ else genCallSimpleCast w op dst args
|
|
249
|
259
|
|
|
250
|
260
|
genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
|
|
251
|
261
|
addrVar <- exprToVarW addr
|
| ... |
... |
@@ -641,8 +651,12 @@ genCallExtract _ _ _ _ = |
|
641
|
651
|
-- from i32 to i8 explicitly as LLVM is strict about types.
|
|
642
|
652
|
genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
|
|
643
|
653
|
-> LlvmM StmtData
|
|
644
|
|
-genCallSimpleCast specW op dst args = do
|
|
645
|
|
- let width = widthToLlvmInt specW
|
|
|
654
|
+genCallSimpleCast w = genCallMinimumTruncationCast w w
|
|
|
655
|
+
|
|
|
656
|
+genCallMinimumTruncationCast :: Width -> Width -> CallishMachOp -> CmmFormal
|
|
|
657
|
+ -> [CmmActual] -> LlvmM StmtData
|
|
|
658
|
+genCallMinimumTruncationCast minW specW op dst args = do
|
|
|
659
|
+ let width = widthToLlvmInt $ max minW specW
|
|
646
|
660
|
argsW = const width <$> args
|
|
647
|
661
|
dstType = cmmToLlvmType $ localRegType dst
|
|
648
|
662
|
signage = cmmPrimOpRetValSignage op
|
| ... |
... |
@@ -945,17 +959,24 @@ cmmPrimOpFunctions mop = do |
|
945
|
959
|
W256 -> fsLit "llvm.cttz.i256"
|
|
946
|
960
|
W512 -> fsLit "llvm.cttz.i512"
|
|
947
|
961
|
MO_Pdep w
|
|
|
962
|
+ -- If the Intel BMI are enabled, then we will be calling the intrinsic operation
|
|
|
963
|
+ -- through the LLVM binding, unless however the bit-width is 'W8' or 'W16'.
|
|
|
964
|
+ -- In these cases, we truncate to the 'W32' bit-width and /directly/ call the
|
|
|
965
|
+ -- 32-bit BMI operation. This is necessary because the LLVM does not expose a
|
|
|
966
|
+ -- call to the 'PDep' and 'PExt' operations for bit-widths of 'W8' and 'W16'.
|
|
|
967
|
+ -- Hence the necessity to call the BMI intrinsic operation directly from
|
|
|
968
|
+ -- outside the LLVM.
|
|
948
|
969
|
| isBmi2Enabled -> case w of
|
|
949
|
|
- W8 -> fsLit "llvm.x86.bmi.pdep.8"
|
|
950
|
|
- W16 -> fsLit "llvm.x86.bmi.pdep.16"
|
|
|
970
|
+ W8 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
|
971
|
+ W16 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
951
|
972
|
W32 -> fsLit "llvm.x86.bmi.pdep.32"
|
|
952
|
973
|
W64 -> fsLit "llvm.x86.bmi.pdep.64"
|
|
953
|
974
|
W128 -> fsLit "llvm.x86.bmi.pdep.128"
|
|
954
|
975
|
W256 -> fsLit "llvm.x86.bmi.pdep.256"
|
|
955
|
976
|
W512 -> fsLit "llvm.x86.bmi.pdep.512"
|
|
956
|
977
|
| otherwise -> case w of
|
|
957
|
|
- W8 -> fsLit "hs_pdep8"
|
|
958
|
|
- W16 -> fsLit "hs_pdep16"
|
|
|
978
|
+ W8 -> fsLit "hs_pdep32"
|
|
|
979
|
+ W16 -> fsLit "hs_pdep32"
|
|
959
|
980
|
W32 -> fsLit "hs_pdep32"
|
|
960
|
981
|
W64 -> fsLit "hs_pdep64"
|
|
961
|
982
|
W128 -> fsLit "hs_pdep128"
|
| ... |
... |
@@ -963,8 +984,10 @@ cmmPrimOpFunctions mop = do |
|
963
|
984
|
W512 -> fsLit "hs_pdep512"
|
|
964
|
985
|
MO_Pext w
|
|
965
|
986
|
| isBmi2Enabled -> case w of
|
|
966
|
|
- W8 -> fsLit "llvm.x86.bmi.pext.8"
|
|
967
|
|
- W16 -> fsLit "llvm.x86.bmi.pext.16"
|
|
|
987
|
+ -- See the 'MO_Pdep' commentary above as to why we call 'pext.32'
|
|
|
988
|
+ -- instead of calling 'pext.8' or 'pext.16' operations.
|
|
|
989
|
+ W8 -> fsLit "llvm.x86.bmi.pext.32"
|
|
|
990
|
+ W16 -> fsLit "llvm.x86.bmi.pext.32"
|
|
968
|
991
|
W32 -> fsLit "llvm.x86.bmi.pext.32"
|
|
969
|
992
|
W64 -> fsLit "llvm.x86.bmi.pext.64"
|
|
970
|
993
|
W128 -> fsLit "llvm.x86.bmi.pext.128"
|