Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
-
14f485ee
by ARATA Mizuki at 2026-02-17T09:09:24+09:00
21 changed files:
- compiler/GHC/CmmToAsm/Config.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/Driver/Config/CmmToAsm.hs
- compiler/GHC/Driver/DynFlags.hs
- compiler/GHC/Driver/Pipeline/Execute.hs
- compiler/GHC/Driver/Session.hs
- compiler/GHC/SysTools/Cpp.hs
- docs/users_guide/9.16.1-notes.rst
- docs/users_guide/phases.rst
- docs/users_guide/using.rst
- testsuite/driver/cpu_features.py
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.asm
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.hs
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.asm
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.hs
- + testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.asm
- + testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.hs
- testsuite/tests/simd/should_run/all.T
Changes:
| ... | ... | @@ -31,6 +31,9 @@ data NCGConfig = NCGConfig |
| 31 | 31 | , ncgDoConstantFolding :: !Bool -- ^ Perform CMM constant folding
|
| 32 | 32 | , ncgSseAvxVersion :: Maybe SseAvxVersion -- ^ (x86) SSE and AVX instructions
|
| 33 | 33 | , ncgAvx512fEnabled :: !Bool
|
| 34 | + , ncgAvx512vlEnabled :: !Bool
|
|
| 35 | + , ncgAvx512bwEnabled :: !Bool
|
|
| 36 | + , ncgAvx512dqEnabled :: !Bool
|
|
| 34 | 37 | , ncgBmiVersion :: Maybe BmiVersion -- ^ (x86) BMI instructions
|
| 35 | 38 | , ncgDumpRegAllocStages :: !Bool
|
| 36 | 39 | , ncgDumpAsmStats :: !Bool
|
| ... | ... | @@ -134,6 +134,12 @@ avx2Enabled = do |
| 134 | 134 | config <- getConfig
|
| 135 | 135 | return (ncgSseAvxVersion config >= Just AVX2)
|
| 136 | 136 | |
| 137 | +avx512vlEnabled :: NatM Bool
|
|
| 138 | +avx512vlEnabled = ncgAvx512vlEnabled <$> getConfig
|
|
| 139 | + |
|
| 140 | +avx512dqEnabled :: NatM Bool
|
|
| 141 | +avx512dqEnabled = ncgAvx512dqEnabled <$> getConfig
|
|
| 142 | + |
|
| 137 | 143 | cmmTopCodeGen
|
| 138 | 144 | :: RawCmmDecl
|
| 139 | 145 | -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]
|
| ... | ... | @@ -1314,6 +1320,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps |
| 1314 | 1320 | sse4_1 <- sse4_1Enabled
|
| 1315 | 1321 | sse4_2 <- sse4_2Enabled
|
| 1316 | 1322 | avx <- avxEnabled
|
| 1323 | + avx512vl <- avx512vlEnabled
|
|
| 1324 | + avx512dq <- avx512dqEnabled
|
|
| 1317 | 1325 | case mop of
|
| 1318 | 1326 | MO_F_Eq _ -> condFltReg is32Bit EQQ x y
|
| 1319 | 1327 | MO_F_Ne _ -> condFltReg is32Bit NE x y
|
| ... | ... | @@ -1432,57 +1440,76 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps |
| 1432 | 1440 | MO_V_Sub l w | l * widthInBits w == 128 -> vector_int_op_sse PSUB l w x y
|
| 1433 | 1441 | | otherwise -> needLlvm mop
|
| 1434 | 1442 | MO_V_Mul 16 W8 -> vector_int8x16_mul_sse2 x y
|
| 1435 | - MO_V_Mul l@8 w@W16 -> vector_int_op_sse PMULL l w x y -- PMULLW (SSE2)
|
|
| 1436 | - MO_V_Mul l@4 w@W32 | sse4_1 -> vector_int_op_sse PMULL l w x y -- PMULLD (SSE4.1)
|
|
| 1443 | + MO_V_Mul l@8 w@W16 | avx -> vector_int_op_avx VPMULL l w x y -- VPMULLW (AVX)
|
|
| 1444 | + | otherwise -> vector_int_op_sse PMULL l w x y -- PMULLW (SSE2)
|
|
| 1445 | + MO_V_Mul l@4 w@W32 | avx -> vector_int_op_avx VPMULL l w x y -- VPMULLD (AVX)
|
|
| 1446 | + | sse4_1 -> vector_int_op_sse PMULL l w x y -- PMULLD (SSE4.1)
|
|
| 1437 | 1447 | | otherwise -> vector_int32x4_mul_sse2 x y
|
| 1438 | - MO_V_Mul 2 W64 -> vector_int64x2_mul_sse2 x y
|
|
| 1448 | + MO_V_Mul l@2 w@W64 | avx512dq && avx512vl -> vector_int_op_avx VPMULL l w x y -- VPMULLQ (AVX512DQ+VL)
|
|
| 1449 | + | otherwise -> vector_int64x2_mul_sse2 x y
|
|
| 1439 | 1450 | MO_V_Mul {} -> needLlvm mop
|
| 1440 | 1451 | |
| 1441 | 1452 | MO_VU_Min l@16 w@W8
|
| 1442 | - -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUB (SSE2)
|
|
| 1453 | + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUB (AVX)
|
|
| 1454 | + | otherwise -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUB (SSE2)
|
|
| 1443 | 1455 | MO_VU_Min l@8 w@W16
|
| 1456 | + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUW (AVX)
|
|
| 1444 | 1457 | | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUW (SSE4.1)
|
| 1445 | 1458 | | otherwise -> vector_word_minmax_sse Min l w x y
|
| 1446 | 1459 | MO_VU_Min l@4 w@W32
|
| 1460 | + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUD (AVX)
|
|
| 1447 | 1461 | | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUD (SSE4.1)
|
| 1448 | 1462 | | otherwise -> vector_word_minmax_sse Min l w x y
|
| 1449 | 1463 | MO_VU_Min l@2 w@W64
|
| 1464 | + | avx512vl -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUQ (AVX512F+VL)
|
|
| 1450 | 1465 | | sse4_2 -> vector_word_minmax_sse Min l w x y -- PCMPGTQ requires SSE4.2
|
| 1451 | 1466 | -- The SSE2 version is implemented as a C call (MO_W64X2_Min)
|
| 1452 | 1467 | MO_VU_Min {} -> needLlvm mop
|
| 1453 | 1468 | MO_VU_Max l@16 w@W8
|
| 1454 | - -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUB (SSE2)
|
|
| 1469 | + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUB (AVX)
|
|
| 1470 | + | otherwise -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUB (SSE2)
|
|
| 1455 | 1471 | MO_VU_Max l@8 w@W16
|
| 1472 | + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUW (AVX)
|
|
| 1456 | 1473 | | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUW (SSE4.1)
|
| 1457 | 1474 | | otherwise -> vector_word_minmax_sse Max l w x y
|
| 1458 | 1475 | MO_VU_Max l@4 w@W32
|
| 1476 | + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUD (AVX)
|
|
| 1459 | 1477 | | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUD (SSE4.1)
|
| 1460 | 1478 | | otherwise -> vector_word_minmax_sse Max l w x y
|
| 1461 | 1479 | MO_VU_Max l@2 w@W64
|
| 1480 | + | avx512vl -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUQ (AVX512F+VL)
|
|
| 1462 | 1481 | | sse4_2 -> vector_word_minmax_sse Max l w x y -- PCMPGTQ requires SSE4.2
|
| 1463 | 1482 | -- The SSE2 version is implemented as a C call (MO_W64X2_Max)
|
| 1464 | 1483 | MO_VU_Max {} -> needLlvm mop
|
| 1465 | 1484 | MO_VS_Min l@16 w@W8
|
| 1485 | + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSB (AVX)
|
|
| 1466 | 1486 | | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSB (SSE4.1)
|
| 1467 | 1487 | | otherwise -> vector_int_minmax_sse Min l w x y
|
| 1468 | 1488 | MO_VS_Min l@8 w@W16
|
| 1469 | - -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSW (SSE2)
|
|
| 1489 | + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSW (AVX)
|
|
| 1490 | + | otherwise -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSW (SSE2)
|
|
| 1470 | 1491 | MO_VS_Min l@4 w@W32
|
| 1492 | + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSD (AVX)
|
|
| 1471 | 1493 | | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSD (SSE4.1)
|
| 1472 | 1494 | | otherwise -> vector_int_minmax_sse Min l w x y
|
| 1473 | 1495 | MO_VS_Min l@2 w@W64
|
| 1496 | + | avx512vl -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSQ (AVX512F+VL)
|
|
| 1474 | 1497 | | sse4_2 -> vector_int_minmax_sse Min l w x y -- PCMPGTQ requires SSE4.2
|
| 1475 | 1498 | -- The SSE2 version is implemented as a C call (MO_I64X2_Min)
|
| 1476 | 1499 | MO_VS_Min {} -> needLlvm mop
|
| 1477 | 1500 | MO_VS_Max l@16 w@W8
|
| 1501 | + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSB (AVX)
|
|
| 1478 | 1502 | | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSB (SSE4.1)
|
| 1479 | 1503 | | otherwise -> vector_int_minmax_sse Max l w x y
|
| 1480 | 1504 | MO_VS_Max l@8 w@W16
|
| 1481 | - -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSW (SSE2)
|
|
| 1505 | + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSW (AVX)
|
|
| 1506 | + | otherwise -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSW (SSE2)
|
|
| 1482 | 1507 | MO_VS_Max l@4 w@W32
|
| 1508 | + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSD (AVX)
|
|
| 1483 | 1509 | | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSD (SSE4.1)
|
| 1484 | 1510 | | otherwise -> vector_int_minmax_sse Max l w x y
|
| 1485 | 1511 | MO_VS_Max l@2 w@W64
|
| 1512 | + | avx512vl -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSQ (AVX512F+VL)
|
|
| 1486 | 1513 | | sse4_2 -> vector_int_minmax_sse Max l w x y -- PCMPGTQ requires SSE4.2
|
| 1487 | 1514 | -- The SSE2 version is implemented as a C call (MO_I64X2_Max)
|
| 1488 | 1515 | MO_VS_Max {} -> needLlvm mop
|
| ... | ... | @@ -1975,7 +2002,6 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps |
| 1975 | 2002 | (PUNPCKLDQ format (OpReg tmpOdd1) dst) -- dst <- (dst[0],tmpOdd1[0],dst[1],tmpOdd1[1])
|
| 1976 | 2003 | return (Any format code)
|
| 1977 | 2004 | |
| 1978 | - -- TODO: We could use `VPMULLQ` if AVX-512 or AVX10.1 is available.
|
|
| 1979 | 2005 | vector_int64x2_mul_sse2 :: CmmExpr -> CmmExpr -> NatM Register
|
| 1980 | 2006 | vector_int64x2_mul_sse2 expr1 expr2 = do
|
| 1981 | 2007 | -- implement 64 bit multiplication using 32-bit PMULUDQ multiplication instructions
|
| ... | ... | @@ -338,6 +338,7 @@ data Instr |
| 338 | 338 | | PADD Format Operand Reg
|
| 339 | 339 | | PSUB Format Operand Reg
|
| 340 | 340 | | PMULL Format Operand Reg
|
| 341 | + | VPMULL Format Operand Reg Reg
|
|
| 341 | 342 | | PMULUDQ Format Operand Reg
|
| 342 | 343 | |
| 343 | 344 | -- SIMD compare
|
| ... | ... | @@ -601,6 +602,7 @@ regUsageOfInstr platform instr |
| 601 | 602 | PADD fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
|
| 602 | 603 | PSUB fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
|
| 603 | 604 | PMULL fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
|
| 605 | + VPMULL fmt s1 s2 dst -> mkRU (use_R fmt s1 [mk fmt s2]) [mk fmt dst]
|
|
| 604 | 606 | PMULUDQ fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
|
| 605 | 607 | |
| 606 | 608 | PCMPGT fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
|
| ... | ... | @@ -912,6 +914,7 @@ patchRegsOfInstr platform instr env |
| 912 | 914 | PADD fmt src dst -> PADD fmt (patchOp src) (env dst)
|
| 913 | 915 | PSUB fmt src dst -> PSUB fmt (patchOp src) (env dst)
|
| 914 | 916 | PMULL fmt src dst -> PMULL fmt (patchOp src) (env dst)
|
| 917 | + VPMULL fmt s1 s2 dst -> VPMULL fmt (patchOp s1) (env s2) (env dst)
|
|
| 915 | 918 | PMULUDQ fmt src dst -> PMULUDQ fmt (patchOp src) (env dst)
|
| 916 | 919 | |
| 917 | 920 | PCMPGT fmt src dst -> PCMPGT fmt (patchOp src) (env dst)
|
| ... | ... | @@ -1012,6 +1012,8 @@ pprInstr platform i = case i of |
| 1012 | 1012 | -> pprFormatOpReg (text "psub") format src dst
|
| 1013 | 1013 | PMULL format src dst
|
| 1014 | 1014 | -> pprFormatOpReg (text "pmull") format src dst
|
| 1015 | + VPMULL format s1 s2 dst
|
|
| 1016 | + -> pprFormatOpRegReg (text "vpmull") format s1 s2 dst
|
|
| 1015 | 1017 | PMULUDQ format src dst
|
| 1016 | 1018 | -> pprOpReg (text "pmuludq") format src dst
|
| 1017 | 1019 | PCMPGT format src dst
|
| ... | ... | @@ -1574,7 +1576,8 @@ pprInstr platform i = case i of |
| 1574 | 1576 | pprMinMax wantV minOrMax mmTy fmt regs
|
| 1575 | 1577 | = line $ hcat ( instr : intersperse comma ( map ( pprOperand platform fmt ) regs ) )
|
| 1576 | 1578 | where
|
| 1577 | - instr = (if wantV then text "v" else empty)
|
|
| 1579 | + instr = char '\t'
|
|
| 1580 | + <> (if wantV then text "v" else empty)
|
|
| 1578 | 1581 | <> (case mmTy of { IntVecMinMax {} -> text "p"; FloatMinMax -> empty })
|
| 1579 | 1582 | <> (case minOrMax of { Min -> text "min"; Max -> text "max" })
|
| 1580 | 1583 | <> (case mmTy of { IntVecMinMax wantSigned -> if wantSigned then text "s" else text "u"; FloatMinMax -> empty })
|
| ... | ... | @@ -65,6 +65,9 @@ initNCGConfig dflags this_mod = NCGConfig |
| 65 | 65 | ArchX86 -> v
|
| 66 | 66 | _ -> Nothing
|
| 67 | 67 | , ncgAvx512fEnabled = isAvx512fEnabled dflags
|
| 68 | + , ncgAvx512vlEnabled = isAvx512vlEnabled dflags
|
|
| 69 | + , ncgAvx512bwEnabled = isAvx512bwEnabled dflags
|
|
| 70 | + , ncgAvx512dqEnabled = isAvx512dqEnabled dflags
|
|
| 68 | 71 | |
| 69 | 72 | , ncgLa664Enabled = isLa664Enabled dflags
|
| 70 | 73 |
| ... | ... | @@ -83,11 +83,15 @@ module GHC.Driver.DynFlags ( |
| 83 | 83 | isSse4_2Enabled,
|
| 84 | 84 | isAvxEnabled,
|
| 85 | 85 | isAvx2Enabled,
|
| 86 | + isAvx512bwEnabled,
|
|
| 86 | 87 | isAvx512cdEnabled,
|
| 88 | + isAvx512dqEnabled,
|
|
| 87 | 89 | isAvx512erEnabled,
|
| 88 | 90 | isAvx512fEnabled,
|
| 89 | 91 | isAvx512pfEnabled,
|
| 92 | + isAvx512vlEnabled,
|
|
| 90 | 93 | isFmaEnabled,
|
| 94 | + isGfniEnabled,
|
|
| 91 | 95 | isBmiEnabled,
|
| 92 | 96 | isBmi2Enabled,
|
| 93 | 97 | -- For LoongArch platform
|
| ... | ... | @@ -454,12 +458,16 @@ data DynFlags = DynFlags { |
| 454 | 458 | -- | Machine dependent flags (-m\<blah> stuff)
|
| 455 | 459 | sseAvxVersion :: Maybe SseAvxVersion,
|
| 456 | 460 | bmiVersion :: Maybe BmiVersion,
|
| 457 | - avx512cd :: Bool, -- Enable AVX-512 Conflict Detection Instructions.
|
|
| 458 | - avx512er :: Bool, -- Enable AVX-512 Exponential and Reciprocal Instructions.
|
|
| 459 | - avx512f :: Bool, -- Enable AVX-512 instructions.
|
|
| 460 | - avx512pf :: Bool, -- Enable AVX-512 PreFetch Instructions.
|
|
| 461 | + avx512bw :: Bool, -- ^ Enable AVX-512BW Instructions.
|
|
| 462 | + avx512cd :: Bool, -- ^ Enable AVX-512 Conflict Detection Instructions.
|
|
| 463 | + avx512dq :: Bool, -- ^ Enable AVX-512DQ Instructions.
|
|
| 464 | + avx512er :: Bool, -- ^ Enable AVX-512 Exponential and Reciprocal Instructions.
|
|
| 465 | + avx512f :: Bool, -- ^ Enable AVX-512 instructions.
|
|
| 466 | + avx512pf :: Bool, -- ^ Enable AVX-512 PreFetch Instructions.
|
|
| 467 | + avx512vl :: Bool, -- ^ Enable AVX-512VL Instructions.
|
|
| 461 | 468 | fma :: Bool, -- ^ Enable FMA instructions.
|
| 462 | - la664 :: Bool, -- Enable LA664 instructions
|
|
| 469 | + gfni :: Bool, -- ^ Enable GFNI Instructions.
|
|
| 470 | + la664 :: Bool, -- ^ Enable LA664 instructions
|
|
| 463 | 471 | |
| 464 | 472 | -- Constants used to control the amount of optimization done.
|
| 465 | 473 | |
| ... | ... | @@ -737,12 +745,16 @@ defaultDynFlags mySettings = |
| 737 | 745 | interactivePrint = Nothing,
|
| 738 | 746 | sseAvxVersion = Nothing,
|
| 739 | 747 | bmiVersion = Nothing,
|
| 748 | + avx512bw = False,
|
|
| 740 | 749 | avx512cd = False,
|
| 750 | + avx512dq = False,
|
|
| 741 | 751 | avx512er = False,
|
| 742 | 752 | avx512f = False,
|
| 743 | 753 | avx512pf = False,
|
| 754 | + avx512vl = False,
|
|
| 744 | 755 | -- Use FMA by default on AArch64
|
| 745 | 756 | fma = (platformArch . sTargetPlatform $ mySettings) == ArchAArch64,
|
| 757 | + gfni = False,
|
|
| 746 | 758 | -- For LoongArch, la464 is used by default.
|
| 747 | 759 | la664 = False,
|
| 748 | 760 | |
| ... | ... | @@ -1616,18 +1628,27 @@ isAvxEnabled dflags = sseAvxVersion dflags >= Just AVX1 || (isX86 && fma dflags) |
| 1616 | 1628 | isAvx2Enabled :: DynFlags -> Bool
|
| 1617 | 1629 | isAvx2Enabled dflags = sseAvxVersion dflags >= Just AVX2 || isAvx512fEnabled dflags
|
| 1618 | 1630 | |
| 1631 | +isAvx512bwEnabled :: DynFlags -> Bool
|
|
| 1632 | +isAvx512bwEnabled dflags = avx512bw dflags
|
|
| 1633 | + |
|
| 1619 | 1634 | isAvx512cdEnabled :: DynFlags -> Bool
|
| 1620 | 1635 | isAvx512cdEnabled dflags = avx512cd dflags
|
| 1621 | 1636 | |
| 1637 | +isAvx512dqEnabled :: DynFlags -> Bool
|
|
| 1638 | +isAvx512dqEnabled dflags = avx512dq dflags
|
|
| 1639 | + |
|
| 1622 | 1640 | isAvx512erEnabled :: DynFlags -> Bool
|
| 1623 | 1641 | isAvx512erEnabled dflags = avx512er dflags
|
| 1624 | 1642 | |
| 1625 | 1643 | isAvx512fEnabled :: DynFlags -> Bool
|
| 1626 | -isAvx512fEnabled dflags = avx512f dflags || avx512cd dflags || avx512er dflags || avx512pf dflags
|
|
| 1644 | +isAvx512fEnabled dflags = avx512f dflags || avx512bw dflags || avx512cd dflags || avx512dq dflags || avx512er dflags || avx512pf dflags || avx512vl dflags
|
|
| 1627 | 1645 | |
| 1628 | 1646 | isAvx512pfEnabled :: DynFlags -> Bool
|
| 1629 | 1647 | isAvx512pfEnabled dflags = avx512pf dflags
|
| 1630 | 1648 | |
| 1649 | +isAvx512vlEnabled :: DynFlags -> Bool
|
|
| 1650 | +isAvx512vlEnabled dflags = avx512vl dflags
|
|
| 1651 | + |
|
| 1631 | 1652 | isFmaEnabled :: DynFlags -> Bool
|
| 1632 | 1653 | isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags)
|
| 1633 | 1654 | where
|
| ... | ... | @@ -1637,6 +1658,9 @@ isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags) |
| 1637 | 1658 | ArchX86 -> True
|
| 1638 | 1659 | _ -> False
|
| 1639 | 1660 | |
| 1661 | +isGfniEnabled :: DynFlags -> Bool
|
|
| 1662 | +isGfniEnabled dflags = gfni dflags
|
|
| 1663 | + |
|
| 1640 | 1664 | {- Note [Implications between X86 CPU feature flags]
|
| 1641 | 1665 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| 1642 | 1666 | Many X86 CPU feature flags (such as -mavx, -mfma or -msse4) imply other
|
| ... | ... | @@ -1649,7 +1673,7 @@ structures: |
| 1649 | 1673 | together with other implications such as
|
| 1650 | 1674 | |
| 1651 | 1675 | 3. FMA -> AVX
|
| 1652 | - 4. AVX512{CD,ED,PF} -> AVX512F -> AVX2
|
|
| 1676 | + 4. AVX512{BW,CD,DQ,ER,PF,VL} -> AVX512F -> AVX2
|
|
| 1653 | 1677 | |
| 1654 | 1678 | |
| 1655 | 1679 | We handle this as follows:
|
| ... | ... | @@ -984,13 +984,17 @@ llvmOptions llvm_config llvm_version dflags = |
| 984 | 984 | -- It may become deprecated in a future LLVM version, though.
|
| 985 | 985 | ++ ["+avx2" | isAvx2Enabled dflags ]
|
| 986 | 986 | ++ ["+avx" | isAvxEnabled dflags ]
|
| 987 | + ++ ["+avx512bw"| isAvx512bwEnabled dflags ]
|
|
| 987 | 988 | ++ ["+avx512cd"| isAvx512cdEnabled dflags ]
|
| 989 | + ++ ["+avx512dq"| isAvx512dqEnabled dflags ]
|
|
| 988 | 990 | ++ ["+avx512er"| isAvx512erEnabled dflags ]
|
| 989 | 991 | ++ ["+avx512pf"| isAvx512pfEnabled dflags ]
|
| 990 | - -- For Arch64 +fma is not a option (it's unconditionally available).
|
|
| 992 | + ++ ["+avx512vl"| isAvx512vlEnabled dflags ]
|
|
| 993 | + -- For AArch64 +fma is not an option (it's unconditionally available).
|
|
| 991 | 994 | ++ ["+fma" | isFmaEnabled dflags && (arch /= ArchAArch64) ]
|
| 992 | 995 | ++ ["+bmi" | isBmiEnabled dflags ]
|
| 993 | 996 | ++ ["+bmi2" | isBmi2Enabled dflags ]
|
| 997 | + ++ ["+gfni" | isGfniEnabled dflags ]
|
|
| 994 | 998 | |
| 995 | 999 | abi :: String
|
| 996 | 1000 | abi = case platformArch (targetPlatform dflags) of
|
| ... | ... | @@ -212,11 +212,15 @@ module GHC.Driver.Session ( |
| 212 | 212 | isBmi2Enabled,
|
| 213 | 213 | isAvxEnabled,
|
| 214 | 214 | isAvx2Enabled,
|
| 215 | + isAvx512bwEnabled,
|
|
| 215 | 216 | isAvx512cdEnabled,
|
| 217 | + isAvx512dqEnabled,
|
|
| 216 | 218 | isAvx512erEnabled,
|
| 217 | 219 | isAvx512fEnabled,
|
| 218 | 220 | isAvx512pfEnabled,
|
| 221 | + isAvx512vlEnabled,
|
|
| 219 | 222 | isFmaEnabled,
|
| 223 | + isGfniEnabled,
|
|
| 220 | 224 | |
| 221 | 225 | -- LoongArch: ISA version: la664, la464(default)
|
| 222 | 226 | isLa664Enabled,
|
| ... | ... | @@ -1723,14 +1727,17 @@ dynamic_flags_deps = [ |
| 1723 | 1727 | d { sseAvxVersion = max (Just AVX1) (sseAvxVersion d) }))
|
| 1724 | 1728 | , make_ord_flag defGhcFlag "mavx2" (noArg (\d ->
|
| 1725 | 1729 | d { sseAvxVersion = max (Just AVX2) (sseAvxVersion d) }))
|
| 1726 | - , make_ord_flag defGhcFlag "mavx512cd" (noArg (\d ->
|
|
| 1727 | - d { avx512cd = True }))
|
|
| 1728 | - , make_ord_flag defGhcFlag "mavx512er" (noArg (\d ->
|
|
| 1729 | - d { avx512er = True }))
|
|
| 1730 | + , make_ord_flag defGhcFlag "mavx512bw" (noArg (\d -> d { avx512bw = True }))
|
|
| 1731 | + , make_ord_flag defGhcFlag "mavx512cd" (noArg (\d -> d { avx512cd = True }))
|
|
| 1732 | + , make_ord_flag defGhcFlag "mavx512dq" (noArg (\d -> d { avx512dq = True }))
|
|
| 1733 | + , make_dep_flag defGhcFlag "mavx512er" (noArg (\d -> d { avx512er = True }))
|
|
| 1734 | + "AVX-512ER was only available on Xeon Phi"
|
|
| 1730 | 1735 | , make_ord_flag defGhcFlag "mavx512f" (noArg (\d -> d { avx512f = True }))
|
| 1731 | - , make_ord_flag defGhcFlag "mavx512pf" (noArg (\d ->
|
|
| 1732 | - d { avx512pf = True }))
|
|
| 1736 | + , make_dep_flag defGhcFlag "mavx512pf" (noArg (\d -> d { avx512pf = True }))
|
|
| 1737 | + "AVX-512PF was only available on Xeon Phi"
|
|
| 1738 | + , make_ord_flag defGhcFlag "mavx512vl" (noArg (\d -> d { avx512vl = True }))
|
|
| 1733 | 1739 | , make_ord_flag defGhcFlag "mfma" (noArg (\d -> d { fma = True }))
|
| 1740 | + , make_ord_flag defGhcFlag "mgfni" (noArg (\d -> d { gfni = True }))
|
|
| 1734 | 1741 | |
| 1735 | 1742 | |
| 1736 | 1743 | , make_ord_flag defGhcFlag "mla664" (noArg (\d -> d { la664 = True }))
|
| ... | ... | @@ -165,10 +165,16 @@ doCpp logger tmpfs dflags unit_env opts input_fn output_fn = do |
| 165 | 165 | let avx_defs =
|
| 166 | 166 | [ "-D__AVX__" | isAvxEnabled dflags ] ++
|
| 167 | 167 | [ "-D__AVX2__" | isAvx2Enabled dflags ] ++
|
| 168 | + [ "-D__AVX512BW__" | isAvx512bwEnabled dflags ] ++
|
|
| 168 | 169 | [ "-D__AVX512CD__" | isAvx512cdEnabled dflags ] ++
|
| 170 | + [ "-D__AVX512DQ__" | isAvx512dqEnabled dflags ] ++
|
|
| 169 | 171 | [ "-D__AVX512ER__" | isAvx512erEnabled dflags ] ++
|
| 170 | 172 | [ "-D__AVX512F__" | isAvx512fEnabled dflags ] ++
|
| 171 | - [ "-D__AVX512PF__" | isAvx512pfEnabled dflags ]
|
|
| 173 | + [ "-D__AVX512PF__" | isAvx512pfEnabled dflags ] ++
|
|
| 174 | + [ "-D__AVX512VL__" | isAvx512vlEnabled dflags ]
|
|
| 175 | + |
|
| 176 | + let gfni_def =
|
|
| 177 | + [ "-D__GFNI__" | isGfniEnabled dflags ]
|
|
| 172 | 178 | |
| 173 | 179 | backend_defs <- applyCDefs (backendCDefs $ backend dflags) logger dflags
|
| 174 | 180 | |
| ... | ... | @@ -209,6 +215,7 @@ doCpp logger tmpfs dflags unit_env opts input_fn output_fn = do |
| 209 | 215 | ++ map GHC.SysTools.Option sse_defs
|
| 210 | 216 | ++ map GHC.SysTools.Option fma_def
|
| 211 | 217 | ++ map GHC.SysTools.Option avx_defs
|
| 218 | + ++ map GHC.SysTools.Option gfni_def
|
|
| 212 | 219 | ++ map GHC.SysTools.Option io_manager_defs
|
| 213 | 220 | ++ mb_macro_include
|
| 214 | 221 | ++ line_pragmas
|
| ... | ... | @@ -101,6 +101,9 @@ to |
| 101 | 101 | |
| 102 | 102 | See :ghc-ticket:`25345`.
|
| 103 | 103 | |
| 104 | +- Add several options for x86 extensions: :ghc-flag:`-mavx512bw`,
|
|
| 105 | + :ghc-flag:`-mavx512dq`, :ghc-flag:`-mavx512vl`, and :ghc-flag:`-mgfni`.
|
|
| 106 | + |
|
| 104 | 107 | GHCi
|
| 105 | 108 | ~~~~
|
| 106 | 109 |
| ... | ... | @@ -553,8 +553,10 @@ SIMD macros |
| 553 | 553 | These are defined conditionally based on the SIMD
|
| 554 | 554 | flags used for compilation:
|
| 555 | 555 | |
| 556 | - ``__SSE__``, ``__SSE2__``, ``__SSE4_2__``, ``__FMA__``,
|
|
| 557 | - ``__AVX__``, ``__AVX2__``, ``__AVX512CD__``, ``__AVX512ER__``, ``__AVX512F__``, ``__AVX512PF__``,
|
|
| 556 | + ``__SSE__``, ``__SSE2__``, ``__SSE3__``, ``__SSSE3__``,
|
|
| 557 | + ``__SSE4_1__``, ``__SSE4_2__``, ``__FMA__``, ``__AVX__``, ``__AVX2__``,
|
|
| 558 | + ``__AVX512BW__``, ``__AVX512CD__``, ``__AVX512DQ__``, ``__AVX512ER__``,
|
|
| 559 | + ``__AVX512F__``, ``__AVX512PF__``, ``__AVX512VL__``, ``__GFNI__``
|
|
| 558 | 560 | |
| 559 | 561 | .. _cpp-string-gaps:
|
| 560 | 562 |
| ... | ... | @@ -1601,7 +1601,7 @@ Some flags only make sense for particular target platforms. |
| 1601 | 1601 | :implies: :ghc-flag:`-msse4.2`
|
| 1602 | 1602 | |
| 1603 | 1603 | (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
| 1604 | - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX instructions.
|
|
| 1604 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX instructions.
|
|
| 1605 | 1605 | |
| 1606 | 1606 | .. ghc-flag:: -mavx2
|
| 1607 | 1607 | :shortdesc: (x86 only) Enable support for AVX2 SIMD extensions
|
| ... | ... | @@ -1611,47 +1611,84 @@ Some flags only make sense for particular target platforms. |
| 1611 | 1611 | :implies: :ghc-flag:`-mavx`
|
| 1612 | 1612 | |
| 1613 | 1613 | (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
| 1614 | - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX2 instructions.
|
|
| 1614 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX2 instructions.
|
|
| 1615 | + |
|
| 1616 | +.. ghc-flag:: -mavx512bw
|
|
| 1617 | + :shortdesc: (x86 only) Enable support for AVX-512BW SIMD extensions
|
|
| 1618 | + :type: dynamic
|
|
| 1619 | + :category: platform-options
|
|
| 1620 | + |
|
| 1621 | + :since: 9.16.1
|
|
| 1622 | + :implies: :ghc-flag:`-mavx512f`
|
|
| 1623 | + |
|
| 1624 | + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
|
| 1625 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512BW instructions.
|
|
| 1615 | 1626 | |
| 1616 | 1627 | .. ghc-flag:: -mavx512cd
|
| 1617 | - :shortdesc: (x86 only) Enable support for AVX512-CD SIMD extensions
|
|
| 1628 | + :shortdesc: (x86 only) Enable support for AVX-512CD SIMD extensions
|
|
| 1618 | 1629 | :type: dynamic
|
| 1619 | 1630 | :category: platform-options
|
| 1620 | 1631 | |
| 1621 | 1632 | :implies: :ghc-flag:`-mavx512f`
|
| 1622 | 1633 | |
| 1623 | 1634 | (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
| 1624 | - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-CD instructions.
|
|
| 1635 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512CD instructions.
|
|
| 1636 | + |
|
| 1637 | +.. ghc-flag:: -mavx512dq
|
|
| 1638 | + :shortdesc: (x86 only) Enable support for AVX-512DQ SIMD extensions
|
|
| 1639 | + :type: dynamic
|
|
| 1640 | + :category: platform-options
|
|
| 1641 | + |
|
| 1642 | + :since: 9.16.1
|
|
| 1643 | + :implies: :ghc-flag:`-mavx512f`
|
|
| 1644 | + |
|
| 1645 | + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
|
| 1646 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512DQ instructions.
|
|
| 1625 | 1647 | |
| 1626 | 1648 | .. ghc-flag:: -mavx512er
|
| 1627 | - :shortdesc: (x86 only) Enable support for AVX512-ER SIMD extensions
|
|
| 1649 | + :shortdesc: (x86 only, deprecated) Enable support for AVX-512ER SIMD extensions
|
|
| 1628 | 1650 | :type: dynamic
|
| 1629 | 1651 | :category: platform-options
|
| 1630 | 1652 | |
| 1631 | 1653 | :implies: :ghc-flag:`-mavx512f`
|
| 1632 | 1654 | |
| 1633 | 1655 | (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
| 1634 | - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-ER instructions.
|
|
| 1656 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512ER instructions.
|
|
| 1657 | + |
|
| 1658 | + The AVX-512ER extension is deprecated and not supported by newer LLVM versions.
|
|
| 1635 | 1659 | |
| 1636 | 1660 | .. ghc-flag:: -mavx512f
|
| 1637 | - :shortdesc: (x86 only) Enable support for AVX512-F SIMD extensions
|
|
| 1661 | + :shortdesc: (x86 only) Enable support for AVX-512F SIMD extensions
|
|
| 1638 | 1662 | :type: dynamic
|
| 1639 | 1663 | :category: platform-options
|
| 1640 | 1664 | |
| 1641 | 1665 | :implies: :ghc-flag:`-mavx2`, :ghc-flag:`-mfma`
|
| 1642 | 1666 | |
| 1643 | 1667 | (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
| 1644 | - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-F instructions.
|
|
| 1668 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512F instructions.
|
|
| 1645 | 1669 | |
| 1646 | 1670 | .. ghc-flag:: -mavx512pf
|
| 1647 | - :shortdesc: (x86 only) Enable support for AVX512-PF SIMD extensions
|
|
| 1671 | + :shortdesc: (x86 only, deprecated) Enable support for AVX-512PF SIMD extensions
|
|
| 1648 | 1672 | :type: dynamic
|
| 1649 | 1673 | :category: platform-options
|
| 1650 | 1674 | |
| 1651 | 1675 | :implies: :ghc-flag:`-mavx512f`
|
| 1652 | 1676 | |
| 1653 | 1677 | (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
| 1654 | - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-PF instructions.
|
|
| 1678 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512PF instructions.
|
|
| 1679 | + |
|
| 1680 | + The AVX-512PF extension is deprecated and not supported by newer LLVM versions.
|
|
| 1681 | + |
|
| 1682 | +.. ghc-flag:: -mavx512vl
|
|
| 1683 | + :shortdesc: (x86 only) Enable support for AVX-512VL SIMD extensions
|
|
| 1684 | + :type: dynamic
|
|
| 1685 | + :category: platform-options
|
|
| 1686 | + |
|
| 1687 | + :since: 9.16.1
|
|
| 1688 | + :implies: :ghc-flag:`-mavx512f`
|
|
| 1689 | + |
|
| 1690 | + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
|
| 1691 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512VL instructions.
|
|
| 1655 | 1692 | |
| 1656 | 1693 | .. ghc-flag:: -msse
|
| 1657 | 1694 | :shortdesc: (x86 only) Use SSE for floating-point operations
|
| ... | ... | @@ -1714,13 +1751,13 @@ Some flags only make sense for particular target platforms. |
| 1714 | 1751 | or the :ref:`LLVM backend <llvm-code-gen>`).
|
| 1715 | 1752 | |
| 1716 | 1753 | .. ghc-flag:: -msse4
|
| 1717 | - :shortdesc: (x86 only) Use SSE4 for floating-point operations
|
|
| 1754 | + :shortdesc: (x86 only) Use SSE4.1 for floating-point operations
|
|
| 1718 | 1755 | :type: dynamic
|
| 1719 | 1756 | :category: platform-options
|
| 1720 | 1757 | |
| 1721 | 1758 | :implies: :ghc-flag:`-mssse3`
|
| 1722 | 1759 | |
| 1723 | - (x86 only) Use the SSE4 instruction set to
|
|
| 1760 | + (x86 only) Use the SSE4.1 instruction set to
|
|
| 1724 | 1761 | implement some floating point and bit operations (whether using the :ref:`native code generator <native-code-gen>`
|
| 1725 | 1762 | or the :ref:`LLVM backend <llvm-code-gen>`).
|
| 1726 | 1763 | |
| ... | ... | @@ -1781,6 +1818,16 @@ Some flags only make sense for particular target platforms. |
| 1781 | 1818 | multiply-add, which might perform non-IEEE-compliant software emulation on
|
| 1782 | 1819 | some platforms (depending on the implementation of the C standard library).
|
| 1783 | 1820 | |
| 1821 | +.. ghc-flag:: -mgfni
|
|
| 1822 | + :shortdesc: (x86 only) Use GFNI for advanced bit manipulations
|
|
| 1823 | + :type: dynamic
|
|
| 1824 | + :category: platform-options
|
|
| 1825 | + |
|
| 1826 | + :since: 9.16.1
|
|
| 1827 | + |
|
| 1828 | + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
|
|
| 1829 | + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 GFNI instructions.
|
|
| 1830 | + |
|
| 1784 | 1831 | .. ghc-flag:: -mla664
|
| 1785 | 1832 | :shortdesc: (LoongArch only) Used for new instructions for la664 uarch
|
| 1786 | 1833 | :type: dynamic
|
| ... | ... | @@ -9,9 +9,9 @@ SUPPORTED_CPU_FEATURES = { |
| 9 | 9 | |
| 10 | 10 | # x86:
|
| 11 | 11 | 'sse', 'sse2', 'sse3', 'pni', 'ssse3', 'sse4_1', 'sse4_2',
|
| 12 | - 'avx', 'avx2', 'avx512f',
|
|
| 12 | + 'avx', 'avx2', 'avx512f', 'avx512vl', 'avx512bw', 'avx512dq',
|
|
| 13 | 13 | 'fma',
|
| 14 | - 'popcnt', 'bmi1', 'bmi2'
|
|
| 14 | + 'popcnt', 'bmi1', 'bmi2', 'gfni',
|
|
| 15 | 15 | }
|
| 16 | 16 | |
| 17 | 17 | cpu_feature_cache = None
|
| ... | ... | @@ -17,3 +17,9 @@ test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip), |
| 17 | 17 | when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2'])
|
| 18 | 18 | test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip),
|
| 19 | 19 | when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx'])
|
| 20 | +test('avx512-int64-mul', [unless(arch('x86_64'), skip),
|
|
| 21 | + when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512dq -mavx512vl'])
|
|
| 22 | +test('avx512-int64-minmax', [unless(arch('x86_64'), skip),
|
|
| 23 | + when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512vl'])
|
|
| 24 | +test('avx512-word64-minmax', [unless(arch('x86_64'), skip),
|
|
| 25 | + when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512vl']) |
| 1 | +vpminsq
|
|
| 2 | +vpmaxsq |
| 1 | +{-# LANGUAGE BangPatterns #-}
|
|
| 2 | +{-# LANGUAGE ExtendedLiterals #-}
|
|
| 3 | +{-# LANGUAGE MagicHash #-}
|
|
| 4 | +{-# LANGUAGE UnboxedTuples #-}
|
|
| 5 | +import GHC.Exts
|
|
| 6 | +import GHC.Prim
|
|
| 7 | +import GHC.Int
|
|
| 8 | + |
|
| 9 | +{-# NOINLINE f #-}
|
|
| 10 | +f :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2#
|
|
| 11 | +f x y z = minInt64X2# x (plusInt64X2# y z)
|
|
| 12 | + |
|
| 13 | +{-# NOINLINE g #-}
|
|
| 14 | +g :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2#
|
|
| 15 | +g x y z = maxInt64X2# x (plusInt64X2# y z)
|
|
| 16 | + |
|
| 17 | +main :: IO ()
|
|
| 18 | +main = do
|
|
| 19 | + let !x = packInt64X2# (# 1#Int64, 10#Int64 #)
|
|
| 20 | + !y = packInt64X2# (# 4#Int64, 2#Int64 #)
|
|
| 21 | + !z = broadcastInt64X2# 5#Int64
|
|
| 22 | + !w = f x y z
|
|
| 23 | + (# w0, w1 #) = unpackInt64X2# w
|
|
| 24 | + !v = g x y z
|
|
| 25 | + (# v0, v1 #) = unpackInt64X2# v
|
|
| 26 | + print (I64# w0, I64# w1)
|
|
| 27 | + print (I64# v0, I64# v1) |
| 1 | +vpmullq |
| 1 | +{-# LANGUAGE BangPatterns #-}
|
|
| 2 | +{-# LANGUAGE ExtendedLiterals #-}
|
|
| 3 | +{-# LANGUAGE MagicHash #-}
|
|
| 4 | +{-# LANGUAGE UnboxedTuples #-}
|
|
| 5 | +import GHC.Exts
|
|
| 6 | +import GHC.Int
|
|
| 7 | + |
|
| 8 | +{-# NOINLINE f #-}
|
|
| 9 | +f :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2#
|
|
| 10 | +f x y z = timesInt64X2# x (plusInt64X2# y z)
|
|
| 11 | + |
|
| 12 | +main :: IO ()
|
|
| 13 | +main = do
|
|
| 14 | + let !x = packInt64X2# (# 1#Int64, 3#Int64 #)
|
|
| 15 | + !y = packInt64X2# (# 4#Int64, 2#Int64 #)
|
|
| 16 | + !z = broadcastInt64X2# 5#Int64
|
|
| 17 | + !w = f x y z
|
|
| 18 | + (# w0, w1 #) = unpackInt64X2# w
|
|
| 19 | + print (I64# w0, I64# w1) |
| 1 | +vpminuq
|
|
| 2 | +vpmaxuq |
| 1 | +{-# LANGUAGE BangPatterns #-}
|
|
| 2 | +{-# LANGUAGE ExtendedLiterals #-}
|
|
| 3 | +{-# LANGUAGE MagicHash #-}
|
|
| 4 | +{-# LANGUAGE UnboxedTuples #-}
|
|
| 5 | +import GHC.Exts
|
|
| 6 | +import GHC.Prim
|
|
| 7 | +import GHC.Word
|
|
| 8 | + |
|
| 9 | +{-# NOINLINE f #-}
|
|
| 10 | +f :: Word64X2# -> Word64X2# -> Word64X2# -> Word64X2#
|
|
| 11 | +f x y z = minWord64X2# x (plusWord64X2# y z)
|
|
| 12 | + |
|
| 13 | +{-# NOINLINE g #-}
|
|
| 14 | +g :: Word64X2# -> Word64X2# -> Word64X2# -> Word64X2#
|
|
| 15 | +g x y z = maxWord64X2# x (plusWord64X2# y z)
|
|
| 16 | + |
|
| 17 | +main :: IO ()
|
|
| 18 | +main = do
|
|
| 19 | + let !x = packWord64X2# (# 1#Word64, 10#Word64 #)
|
|
| 20 | + !y = packWord64X2# (# 4#Word64, 2#Word64 #)
|
|
| 21 | + !z = broadcastWord64X2# 5#Word64
|
|
| 22 | + !w = f x y z
|
|
| 23 | + (# w0, w1 #) = unpackWord64X2# w
|
|
| 24 | + !v = g x y z
|
|
| 25 | + (# v0, v1 #) = unpackWord64X2# v
|
|
| 26 | + print (W64# w0, W64# w1)
|
|
| 27 | + print (W64# v0, W64# v1) |
| ... | ... | @@ -66,6 +66,9 @@ setTestOpts( |
| 66 | 66 | , when(have_cpu_feature('avx'), extra_hc_opts('-mavx'))
|
| 67 | 67 | , when(have_cpu_feature('avx2'), extra_hc_opts('-mavx2'))
|
| 68 | 68 | , when(have_cpu_feature('avx512f'), extra_hc_opts('-mavx512f'))
|
| 69 | + , when(have_cpu_feature('avx512vl'), extra_hc_opts('-mavx512vl'))
|
|
| 70 | + , when(have_cpu_feature('avx512bw'), extra_hc_opts('-mavx512bw'))
|
|
| 71 | + , when(have_cpu_feature('avx512dq'), extra_hc_opts('-mavx512dq'))
|
|
| 69 | 72 | ])
|
| 70 | 73 | |
| 71 | 74 | test('simd000', [], compile_and_run, [''])
|