Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC Commits: 14f485ee by ARATA Mizuki at 2026-02-17T09:09:24+09:00 Support more x86 extensions: AVX-512 {BW,DQ,VL} and GFNI Also, mark AVX-512 ER and PF as deprecated. AVX-512 instructions can be used for certain 64-bit integer vector operations. GFNI can be used to implement bitReverse (currently not used by NCG, but LLVM may use it). Closes #26406 Addresses #26509 - - - - - 21 changed files: - compiler/GHC/CmmToAsm/Config.hs - compiler/GHC/CmmToAsm/X86/CodeGen.hs - compiler/GHC/CmmToAsm/X86/Instr.hs - compiler/GHC/CmmToAsm/X86/Ppr.hs - compiler/GHC/Driver/Config/CmmToAsm.hs - compiler/GHC/Driver/DynFlags.hs - compiler/GHC/Driver/Pipeline/Execute.hs - compiler/GHC/Driver/Session.hs - compiler/GHC/SysTools/Cpp.hs - docs/users_guide/9.16.1-notes.rst - docs/users_guide/phases.rst - docs/users_guide/using.rst - testsuite/driver/cpu_features.py - testsuite/tests/codeGen/should_gen_asm/all.T - + testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.asm - + testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.hs - + testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.asm - + testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.hs - + testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.asm - + testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.hs - testsuite/tests/simd/should_run/all.T Changes: ===================================== compiler/GHC/CmmToAsm/Config.hs ===================================== @@ -31,6 +31,9 @@ data NCGConfig = NCGConfig , ncgDoConstantFolding :: !Bool -- ^ Perform CMM constant folding , ncgSseAvxVersion :: Maybe SseAvxVersion -- ^ (x86) SSE and AVX instructions , ncgAvx512fEnabled :: !Bool + , ncgAvx512vlEnabled :: !Bool + , ncgAvx512bwEnabled :: !Bool + , ncgAvx512dqEnabled :: !Bool , ncgBmiVersion :: Maybe BmiVersion -- ^ (x86) BMI instructions , ncgDumpRegAllocStages :: !Bool , ncgDumpAsmStats :: !Bool ===================================== 
compiler/GHC/CmmToAsm/X86/CodeGen.hs ===================================== @@ -134,6 +134,12 @@ avx2Enabled = do config <- getConfig return (ncgSseAvxVersion config >= Just AVX2) +avx512vlEnabled :: NatM Bool +avx512vlEnabled = ncgAvx512vlEnabled <$> getConfig + +avx512dqEnabled :: NatM Bool +avx512dqEnabled = ncgAvx512dqEnabled <$> getConfig + cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr] @@ -1314,6 +1320,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps sse4_1 <- sse4_1Enabled sse4_2 <- sse4_2Enabled avx <- avxEnabled + avx512vl <- avx512vlEnabled + avx512dq <- avx512dqEnabled case mop of MO_F_Eq _ -> condFltReg is32Bit EQQ x y MO_F_Ne _ -> condFltReg is32Bit NE x y @@ -1432,57 +1440,76 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps MO_V_Sub l w | l * widthInBits w == 128 -> vector_int_op_sse PSUB l w x y | otherwise -> needLlvm mop MO_V_Mul 16 W8 -> vector_int8x16_mul_sse2 x y - MO_V_Mul l@8 w@W16 -> vector_int_op_sse PMULL l w x y -- PMULLW (SSE2) - MO_V_Mul l@4 w@W32 | sse4_1 -> vector_int_op_sse PMULL l w x y -- PMULLD (SSE4.1) + MO_V_Mul l@8 w@W16 | avx -> vector_int_op_avx VPMULL l w x y -- VPMULLW (AVX) + | otherwise -> vector_int_op_sse PMULL l w x y -- PMULLW (SSE2) + MO_V_Mul l@4 w@W32 | avx -> vector_int_op_avx VPMULL l w x y -- VPMULLD (AVX) + | sse4_1 -> vector_int_op_sse PMULL l w x y -- PMULLD (SSE4.1) | otherwise -> vector_int32x4_mul_sse2 x y - MO_V_Mul 2 W64 -> vector_int64x2_mul_sse2 x y + MO_V_Mul l@2 w@W64 | avx512dq && avx512vl -> vector_int_op_avx VPMULL l w x y -- VPMULLQ (AVX512DQ+VL) + | otherwise -> vector_int64x2_mul_sse2 x y MO_V_Mul {} -> needLlvm mop MO_VU_Min l@16 w@W8 - -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUB (SSE2) + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUB (AVX) + | otherwise -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUB (SSE2) 
MO_VU_Min l@8 w@W16 + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUW (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUW (SSE4.1) | otherwise -> vector_word_minmax_sse Min l w x y MO_VU_Min l@4 w@W32 + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUD (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUD (SSE4.1) | otherwise -> vector_word_minmax_sse Min l w x y MO_VU_Min l@2 w@W64 + | avx512vl -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUQ (AVX512F+VL) | sse4_2 -> vector_word_minmax_sse Min l w x y -- PCMPGTQ requires SSE4.2 -- The SSE2 version is implemented as a C call (MO_W64X2_Min) MO_VU_Min {} -> needLlvm mop MO_VU_Max l@16 w@W8 - -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUB (SSE2) + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUB (AVX) + | otherwise -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUB (SSE2) MO_VU_Max l@8 w@W16 + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUW (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUW (SSE4.1) | otherwise -> vector_word_minmax_sse Max l w x y MO_VU_Max l@4 w@W32 + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUD (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUD (SSE4.1) | otherwise -> vector_word_minmax_sse Max l w x y MO_VU_Max l@2 w@W64 + | avx512vl -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUQ (AVX512F+VL) | sse4_2 -> vector_word_minmax_sse Max l w x y -- PCMPGTQ requires SSE4.2 -- The SSE2 version is implemented as a C call (MO_W64X2_Max) MO_VU_Max {} -> needLlvm mop MO_VS_Min l@16 w@W8 + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSB (AVX) | sse4_1 -> vector_int_op_sse 
(MINMAX Min (IntVecMinMax True)) l w x y -- PMINSB (SSE4.1) | otherwise -> vector_int_minmax_sse Min l w x y MO_VS_Min l@8 w@W16 - -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSW (SSE2) + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSW (AVX) + | otherwise -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSW (SSE2) MO_VS_Min l@4 w@W32 + | avx -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSD (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSD (SSE4.1) | otherwise -> vector_int_minmax_sse Min l w x y MO_VS_Min l@2 w@W64 + | avx512vl -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSQ (AVX512F+VL) | sse4_2 -> vector_int_minmax_sse Min l w x y -- PCMPGTQ requires SSE4.2 -- The SSE2 version is implemented as a C call (MO_I64X2_Min) MO_VS_Min {} -> needLlvm mop MO_VS_Max l@16 w@W8 + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSB (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSB (SSE4.1) | otherwise -> vector_int_minmax_sse Max l w x y MO_VS_Max l@8 w@W16 - -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSW (SSE2) + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSW (AVX) + | otherwise -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSW (SSE2) MO_VS_Max l@4 w@W32 + | avx -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSD (AVX) | sse4_1 -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSD (SSE4.1) | otherwise -> vector_int_minmax_sse Max l w x y MO_VS_Max l@2 w@W64 + | avx512vl -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSQ (AVX512F+VL) | sse4_2 -> vector_int_minmax_sse Max l w x y -- PCMPGTQ requires SSE4.2 -- The SSE2 version is implemented as a C call (MO_I64X2_Max) MO_VS_Max {} -> needLlvm mop @@ -1975,7 
+2002,6 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps (PUNPCKLDQ format (OpReg tmpOdd1) dst) -- dst <- (dst[0],tmpOdd1[0],dst[1],tmpOdd1[1]) return (Any format code) - -- TODO: We could use `VPMULLQ` if AVX-512 or AVX10.1 is available. vector_int64x2_mul_sse2 :: CmmExpr -> CmmExpr -> NatM Register vector_int64x2_mul_sse2 expr1 expr2 = do -- implement 64 bit multiplication using 32-bit PMULUDQ multiplication instructions ===================================== compiler/GHC/CmmToAsm/X86/Instr.hs ===================================== @@ -338,6 +338,7 @@ data Instr | PADD Format Operand Reg | PSUB Format Operand Reg | PMULL Format Operand Reg + | VPMULL Format Operand Reg Reg | PMULUDQ Format Operand Reg -- SIMD compare @@ -601,6 +602,7 @@ regUsageOfInstr platform instr PADD fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst] PSUB fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst] PMULL fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst] + VPMULL fmt s1 s2 dst -> mkRU (use_R fmt s1 [mk fmt s2]) [mk fmt dst] PMULUDQ fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst] PCMPGT fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst] @@ -912,6 +914,7 @@ patchRegsOfInstr platform instr env PADD fmt src dst -> PADD fmt (patchOp src) (env dst) PSUB fmt src dst -> PSUB fmt (patchOp src) (env dst) PMULL fmt src dst -> PMULL fmt (patchOp src) (env dst) + VPMULL fmt s1 s2 dst -> VPMULL fmt (patchOp s1) (env s2) (env dst) PMULUDQ fmt src dst -> PMULUDQ fmt (patchOp src) (env dst) PCMPGT fmt src dst -> PCMPGT fmt (patchOp src) (env dst) ===================================== compiler/GHC/CmmToAsm/X86/Ppr.hs ===================================== @@ -1012,6 +1012,8 @@ pprInstr platform i = case i of -> pprFormatOpReg (text "psub") format src dst PMULL format src dst -> pprFormatOpReg (text "pmull") format src dst + VPMULL format s1 s2 dst + -> pprFormatOpRegReg (text "vpmull") format s1 s2 dst PMULUDQ format 
src dst -> pprOpReg (text "pmuludq") format src dst PCMPGT format src dst @@ -1574,7 +1576,8 @@ pprInstr platform i = case i of pprMinMax wantV minOrMax mmTy fmt regs = line $ hcat ( instr : intersperse comma ( map ( pprOperand platform fmt ) regs ) ) where - instr = (if wantV then text "v" else empty) + instr = char '\t' + <> (if wantV then text "v" else empty) <> (case mmTy of { IntVecMinMax {} -> text "p"; FloatMinMax -> empty }) <> (case minOrMax of { Min -> text "min"; Max -> text "max" }) <> (case mmTy of { IntVecMinMax wantSigned -> if wantSigned then text "s" else text "u"; FloatMinMax -> empty }) ===================================== compiler/GHC/Driver/Config/CmmToAsm.hs ===================================== @@ -65,6 +65,9 @@ initNCGConfig dflags this_mod = NCGConfig ArchX86 -> v _ -> Nothing , ncgAvx512fEnabled = isAvx512fEnabled dflags + , ncgAvx512vlEnabled = isAvx512vlEnabled dflags + , ncgAvx512bwEnabled = isAvx512bwEnabled dflags + , ncgAvx512dqEnabled = isAvx512dqEnabled dflags , ncgLa664Enabled = isLa664Enabled dflags ===================================== compiler/GHC/Driver/DynFlags.hs ===================================== @@ -83,11 +83,15 @@ module GHC.Driver.DynFlags ( isSse4_2Enabled, isAvxEnabled, isAvx2Enabled, + isAvx512bwEnabled, isAvx512cdEnabled, + isAvx512dqEnabled, isAvx512erEnabled, isAvx512fEnabled, isAvx512pfEnabled, + isAvx512vlEnabled, isFmaEnabled, + isGfniEnabled, isBmiEnabled, isBmi2Enabled, -- For LoongArch platform @@ -454,12 +458,16 @@ data DynFlags = DynFlags { -- | Machine dependent flags (-m\<blah> stuff) sseAvxVersion :: Maybe SseAvxVersion, bmiVersion :: Maybe BmiVersion, - avx512cd :: Bool, -- Enable AVX-512 Conflict Detection Instructions. - avx512er :: Bool, -- Enable AVX-512 Exponential and Reciprocal Instructions. - avx512f :: Bool, -- Enable AVX-512 instructions. - avx512pf :: Bool, -- Enable AVX-512 PreFetch Instructions. + avx512bw :: Bool, -- ^ Enable AVX-512BW Instructions. 
+ avx512cd :: Bool, -- ^ Enable AVX-512 Conflict Detection Instructions. + avx512dq :: Bool, -- ^ Enable AVX-512DQ Instructions. + avx512er :: Bool, -- ^ Enable AVX-512 Exponential and Reciprocal Instructions. + avx512f :: Bool, -- ^ Enable AVX-512 instructions. + avx512pf :: Bool, -- ^ Enable AVX-512 PreFetch Instructions. + avx512vl :: Bool, -- ^ Enable AVX-512VL Instructions. fma :: Bool, -- ^ Enable FMA instructions. - la664 :: Bool, -- Enable LA664 instructions + gfni :: Bool, -- ^ Enable GFNI Instructions. + la664 :: Bool, -- ^ Enable LA664 instructions -- Constants used to control the amount of optimization done. @@ -737,12 +745,16 @@ defaultDynFlags mySettings = interactivePrint = Nothing, sseAvxVersion = Nothing, bmiVersion = Nothing, + avx512bw = False, avx512cd = False, + avx512dq = False, avx512er = False, avx512f = False, avx512pf = False, + avx512vl = False, -- Use FMA by default on AArch64 fma = (platformArch . sTargetPlatform $ mySettings) == ArchAArch64, + gfni = False, -- For LoongArch, la464 is used by default. 
la664 = False, @@ -1616,18 +1628,27 @@ isAvxEnabled dflags = sseAvxVersion dflags >= Just AVX1 || (isX86 && fma dflags) isAvx2Enabled :: DynFlags -> Bool isAvx2Enabled dflags = sseAvxVersion dflags >= Just AVX2 || isAvx512fEnabled dflags +isAvx512bwEnabled :: DynFlags -> Bool +isAvx512bwEnabled dflags = avx512bw dflags + isAvx512cdEnabled :: DynFlags -> Bool isAvx512cdEnabled dflags = avx512cd dflags +isAvx512dqEnabled :: DynFlags -> Bool +isAvx512dqEnabled dflags = avx512dq dflags + isAvx512erEnabled :: DynFlags -> Bool isAvx512erEnabled dflags = avx512er dflags isAvx512fEnabled :: DynFlags -> Bool -isAvx512fEnabled dflags = avx512f dflags || avx512cd dflags || avx512er dflags || avx512pf dflags +isAvx512fEnabled dflags = avx512f dflags || avx512bw dflags || avx512cd dflags || avx512dq dflags || avx512er dflags || avx512pf dflags || avx512vl dflags isAvx512pfEnabled :: DynFlags -> Bool isAvx512pfEnabled dflags = avx512pf dflags +isAvx512vlEnabled :: DynFlags -> Bool +isAvx512vlEnabled dflags = avx512vl dflags + isFmaEnabled :: DynFlags -> Bool isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags) where @@ -1637,6 +1658,9 @@ isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags) ArchX86 -> True _ -> False +isGfniEnabled :: DynFlags -> Bool +isGfniEnabled dflags = gfni dflags + {- Note [Implications between X86 CPU feature flags] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Many X86 CPU feature flags (such as -mavx, -mfma or -msse4) imply other @@ -1649,7 +1673,7 @@ structures: together with other implications such as 3. FMA -> AVX - 4. AVX512{CD,ED,PF} -> AVX512F -> AVX2 + 4. AVX512{BW,CD,DQ,ER,PF,VL} -> AVX512F -> AVX2 We handle this as follows: ===================================== compiler/GHC/Driver/Pipeline/Execute.hs ===================================== @@ -984,13 +984,17 @@ llvmOptions llvm_config llvm_version dflags = -- It may become deprecated in a future LLVM version, though. 
++ ["+avx2" | isAvx2Enabled dflags ] ++ ["+avx" | isAvxEnabled dflags ] + ++ ["+avx512bw"| isAvx512bwEnabled dflags ] ++ ["+avx512cd"| isAvx512cdEnabled dflags ] + ++ ["+avx512dq"| isAvx512dqEnabled dflags ] ++ ["+avx512er"| isAvx512erEnabled dflags ] ++ ["+avx512pf"| isAvx512pfEnabled dflags ] - -- For Arch64 +fma is not a option (it's unconditionally available). + ++ ["+avx512vl"| isAvx512vlEnabled dflags ] + -- For AArch64 +fma is not an option (it's unconditionally available). ++ ["+fma" | isFmaEnabled dflags && (arch /= ArchAArch64) ] ++ ["+bmi" | isBmiEnabled dflags ] ++ ["+bmi2" | isBmi2Enabled dflags ] + ++ ["+gfni" | isGfniEnabled dflags ] abi :: String abi = case platformArch (targetPlatform dflags) of ===================================== compiler/GHC/Driver/Session.hs ===================================== @@ -212,11 +212,15 @@ module GHC.Driver.Session ( isBmi2Enabled, isAvxEnabled, isAvx2Enabled, + isAvx512bwEnabled, isAvx512cdEnabled, + isAvx512dqEnabled, isAvx512erEnabled, isAvx512fEnabled, isAvx512pfEnabled, + isAvx512vlEnabled, isFmaEnabled, + isGfniEnabled, -- LoongArch: ISA version: la664, la464(default) isLa664Enabled, @@ -1723,14 +1727,17 @@ dynamic_flags_deps = [ d { sseAvxVersion = max (Just AVX1) (sseAvxVersion d) })) , make_ord_flag defGhcFlag "mavx2" (noArg (\d -> d { sseAvxVersion = max (Just AVX2) (sseAvxVersion d) })) - , make_ord_flag defGhcFlag "mavx512cd" (noArg (\d -> - d { avx512cd = True })) - , make_ord_flag defGhcFlag "mavx512er" (noArg (\d -> - d { avx512er = True })) + , make_ord_flag defGhcFlag "mavx512bw" (noArg (\d -> d { avx512bw = True })) + , make_ord_flag defGhcFlag "mavx512cd" (noArg (\d -> d { avx512cd = True })) + , make_ord_flag defGhcFlag "mavx512dq" (noArg (\d -> d { avx512dq = True })) + , make_dep_flag defGhcFlag "mavx512er" (noArg (\d -> d { avx512er = True })) + "AVX-512ER was only available on Xeon Phi" , make_ord_flag defGhcFlag "mavx512f" (noArg (\d -> d { avx512f = True })) - , make_ord_flag defGhcFlag 
"mavx512pf" (noArg (\d -> - d { avx512pf = True })) + , make_dep_flag defGhcFlag "mavx512pf" (noArg (\d -> d { avx512pf = True })) + "AVX-512PF was only available on Xeon Phi" + , make_ord_flag defGhcFlag "mavx512vl" (noArg (\d -> d { avx512vl = True })) , make_ord_flag defGhcFlag "mfma" (noArg (\d -> d { fma = True })) + , make_ord_flag defGhcFlag "mgfni" (noArg (\d -> d { gfni = True })) , make_ord_flag defGhcFlag "mla664" (noArg (\d -> d { la664 = True })) ===================================== compiler/GHC/SysTools/Cpp.hs ===================================== @@ -165,10 +165,16 @@ doCpp logger tmpfs dflags unit_env opts input_fn output_fn = do let avx_defs = [ "-D__AVX__" | isAvxEnabled dflags ] ++ [ "-D__AVX2__" | isAvx2Enabled dflags ] ++ + [ "-D__AVX512BW__" | isAvx512bwEnabled dflags ] ++ [ "-D__AVX512CD__" | isAvx512cdEnabled dflags ] ++ + [ "-D__AVX512DQ__" | isAvx512dqEnabled dflags ] ++ [ "-D__AVX512ER__" | isAvx512erEnabled dflags ] ++ [ "-D__AVX512F__" | isAvx512fEnabled dflags ] ++ - [ "-D__AVX512PF__" | isAvx512pfEnabled dflags ] + [ "-D__AVX512PF__" | isAvx512pfEnabled dflags ] ++ + [ "-D__AVX512VL__" | isAvx512vlEnabled dflags ] + + let gfni_def = + [ "-D__GFNI__" | isGfniEnabled dflags ] backend_defs <- applyCDefs (backendCDefs $ backend dflags) logger dflags @@ -209,6 +215,7 @@ doCpp logger tmpfs dflags unit_env opts input_fn output_fn = do ++ map GHC.SysTools.Option sse_defs ++ map GHC.SysTools.Option fma_def ++ map GHC.SysTools.Option avx_defs + ++ map GHC.SysTools.Option gfni_def ++ map GHC.SysTools.Option io_manager_defs ++ mb_macro_include ++ line_pragmas ===================================== docs/users_guide/9.16.1-notes.rst ===================================== @@ -101,6 +101,9 @@ to See :ghc-ticket:`25345`. +- Add several options for x86 extensions: :ghc-flag:`-mavx512bw`, + :ghc-flag:`-mavx512dq`, :ghc-flag:`-mavx512vl`, and :ghc-flag:`-mgfni`. 
+ GHCi ~~~~ ===================================== docs/users_guide/phases.rst ===================================== @@ -553,8 +553,10 @@ SIMD macros These are defined conditionally based on the SIMD flags used for compilation: - ``__SSE__``, ``__SSE2__``, ``__SSE4_2__``, ``__FMA__``, - ``__AVX__``, ``__AVX2__``, ``__AVX512CD__``, ``__AVX512ER__``, ``__AVX512F__``, ``__AVX512PF__``, + ``__SSE__``, ``__SSE2__``, ``__SSE3__``, ``__SSSE3__``, + ``__SSE4_1__``, ``__SSE4_2__``, ``__FMA__``, ``__AVX__``, ``__AVX2__``, + ``__AVX512BW__``, ``__AVX512CD__``, ``__AVX512DQ__``, ``__AVX512ER__``, + ``__AVX512F__``, ``__AVX512PF__``, ``__AVX512VL__``, ``__GFNI__`` .. _cpp-string-gaps: ===================================== docs/users_guide/using.rst ===================================== @@ -1601,7 +1601,7 @@ Some flags only make sense for particular target platforms. :implies: :ghc-flag:`-msse4.2` (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX instructions. + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX instructions. .. ghc-flag:: -mavx2 :shortdesc: (x86 only) Enable support for AVX2 SIMD extensions @@ -1611,47 +1611,84 @@ Some flags only make sense for particular target platforms. :implies: :ghc-flag:`-mavx` (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX2 instructions. + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX2 instructions. + +.. 
ghc-flag:: -mavx512bw + :shortdesc: (x86 only) Enable support for AVX-512BW SIMD extensions + :type: dynamic + :category: platform-options + + :since: 9.16.1 + :implies: :ghc-flag:`-mavx512f` + + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512BW instructions. .. ghc-flag:: -mavx512cd - :shortdesc: (x86 only) Enable support for AVX512-CD SIMD extensions + :shortdesc: (x86 only) Enable support for AVX-512CD SIMD extensions :type: dynamic :category: platform-options :implies: :ghc-flag:`-mavx512f` (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-CD instructions. + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512CD instructions. + +.. ghc-flag:: -mavx512dq + :shortdesc: (x86 only) Enable support for AVX-512DQ SIMD extensions + :type: dynamic + :category: platform-options + + :since: 9.16.1 + :implies: :ghc-flag:`-mavx512f` + + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512DQ instructions. .. ghc-flag:: -mavx512er - :shortdesc: (x86 only) Enable support for AVX512-ER SIMD extensions + :shortdesc: (x86 only, deprecated) Enable support for AVX-512ER SIMD extensions :type: dynamic :category: platform-options :implies: :ghc-flag:`-mavx512f` (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-ER instructions. + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512ER instructions. + + The AVX-512ER extension is deprecated and not supported by newer LLVM versions. .. 
ghc-flag:: -mavx512f - :shortdesc: (x86 only) Enable support for AVX512-F SIMD extensions + :shortdesc: (x86 only) Enable support for AVX-512F SIMD extensions :type: dynamic :category: platform-options :implies: :ghc-flag:`-mavx2`, :ghc-flag:`-mfma` (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-F instructions. + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512F instructions. .. ghc-flag:: -mavx512pf - :shortdesc: (x86 only) Enable support for AVX512-PF SIMD extensions + :shortdesc: (x86 only, deprecated) Enable support for AVX-512PF SIMD extensions :type: dynamic :category: platform-options :implies: :ghc-flag:`-mavx512f` (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` - or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-PF instructions. + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512PF instructions. + + The AVX-512PF extension is deprecated and not supported by newer LLVM versions. + +.. ghc-flag:: -mavx512vl + :shortdesc: (x86 only) Enable support for AVX-512VL SIMD extensions + :type: dynamic + :category: platform-options + + :since: 9.16.1 + :implies: :ghc-flag:`-mavx512f` + + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512VL instructions. .. ghc-flag:: -msse :shortdesc: (x86 only) Use SSE for floating-point operations @@ -1714,13 +1751,13 @@ Some flags only make sense for particular target platforms. or the :ref:`LLVM backend <llvm-code-gen>`). .. 
ghc-flag:: -msse4 - :shortdesc: (x86 only) Use SSE4 for floating-point operations + :shortdesc: (x86 only) Use SSE4.1 for floating-point operations :type: dynamic :category: platform-options :implies: :ghc-flag:`-mssse3` - (x86 only) Use the SSE4 instruction set to + (x86 only) Use the SSE4.1 instruction set to implement some floating point and bit operations(whether using the :ref:`native code generator <native-code-gen>` or the :ref:`LLVM backend <llvm-code-gen>`). @@ -1781,6 +1818,16 @@ Some flags only make sense for particular target platforms. multiply-add, which might perform non-IEEE-compliant software emulation on some platforms (depending on the implementation of the C standard library). +.. ghc-flag:: -mgfni + :shortdesc: (x86 only) Use GFNI for advanced bit manipulations + :type: dynamic + :category: platform-options + + :since: 9.16.1 + + (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>` + or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 GFNI instructions. + .. 
ghc-flag:: -mla664 :shortdesc: (LoongArch only) Used for new instructions for la664 uarch :type: dynamic ===================================== testsuite/driver/cpu_features.py ===================================== @@ -9,9 +9,9 @@ SUPPORTED_CPU_FEATURES = { # x86: 'sse', 'sse2', 'sse3', 'pni', 'ssse3', 'sse4_1', 'sse4_2', - 'avx', 'avx2', 'avx512f', + 'avx', 'avx2', 'avx512f', 'avx512vl', 'avx512bw', 'avx512dq', 'fma', - 'popcnt', 'bmi1', 'bmi2' + 'popcnt', 'bmi1', 'bmi2', 'gfni', } cpu_feature_cache = None ===================================== testsuite/tests/codeGen/should_gen_asm/all.T ===================================== @@ -17,3 +17,9 @@ test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip), when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2']) test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip), when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx']) +test('avx512-int64-mul', [unless(arch('x86_64'), skip), + when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512dq -mavx512vl']) +test('avx512-int64-minmax', [unless(arch('x86_64'), skip), + when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512vl']) +test('avx512-word64-minmax', [unless(arch('x86_64'), skip), + when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512vl']) ===================================== testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.asm ===================================== @@ -0,0 +1,2 @@ +vpminsq +vpmaxsq ===================================== testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.hs ===================================== @@ -0,0 +1,27 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ExtendedLiterals #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} +import GHC.Exts +import GHC.Prim +import GHC.Int + +{-# NOINLINE f #-} +f :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2# +f x y z = minInt64X2# x 
(plusInt64X2# y z) + +{-# NOINLINE g #-} +g :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2# +g x y z = maxInt64X2# x (plusInt64X2# y z) + +main :: IO () +main = do + let !x = packInt64X2# (# 1#Int64, 10#Int64 #) + !y = packInt64X2# (# 4#Int64, 2#Int64 #) + !z = broadcastInt64X2# 5#Int64 + !w = f x y z + (# w0, w1 #) = unpackInt64X2# w + !v = g x y z + (# v0, v1 #) = unpackInt64X2# v + print (I64# w0, I64# w1) + print (I64# v0, I64# v1) ===================================== testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.asm ===================================== @@ -0,0 +1 @@ +vpmullq ===================================== testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.hs ===================================== @@ -0,0 +1,19 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ExtendedLiterals #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} +import GHC.Exts +import GHC.Int + +{-# NOINLINE f #-} +f :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2# +f x y z = timesInt64X2# x (plusInt64X2# y z) + +main :: IO () +main = do + let !x = packInt64X2# (# 1#Int64, 3#Int64 #) + !y = packInt64X2# (# 4#Int64, 2#Int64 #) + !z = broadcastInt64X2# 5#Int64 + !w = f x y z + (# w0, w1 #) = unpackInt64X2# w + print (I64# w0, I64# w1) ===================================== testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.asm ===================================== @@ -0,0 +1,2 @@ +vpminuq +vpmaxuq ===================================== testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.hs ===================================== @@ -0,0 +1,27 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ExtendedLiterals #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} +import GHC.Exts +import GHC.Prim +import GHC.Word + +{-# NOINLINE f #-} +f :: Word64X2# -> Word64X2# -> Word64X2# -> Word64X2# +f x y z = minWord64X2# x (plusWord64X2# y z) + +{-# NOINLINE g #-} +g :: Word64X2# -> Word64X2# -> Word64X2# -> Word64X2# +g x y z = maxWord64X2# x 
(plusWord64X2# y z) + +main :: IO () +main = do + let !x = packWord64X2# (# 1#Word64, 10#Word64 #) + !y = packWord64X2# (# 4#Word64, 2#Word64 #) + !z = broadcastWord64X2# 5#Word64 + !w = f x y z + (# w0, w1 #) = unpackWord64X2# w + !v = g x y z + (# v0, v1 #) = unpackWord64X2# v + print (W64# w0, W64# w1) + print (W64# v0, W64# v1) ===================================== testsuite/tests/simd/should_run/all.T ===================================== @@ -66,6 +66,9 @@ setTestOpts( , when(have_cpu_feature('avx'), extra_hc_opts('-mavx')) , when(have_cpu_feature('avx2'), extra_hc_opts('-mavx2')) , when(have_cpu_feature('avx512f'), extra_hc_opts('-mavx512f')) + , when(have_cpu_feature('avx512vl'), extra_hc_opts('-mavx512vl')) + , when(have_cpu_feature('avx512bw'), extra_hc_opts('-mavx512bw')) + , when(have_cpu_feature('avx512dq'), extra_hc_opts('-mavx512dq')) ]) test('simd000', [], compile_and_run, ['']) View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/14f485ee92ee8bcb25d083c3298c93a8... -- View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/14f485ee92ee8bcb25d083c3298c93a8... You're receiving this email because of your account on gitlab.haskell.org.