[Git][ghc/ghc][master] Support more x86 extensions: AVX-512 {BW,DQ,VL} and GFNI

17 Feb 2026


      Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
14f485ee by ARATA Mizuki at 2026-02-17T09:09:24+09:00
Support more x86 extensions: AVX-512 {BW,DQ,VL} and GFNI

Also, mark AVX-512 ER and PF as deprecated.

AVX-512 instructions can be used for certain 64-bit integer vector operations.

GFNI can be used to implement bitReverse (currently not used by NCG, but LLVM may use it).

Closes #26406
Addresses #26509

- - - - -


21 changed files:

- compiler/GHC/CmmToAsm/Config.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/Driver/Config/CmmToAsm.hs
- compiler/GHC/Driver/DynFlags.hs
- compiler/GHC/Driver/Pipeline/Execute.hs
- compiler/GHC/Driver/Session.hs
- compiler/GHC/SysTools/Cpp.hs
- docs/users_guide/9.16.1-notes.rst
- docs/users_guide/phases.rst
- docs/users_guide/using.rst
- testsuite/driver/cpu_features.py
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.asm
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.hs
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.asm
- + testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.hs
- + testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.asm
- + testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.hs
- testsuite/tests/simd/should_run/all.T


Changes:

=====================================
compiler/GHC/CmmToAsm/Config.hs
=====================================
@@ -31,6 +31,9 @@ data NCGConfig = NCGConfig
    , ncgDoConstantFolding     :: !Bool            -- ^ Perform CMM constant folding
    , ncgSseAvxVersion         :: Maybe SseAvxVersion -- ^ (x86) SSE and AVX instructions
    , ncgAvx512fEnabled        :: !Bool
+   , ncgAvx512vlEnabled       :: !Bool
+   , ncgAvx512bwEnabled       :: !Bool
+   , ncgAvx512dqEnabled       :: !Bool
    , ncgBmiVersion            :: Maybe BmiVersion -- ^ (x86) BMI instructions
    , ncgDumpRegAllocStages    :: !Bool
    , ncgDumpAsmStats          :: !Bool


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -134,6 +134,12 @@ avx2Enabled = do
   config <- getConfig
   return (ncgSseAvxVersion config >= Just AVX2)
 
+avx512vlEnabled :: NatM Bool
+avx512vlEnabled = ncgAvx512vlEnabled <$> getConfig
+
+avx512dqEnabled :: NatM Bool
+avx512dqEnabled = ncgAvx512dqEnabled <$> getConfig
+
 cmmTopCodeGen
         :: RawCmmDecl
         -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]
@@ -1314,6 +1320,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
   sse4_1 <- sse4_1Enabled
   sse4_2 <- sse4_2Enabled
   avx <- avxEnabled
+  avx512vl <- avx512vlEnabled
+  avx512dq <- avx512dqEnabled
   case mop of
       MO_F_Eq _ -> condFltReg is32Bit EQQ x y
       MO_F_Ne _ -> condFltReg is32Bit NE  x y
@@ -1432,57 +1440,76 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_V_Sub l w | l * widthInBits w == 128 -> vector_int_op_sse PSUB l w x y
                    | otherwise -> needLlvm mop
       MO_V_Mul 16 W8 -> vector_int8x16_mul_sse2 x y
-      MO_V_Mul l@8 w@W16 -> vector_int_op_sse PMULL l w x y -- PMULLW (SSE2)
-      MO_V_Mul l@4 w@W32 | sse4_1 -> vector_int_op_sse PMULL l w x y -- PMULLD (SSE4.1)
+      MO_V_Mul l@8 w@W16 | avx -> vector_int_op_avx VPMULL l w x y -- VPMULLW (AVX)
+                         | otherwise -> vector_int_op_sse PMULL l w x y -- PMULLW (SSE2)
+      MO_V_Mul l@4 w@W32 | avx -> vector_int_op_avx VPMULL l w x y -- VPMULLD (AVX)
+                         | sse4_1 -> vector_int_op_sse PMULL l w x y -- PMULLD (SSE4.1)
                          | otherwise -> vector_int32x4_mul_sse2 x y
-      MO_V_Mul 2 W64 -> vector_int64x2_mul_sse2 x y
+      MO_V_Mul l@2 w@W64 | avx512dq && avx512vl -> vector_int_op_avx VPMULL l w x y -- VPMULLQ (AVX512DQ+VL)
+                         | otherwise -> vector_int64x2_mul_sse2 x y
       MO_V_Mul {} -> needLlvm mop
 
       MO_VU_Min l@16 w@W8
-                    -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUB (SSE2)
+        | avx       -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUB (AVX)
+        | otherwise -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUB (SSE2)
       MO_VU_Min l@8 w@W16
+        | avx       -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUW (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUW (SSE4.1)
         | otherwise -> vector_word_minmax_sse Min l w x y
       MO_VU_Min l@4 w@W32
+        | avx       -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUD (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Min (IntVecMinMax False)) l w x y -- PMINUD (SSE4.1)
         | otherwise -> vector_word_minmax_sse Min l w x y
       MO_VU_Min l@2 w@W64
+        | avx512vl  -> vector_int_op_avx (VMINMAX Min (IntVecMinMax False)) l w x y -- VPMINUQ (AVX512F+VL)
         | sse4_2    -> vector_word_minmax_sse Min l w x y -- PCMPGTQ requires SSE4.2
         -- The SSE2 version is implemented as a C call (MO_W64X2_Min)
       MO_VU_Min {} -> needLlvm mop
       MO_VU_Max l@16 w@W8
-                    -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUB (SSE2)
+        | avx       -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUB (AVX)
+        | otherwise -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUB (SSE2)
       MO_VU_Max l@8 w@W16
+        | avx       -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUW (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUW (SSE4.1)
         | otherwise -> vector_word_minmax_sse Max l w x y
       MO_VU_Max l@4 w@W32
+        | avx       -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUD (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Max (IntVecMinMax False)) l w x y -- PMAXUD (SSE4.1)
         | otherwise -> vector_word_minmax_sse Max l w x y
       MO_VU_Max l@2 w@W64
+        | avx512vl  -> vector_int_op_avx (VMINMAX Max (IntVecMinMax False)) l w x y -- VPMAXUQ (AVX512F+VL)
         | sse4_2    -> vector_word_minmax_sse Max l w x y -- PCMPGTQ requires SSE4.2
         -- The SSE2 version is implemented as a C call (MO_W64X2_Max)
       MO_VU_Max {} -> needLlvm mop
       MO_VS_Min l@16 w@W8
+        | avx       -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSB (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSB (SSE4.1)
         | otherwise -> vector_int_minmax_sse Min l w x y
       MO_VS_Min l@8 w@W16
-                    -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSW (SSE2)
+        | avx       -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSW (AVX)
+        | otherwise -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSW (SSE2)
       MO_VS_Min l@4 w@W32
+        | avx       -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSD (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Min (IntVecMinMax True)) l w x y -- PMINSD (SSE4.1)
         | otherwise -> vector_int_minmax_sse Min l w x y
       MO_VS_Min l@2 w@W64
+        | avx512vl  -> vector_int_op_avx (VMINMAX Min (IntVecMinMax True)) l w x y -- VPMINSQ (AVX512F+VL)
         | sse4_2    -> vector_int_minmax_sse Min l w x y -- PCMPGTQ requires SSE4.2
         -- The SSE2 version is implemented as a C call (MO_I64X2_Min)
       MO_VS_Min {} -> needLlvm mop
       MO_VS_Max l@16 w@W8
+        | avx       -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSB (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSB (SSE4.1)
         | otherwise -> vector_int_minmax_sse Max l w x y
       MO_VS_Max l@8 w@W16
-                    -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSW (SSE2)
+        | avx       -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSW (AVX)
+        | otherwise -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSW (SSE2)
       MO_VS_Max l@4 w@W32
+        | avx       -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSD (AVX)
         | sse4_1    -> vector_int_op_sse (MINMAX Max (IntVecMinMax True)) l w x y -- PMAXSD (SSE4.1)
         | otherwise -> vector_int_minmax_sse Max l w x y
       MO_VS_Max l@2 w@W64
+        | avx512vl  -> vector_int_op_avx (VMINMAX Max (IntVecMinMax True)) l w x y -- VPMAXSQ (AVX512F+VL)
         | sse4_2    -> vector_int_minmax_sse Max l w x y -- PCMPGTQ requires SSE4.2
         -- The SSE2 version is implemented as a C call (MO_I64X2_Max)
       MO_VS_Max {} -> needLlvm mop
@@ -1975,7 +2002,6 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                      (PUNPCKLDQ format (OpReg tmpOdd1) dst)                                  -- dst <- (dst[0],tmpOdd1[0],dst[1],tmpOdd1[1])
       return (Any format code)
 
-    -- TODO: We could use `VPMULLQ` if AVX-512 or AVX10.1 is available.
     vector_int64x2_mul_sse2 :: CmmExpr -> CmmExpr -> NatM Register
     vector_int64x2_mul_sse2 expr1 expr2 = do
       -- implement 64 bit multiplication using 32-bit PMULUDQ multiplication instructions


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -338,6 +338,7 @@ data Instr
         | PADD       Format Operand Reg
         | PSUB       Format Operand Reg
         | PMULL      Format Operand Reg
+        | VPMULL     Format Operand Reg Reg
         | PMULUDQ    Format Operand Reg
 
         -- SIMD compare
@@ -601,6 +602,7 @@ regUsageOfInstr platform instr
     PADD         fmt src dst   -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
     PSUB         fmt src dst   -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
     PMULL        fmt src dst   -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
+    VPMULL       fmt s1 s2 dst -> mkRU (use_R fmt s1  [mk fmt s2])  [mk fmt dst]
     PMULUDQ      fmt src dst   -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
 
     PCMPGT       fmt src dst   -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
@@ -912,6 +914,7 @@ patchRegsOfInstr platform instr env
     PADD       fmt src dst   -> PADD fmt (patchOp src) (env dst)
     PSUB       fmt src dst   -> PSUB fmt (patchOp src) (env dst)
     PMULL      fmt src dst   -> PMULL fmt (patchOp src) (env dst)
+    VPMULL     fmt s1 s2 dst -> VPMULL fmt (patchOp s1) (env s2) (env dst)
     PMULUDQ    fmt src dst   -> PMULUDQ fmt (patchOp src) (env dst)
 
     PCMPGT     fmt src dst   -> PCMPGT fmt (patchOp src) (env dst)


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -1012,6 +1012,8 @@ pprInstr platform i = case i of
      -> pprFormatOpReg (text "psub") format src dst
    PMULL format src dst
      -> pprFormatOpReg (text "pmull") format src dst
+   VPMULL format s1 s2 dst
+     -> pprFormatOpRegReg (text "vpmull") format s1 s2 dst
    PMULUDQ format src dst
      -> pprOpReg (text "pmuludq") format src dst
    PCMPGT format src dst
@@ -1574,7 +1576,8 @@ pprInstr platform i = case i of
    pprMinMax wantV minOrMax mmTy fmt regs
      = line $ hcat ( instr : intersperse comma ( map ( pprOperand platform fmt ) regs ) )
       where
-        instr =  (if wantV then text "v" else empty)
+        instr =  char '\t'
+              <> (if wantV then text "v" else empty)
               <> (case mmTy of { IntVecMinMax {} -> text "p"; FloatMinMax -> empty })
               <> (case minOrMax of { Min -> text "min"; Max -> text "max" })
               <> (case mmTy of { IntVecMinMax wantSigned -> if wantSigned then text "s" else text "u"; FloatMinMax -> empty })


=====================================
compiler/GHC/Driver/Config/CmmToAsm.hs
=====================================
@@ -65,6 +65,9 @@ initNCGConfig dflags this_mod = NCGConfig
             ArchX86    -> v
             _          -> Nothing
    , ncgAvx512fEnabled = isAvx512fEnabled dflags
+   , ncgAvx512vlEnabled = isAvx512vlEnabled dflags
+   , ncgAvx512bwEnabled = isAvx512bwEnabled dflags
+   , ncgAvx512dqEnabled = isAvx512dqEnabled dflags
 
    , ncgLa664Enabled = isLa664Enabled dflags
 


=====================================
compiler/GHC/Driver/DynFlags.hs
=====================================
@@ -83,11 +83,15 @@ module GHC.Driver.DynFlags (
         isSse4_2Enabled,
         isAvxEnabled,
         isAvx2Enabled,
+        isAvx512bwEnabled,
         isAvx512cdEnabled,
+        isAvx512dqEnabled,
         isAvx512erEnabled,
         isAvx512fEnabled,
         isAvx512pfEnabled,
+        isAvx512vlEnabled,
         isFmaEnabled,
+        isGfniEnabled,
         isBmiEnabled,
         isBmi2Enabled,
         -- For LoongArch platform
@@ -454,12 +458,16 @@ data DynFlags = DynFlags {
   -- | Machine dependent flags (-m\<blah> stuff)
   sseAvxVersion         :: Maybe SseAvxVersion,
   bmiVersion            :: Maybe BmiVersion,
-  avx512cd              :: Bool, -- Enable AVX-512 Conflict Detection Instructions.
-  avx512er              :: Bool, -- Enable AVX-512 Exponential and Reciprocal Instructions.
-  avx512f               :: Bool, -- Enable AVX-512 instructions.
-  avx512pf              :: Bool, -- Enable AVX-512 PreFetch Instructions.
+  avx512bw              :: Bool, -- ^ Enable AVX-512BW Instructions.
+  avx512cd              :: Bool, -- ^ Enable AVX-512 Conflict Detection Instructions.
+  avx512dq              :: Bool, -- ^ Enable AVX-512DQ Instructions.
+  avx512er              :: Bool, -- ^ Enable AVX-512 Exponential and Reciprocal Instructions.
+  avx512f               :: Bool, -- ^ Enable AVX-512 Foundation (AVX-512F) instructions.
+  avx512pf              :: Bool, -- ^ Enable AVX-512 PreFetch Instructions.
+  avx512vl              :: Bool, -- ^ Enable AVX-512VL Instructions.
   fma                   :: Bool, -- ^ Enable FMA instructions.
-  la664                 :: Bool, -- Enable LA664 instructions
+  gfni                  :: Bool, -- ^ Enable GFNI Instructions.
+  la664                 :: Bool, -- ^ Enable LA664 instructions
 
   -- Constants used to control the amount of optimization done.
 
@@ -737,12 +745,16 @@ defaultDynFlags mySettings =
         interactivePrint = Nothing,
         sseAvxVersion = Nothing,
         bmiVersion = Nothing,
+        avx512bw = False,
         avx512cd = False,
+        avx512dq = False,
         avx512er = False,
         avx512f = False,
         avx512pf = False,
+        avx512vl = False,
         -- Use FMA by default on AArch64
         fma = (platformArch . sTargetPlatform $ mySettings) == ArchAArch64,
+        gfni = False,
         -- For LoongArch, la464 is used by default.
         la664 = False,
 
@@ -1616,18 +1628,27 @@ isAvxEnabled dflags = sseAvxVersion dflags >= Just AVX1 || (isX86 && fma dflags)
 isAvx2Enabled :: DynFlags -> Bool
 isAvx2Enabled dflags = sseAvxVersion dflags >= Just AVX2 || isAvx512fEnabled dflags
 
+isAvx512bwEnabled :: DynFlags -> Bool
+isAvx512bwEnabled dflags = avx512bw dflags
+
 isAvx512cdEnabled :: DynFlags -> Bool
 isAvx512cdEnabled dflags = avx512cd dflags
 
+isAvx512dqEnabled :: DynFlags -> Bool
+isAvx512dqEnabled dflags = avx512dq dflags
+
 isAvx512erEnabled :: DynFlags -> Bool
 isAvx512erEnabled dflags = avx512er dflags
 
 isAvx512fEnabled :: DynFlags -> Bool
-isAvx512fEnabled dflags = avx512f dflags || avx512cd dflags || avx512er dflags || avx512pf dflags
+isAvx512fEnabled dflags = avx512f dflags || avx512bw dflags || avx512cd dflags || avx512dq dflags || avx512er dflags || avx512pf dflags || avx512vl dflags
 
 isAvx512pfEnabled :: DynFlags -> Bool
 isAvx512pfEnabled dflags = avx512pf dflags
 
+isAvx512vlEnabled :: DynFlags -> Bool
+isAvx512vlEnabled dflags = avx512vl dflags
+
 isFmaEnabled :: DynFlags -> Bool
 isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags)
   where
@@ -1637,6 +1658,9 @@ isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags)
       ArchX86    -> True
       _          -> False
 
+isGfniEnabled :: DynFlags -> Bool
+isGfniEnabled dflags = gfni dflags
+
 {- Note [Implications between X86 CPU feature flags]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Many X86 CPU feature flags (such as -mavx, -mfma or -msse4) imply other
@@ -1649,7 +1673,7 @@ structures:
 together with other implications such as
 
   3. FMA -> AVX
-  4. AVX512{CD,ED,PF} -> AVX512F -> AVX2
+  4. AVX512{BW,CD,DQ,ER,PF,VL} -> AVX512F -> AVX2
 
 
 We handle this as follows:


=====================================
compiler/GHC/Driver/Pipeline/Execute.hs
=====================================
@@ -984,13 +984,17 @@ llvmOptions llvm_config llvm_version dflags =
                    -- It may become deprecated in a future LLVM version, though.
               ++ ["+avx2"    | isAvx2Enabled dflags     ]
               ++ ["+avx"     | isAvxEnabled dflags      ]
+              ++ ["+avx512bw"| isAvx512bwEnabled dflags ]
               ++ ["+avx512cd"| isAvx512cdEnabled dflags ]
+              ++ ["+avx512dq"| isAvx512dqEnabled dflags ]
               ++ ["+avx512er"| isAvx512erEnabled dflags ]
               ++ ["+avx512pf"| isAvx512pfEnabled dflags ]
-              -- For Arch64 +fma is not a option (it's unconditionally available).
+              ++ ["+avx512vl"| isAvx512vlEnabled dflags ]
+              -- For AArch64 +fma is not an option (it's unconditionally available).
               ++ ["+fma"     | isFmaEnabled dflags && (arch /= ArchAArch64) ]
               ++ ["+bmi"     | isBmiEnabled dflags      ]
               ++ ["+bmi2"    | isBmi2Enabled dflags     ]
+              ++ ["+gfni"    | isGfniEnabled dflags     ]
 
         abi :: String
         abi = case platformArch (targetPlatform dflags) of


=====================================
compiler/GHC/Driver/Session.hs
=====================================
@@ -212,11 +212,15 @@ module GHC.Driver.Session (
         isBmi2Enabled,
         isAvxEnabled,
         isAvx2Enabled,
+        isAvx512bwEnabled,
         isAvx512cdEnabled,
+        isAvx512dqEnabled,
         isAvx512erEnabled,
         isAvx512fEnabled,
         isAvx512pfEnabled,
+        isAvx512vlEnabled,
         isFmaEnabled,
+        isGfniEnabled,
 
         -- LoongArch: ISA version: la664, la464(default)
         isLa664Enabled,
@@ -1723,14 +1727,17 @@ dynamic_flags_deps = [
                                                  d { sseAvxVersion = max (Just AVX1) (sseAvxVersion d) }))
   , make_ord_flag defGhcFlag "mavx2"        (noArg (\d ->
                                                  d { sseAvxVersion = max (Just AVX2) (sseAvxVersion d) }))
-  , make_ord_flag defGhcFlag "mavx512cd"    (noArg (\d ->
-                                                         d { avx512cd = True }))
-  , make_ord_flag defGhcFlag "mavx512er"    (noArg (\d ->
-                                                         d { avx512er = True }))
+  , make_ord_flag defGhcFlag "mavx512bw"    (noArg (\d -> d { avx512bw = True }))
+  , make_ord_flag defGhcFlag "mavx512cd"    (noArg (\d -> d { avx512cd = True }))
+  , make_ord_flag defGhcFlag "mavx512dq"    (noArg (\d -> d { avx512dq = True }))
+  , make_dep_flag defGhcFlag "mavx512er"    (noArg (\d -> d { avx512er = True }))
+        "AVX-512ER was only available on Xeon Phi"
   , make_ord_flag defGhcFlag "mavx512f"     (noArg (\d -> d { avx512f = True }))
-  , make_ord_flag defGhcFlag "mavx512pf"    (noArg (\d ->
-                                                         d { avx512pf = True }))
+  , make_dep_flag defGhcFlag "mavx512pf"    (noArg (\d -> d { avx512pf = True }))
+        "AVX-512PF was only available on Xeon Phi"
+  , make_ord_flag defGhcFlag "mavx512vl"    (noArg (\d -> d { avx512vl = True }))
   , make_ord_flag defGhcFlag "mfma"         (noArg (\d -> d { fma = True }))
+  , make_ord_flag defGhcFlag "mgfni"        (noArg (\d -> d { gfni = True }))
 
 
   , make_ord_flag defGhcFlag "mla664"       (noArg (\d -> d { la664 = True }))


=====================================
compiler/GHC/SysTools/Cpp.hs
=====================================
@@ -165,10 +165,16 @@ doCpp logger tmpfs dflags unit_env opts input_fn output_fn = do
     let avx_defs =
           [ "-D__AVX__"      | isAvxEnabled      dflags ] ++
           [ "-D__AVX2__"     | isAvx2Enabled     dflags ] ++
+          [ "-D__AVX512BW__" | isAvx512bwEnabled dflags ] ++
           [ "-D__AVX512CD__" | isAvx512cdEnabled dflags ] ++
+          [ "-D__AVX512DQ__" | isAvx512dqEnabled dflags ] ++
           [ "-D__AVX512ER__" | isAvx512erEnabled dflags ] ++
           [ "-D__AVX512F__"  | isAvx512fEnabled  dflags ] ++
-          [ "-D__AVX512PF__" | isAvx512pfEnabled dflags ]
+          [ "-D__AVX512PF__" | isAvx512pfEnabled dflags ] ++
+          [ "-D__AVX512VL__" | isAvx512vlEnabled dflags ]
+
+    let gfni_def =
+          [ "-D__GFNI__"     | isGfniEnabled dflags ]
 
     backend_defs <- applyCDefs (backendCDefs $ backend dflags) logger dflags
 
@@ -209,6 +215,7 @@ doCpp logger tmpfs dflags unit_env opts input_fn output_fn = do
                     ++ map GHC.SysTools.Option sse_defs
                     ++ map GHC.SysTools.Option fma_def
                     ++ map GHC.SysTools.Option avx_defs
+                    ++ map GHC.SysTools.Option gfni_def
                     ++ map GHC.SysTools.Option io_manager_defs
                     ++ mb_macro_include
                     ++ line_pragmas


=====================================
docs/users_guide/9.16.1-notes.rst
=====================================
@@ -101,6 +101,9 @@ to
 
 See :ghc-ticket:`25345`.
 
+- Add several options for x86 extensions: :ghc-flag:`-mavx512bw`,
+  :ghc-flag:`-mavx512dq`, :ghc-flag:`-mavx512vl`, and :ghc-flag:`-mgfni`.
+
 GHCi
 ~~~~
 


=====================================
docs/users_guide/phases.rst
=====================================
@@ -553,8 +553,10 @@ SIMD macros
     These are defined conditionally based on the SIMD
     flags used for compilation:
 
-    ``__SSE__``, ``__SSE2__``, ``__SSE4_2__``, ``__FMA__``,
-    ``__AVX__``, ``__AVX2__``, ``__AVX512CD__``, ``__AVX512ER__``, ``__AVX512F__``, ``__AVX512PF__``,
+    ``__SSE__``, ``__SSE2__``, ``__SSE3__``, ``__SSSE3__``,
+    ``__SSE4_1__``, ``__SSE4_2__``, ``__FMA__``, ``__AVX__``, ``__AVX2__``,
+    ``__AVX512BW__``, ``__AVX512CD__``, ``__AVX512DQ__``, ``__AVX512ER__``,
+    ``__AVX512F__``, ``__AVX512PF__``, ``__AVX512VL__``, ``__GFNI__``
 
 .. _cpp-string-gaps:
 


=====================================
docs/users_guide/using.rst
=====================================
@@ -1601,7 +1601,7 @@ Some flags only make sense for particular target platforms.
     :implies: :ghc-flag:`-msse4.2`
 
     (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
-    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX instructions.
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX instructions.
 
 .. ghc-flag:: -mavx2
     :shortdesc: (x86 only) Enable support for AVX2 SIMD extensions
@@ -1611,47 +1611,84 @@ Some flags only make sense for particular target platforms.
     :implies: :ghc-flag:`-mavx`
 
     (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
-    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX2 instructions.
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX2 instructions.
+
+.. ghc-flag:: -mavx512bw
+    :shortdesc: (x86 only) Enable support for AVX-512BW SIMD extensions
+    :type: dynamic
+    :category: platform-options
+
+    :since: 9.16.1
+    :implies: :ghc-flag:`-mavx512f`
+
+    (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512BW instructions.
 
 .. ghc-flag:: -mavx512cd
-    :shortdesc: (x86 only) Enable support for AVX512-CD SIMD extensions
+    :shortdesc: (x86 only) Enable support for AVX-512CD SIMD extensions
     :type: dynamic
     :category: platform-options
 
     :implies: :ghc-flag:`-mavx512f`
 
     (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
-    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-CD instructions.
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512CD instructions.
+
+.. ghc-flag:: -mavx512dq
+    :shortdesc: (x86 only) Enable support for AVX-512DQ SIMD extensions
+    :type: dynamic
+    :category: platform-options
+
+    :since: 9.16.1
+    :implies: :ghc-flag:`-mavx512f`
+
+    (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512DQ instructions.
 
 .. ghc-flag:: -mavx512er
-    :shortdesc: (x86 only) Enable support for AVX512-ER SIMD extensions
+    :shortdesc: (x86 only, deprecated) Enable support for AVX-512ER SIMD extensions
     :type: dynamic
     :category: platform-options
 
     :implies: :ghc-flag:`-mavx512f`
 
     (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
-    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-ER instructions.
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512ER instructions.
+
+    The AVX-512ER extension is deprecated and not supported by newer LLVM versions.
 
 .. ghc-flag:: -mavx512f
-    :shortdesc: (x86 only) Enable support for AVX512-F SIMD extensions
+    :shortdesc: (x86 only) Enable support for AVX-512F SIMD extensions
     :type: dynamic
     :category: platform-options
 
     :implies: :ghc-flag:`-mavx2`, :ghc-flag:`-mfma`
 
     (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
-    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-F instructions.
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512F instructions.
 
 .. ghc-flag:: -mavx512pf
-    :shortdesc: (x86 only) Enable support for AVX512-PF SIMD extensions
+    :shortdesc: (x86 only, deprecated) Enable support for AVX-512PF SIMD extensions
     :type: dynamic
     :category: platform-options
 
     :implies: :ghc-flag:`-mavx512f`
 
     (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
-    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-PF instructions.
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512PF instructions.
+
+    The AVX-512PF extension is deprecated and not supported by newer LLVM versions.
+
+.. ghc-flag:: -mavx512vl
+    :shortdesc: (x86 only) Enable support for AVX-512VL SIMD extensions
+    :type: dynamic
+    :category: platform-options
+
+    :since: 9.16.1
+    :implies: :ghc-flag:`-mavx512f`
+
+    (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 AVX-512VL instructions.
 
 .. ghc-flag:: -msse
     :shortdesc: (x86 only) Use SSE for floating-point operations
@@ -1714,13 +1751,13 @@ Some flags only make sense for particular target platforms.
     or the :ref:`LLVM backend <llvm-code-gen>`).
 
 .. ghc-flag:: -msse4
-    :shortdesc: (x86 only) Use SSE4 for floating-point operations
+    :shortdesc: (x86 only) Use SSE4.1 for floating-point operations
     :type: dynamic
     :category: platform-options
 
     :implies: :ghc-flag:`-mssse3`
 
-    (x86 only) Use the SSE4 instruction set to
+    (x86 only) Use the SSE4.1 instruction set to
     implement some floating point and bit operations(whether using the :ref:`native code generator <native-code-gen>`
     or the :ref:`LLVM backend <llvm-code-gen>`).
 
@@ -1781,6 +1818,16 @@ Some flags only make sense for particular target platforms.
     multiply-add, which might perform non-IEEE-compliant software emulation on
     some platforms (depending on the implementation of the C standard library).
 
+.. ghc-flag:: -mgfni
+    :shortdesc: (x86 only) Use GFNI for advanced bit manipulations
+    :type: dynamic
+    :category: platform-options
+
+    :since: 9.16.1
+
+    (x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
+    or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86 GFNI instructions.
+
 .. ghc-flag:: -mla664
     :shortdesc: (LoongArch only) Used for new instructions for la664 uarch
     :type: dynamic


=====================================
testsuite/driver/cpu_features.py
=====================================
@@ -9,9 +9,9 @@ SUPPORTED_CPU_FEATURES = {
 
     # x86:
     'sse', 'sse2', 'sse3', 'pni', 'ssse3', 'sse4_1', 'sse4_2',
-    'avx', 'avx2', 'avx512f',
+    'avx', 'avx2', 'avx512f', 'avx512vl', 'avx512bw', 'avx512dq',
     'fma',
-    'popcnt', 'bmi1', 'bmi2'
+    'popcnt', 'bmi1', 'bmi2', 'gfni',
 }
 
 cpu_feature_cache = None


=====================================
testsuite/tests/codeGen/should_gen_asm/all.T
=====================================
@@ -17,3 +17,9 @@ test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip),
                            when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2'])
 test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip),
                                    when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx'])
+test('avx512-int64-mul', [unless(arch('x86_64'), skip),
+                          when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512dq -mavx512vl'])
+test('avx512-int64-minmax', [unless(arch('x86_64'), skip),
+                             when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512vl'])
+test('avx512-word64-minmax', [unless(arch('x86_64'), skip),
+                              when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512vl'])


=====================================
testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.asm
=====================================
@@ -0,0 +1,2 @@
+vpminsq
+vpmaxsq


=====================================
testsuite/tests/codeGen/should_gen_asm/avx512-int64-minmax.hs
=====================================
@@ -0,0 +1,27 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE ExtendedLiterals #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+import GHC.Exts
+import GHC.Prim
+import GHC.Int
+
+{-# NOINLINE f #-}
+f :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2#
+f x y z = minInt64X2# x (plusInt64X2# y z)
+
+{-# NOINLINE g #-}
+g :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2#
+g x y z = maxInt64X2# x (plusInt64X2# y z)
+
+main :: IO ()
+main = do
+  let !x = packInt64X2# (# 1#Int64, 10#Int64 #)
+      !y = packInt64X2# (# 4#Int64, 2#Int64 #)
+      !z = broadcastInt64X2# 5#Int64
+      !w = f x y z
+      (# w0, w1 #) = unpackInt64X2# w
+      !v = g x y z
+      (# v0, v1 #) = unpackInt64X2# v
+  print (I64# w0, I64# w1)
+  print (I64# v0, I64# v1)


=====================================
testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.asm
=====================================
@@ -0,0 +1 @@
+vpmullq


=====================================
testsuite/tests/codeGen/should_gen_asm/avx512-int64-mul.hs
=====================================
@@ -0,0 +1,19 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE ExtendedLiterals #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+import GHC.Exts
+import GHC.Int
+
+{-# NOINLINE f #-}
+f :: Int64X2# -> Int64X2# -> Int64X2# -> Int64X2#
+f x y z = timesInt64X2# x (plusInt64X2# y z)
+
+main :: IO ()
+main = do
+  let !x = packInt64X2# (# 1#Int64, 3#Int64 #)
+      !y = packInt64X2# (# 4#Int64, 2#Int64 #)
+      !z = broadcastInt64X2# 5#Int64
+      !w = f x y z
+      (# w0, w1 #) = unpackInt64X2# w
+  print (I64# w0, I64# w1)


=====================================
testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.asm
=====================================
@@ -0,0 +1,2 @@
+vpminuq
+vpmaxuq


=====================================
testsuite/tests/codeGen/should_gen_asm/avx512-word64-minmax.hs
=====================================
@@ -0,0 +1,27 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE ExtendedLiterals #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+import GHC.Exts
+import GHC.Prim
+import GHC.Word
+
+{-# NOINLINE f #-}
+f :: Word64X2# -> Word64X2# -> Word64X2# -> Word64X2#
+f x y z = minWord64X2# x (plusWord64X2# y z)
+
+{-# NOINLINE g #-}
+g :: Word64X2# -> Word64X2# -> Word64X2# -> Word64X2#
+g x y z = maxWord64X2# x (plusWord64X2# y z)
+
+main :: IO ()
+main = do
+  let !x = packWord64X2# (# 1#Word64, 10#Word64 #)
+      !y = packWord64X2# (# 4#Word64, 2#Word64 #)
+      !z = broadcastWord64X2# 5#Word64
+      !w = f x y z
+      (# w0, w1 #) = unpackWord64X2# w
+      !v = g x y z
+      (# v0, v1 #) = unpackWord64X2# v
+  print (W64# w0, W64# w1)
+  print (W64# v0, W64# v1)


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -66,6 +66,9 @@ setTestOpts(
   , when(have_cpu_feature('avx'), extra_hc_opts('-mavx'))
   , when(have_cpu_feature('avx2'), extra_hc_opts('-mavx2'))
   , when(have_cpu_feature('avx512f'), extra_hc_opts('-mavx512f'))
+  , when(have_cpu_feature('avx512vl'), extra_hc_opts('-mavx512vl'))
+  , when(have_cpu_feature('avx512bw'), extra_hc_opts('-mavx512bw'))
+  , when(have_cpu_feature('avx512dq'), extra_hc_opts('-mavx512dq'))
   ])
 
 test('simd000', [], compile_and_run, [''])



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/14f485ee92ee8bcb25d083c3298c93a8...

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/14f485ee92ee8bcb25d083c3298c93a8...
You're receiving this email because of your account on gitlab.haskell.org.

    

[Git][ghc/ghc][master] Support more x86 extensions: AVX-512 {BW,DQ,VL} and GFNI

Marge Bot (＠marge-bot)