Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
d4a9d6d6 by ARATA Mizuki at 2025-10-19T18:43:47+09:00
Handle implications between x86 feature flags
This includes:
* Multiple -msse* options can be specified
* -mavx implies -msse4.2
* -mavx2 implies -mavx
* -mfma implies -mavx
* -mavx512f implies -mavx2 and -mfma
* -mavx512{cd,er,pf} imply -mavx512f
Closes #24989
Co-authored-by: sheaf
- - - - -
14 changed files:
- compiler/GHC/CmmToAsm/Config.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/Driver/Config/CmmToAsm.hs
- compiler/GHC/Driver/DynFlags.hs
- compiler/GHC/Driver/Session.hs
- compiler/GHC/Platform.hs
- docs/users_guide/9.16.1-notes.rst
- docs/users_guide/using.rst
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/mavx-should-enable-popcnt.asm
- + testsuite/tests/codeGen/should_gen_asm/mavx-should-enable-popcnt.hs
- + testsuite/tests/codeGen/should_gen_asm/msse-option-order.asm
- + testsuite/tests/codeGen/should_gen_asm/msse-option-order.hs
Changes:
=====================================
compiler/GHC/CmmToAsm/Config.hs
=====================================
@@ -29,9 +29,7 @@ data NCGConfig = NCGConfig
, ncgRegsGraph :: !Bool
, ncgAsmLinting :: !Bool -- ^ Perform ASM linting pass
, ncgDoConstantFolding :: !Bool -- ^ Perform CMM constant folding
- , ncgSseVersion :: Maybe SseVersion -- ^ (x86) SSE instructions
- , ncgAvxEnabled :: !Bool
- , ncgAvx2Enabled :: !Bool
+ , ncgSseAvxVersion :: Maybe SseAvxVersion -- ^ (x86) SSE and AVX instructions
, ncgAvx512fEnabled :: !Bool
, ncgBmiVersion :: Maybe BmiVersion -- ^ (x86) BMI instructions
, ncgDumpRegAllocStages :: !Bool
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -104,30 +104,32 @@ is32BitPlatform = do
platform <- getPlatform
return $ target32Bit platform
+-- These flags may be implied by other flags like -mfma or -mavx512f.
+-- See Note [Implications between X86 CPU feature flags] for details.
ssse3Enabled :: NatM Bool
ssse3Enabled = do
config <- getConfig
- return (ncgSseVersion config >= Just SSSE3)
+ return (ncgSseAvxVersion config >= Just SSSE3)
sse4_1Enabled :: NatM Bool
sse4_1Enabled = do
config <- getConfig
- return (ncgSseVersion config >= Just SSE4)
+ return (ncgSseAvxVersion config >= Just SSE4)
sse4_2Enabled :: NatM Bool
sse4_2Enabled = do
config <- getConfig
- return (ncgSseVersion config >= Just SSE42)
+ return (ncgSseAvxVersion config >= Just SSE42)
avxEnabled :: NatM Bool
avxEnabled = do
config <- getConfig
- return (ncgAvxEnabled config)
+ return (ncgSseAvxVersion config >= Just AVX1)
avx2Enabled :: NatM Bool
avx2Enabled = do
config <- getConfig
- return (ncgAvx2Enabled config)
+ return (ncgSseAvxVersion config >= Just AVX2)
cmmTopCodeGen
:: RawCmmDecl
=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -1121,8 +1121,8 @@ movInstr config fmt =
= f
plat = ncgPlatform config
- avx = ncgAvxEnabled config
- avx2 = ncgAvx2Enabled config
+ avx = ncgSseAvxVersion config >= Just AVX1
+ avx2 = ncgSseAvxVersion config >= Just AVX2
avx512f = ncgAvx512fEnabled config
avx_move sFmt =
if isFloatScalarFormat sFmt
=====================================
compiler/GHC/Driver/Config/CmmToAsm.hs
=====================================
@@ -52,15 +52,18 @@ initNCGConfig dflags this_mod = NCGConfig
-- operations would change the precision and final result of what
-- would otherwise be the same expressions with respect to single or
-- double precision IEEE floating point computations.
- , ncgSseVersion =
- let v | sseVersion dflags < Just SSE2 = Just SSE2
- | otherwise = sseVersion dflags
+
+ -- ncgSseAvxVersion is set to the actual SSE/AVX version.
+ -- For example, -mfma does not set DynFlags's sseAvxVersion, but makes ncgSseAvxVersion >= AVX1.
+ -- See also Note [Implications between X86 CPU feature flags]
+ , ncgSseAvxVersion =
+ let v | isAvx2Enabled dflags = Just AVX2 -- -mavx512f does not set sseAvxVersion, but makes isAvx2Enabled true
+ | isAvxEnabled dflags = Just AVX1 -- -mfma does not set sseAvxVersion, but makes isAvxEnabled true
+ | otherwise = max (Just SSE2) (sseAvxVersion dflags)
in case platformArch (targetPlatform dflags) of
ArchX86_64 -> v
ArchX86 -> v
_ -> Nothing
- , ncgAvxEnabled = isAvxEnabled dflags
- , ncgAvx2Enabled = isAvx2Enabled dflags
, ncgAvx512fEnabled = isAvx512fEnabled dflags
, ncgDwarfEnabled = osElfTarget (platformOS (targetPlatform dflags)) && debugLevel dflags > 0 && platformArch (targetPlatform dflags) /= ArchAArch64
=====================================
compiler/GHC/Driver/DynFlags.hs
=====================================
@@ -447,10 +447,8 @@ data DynFlags = DynFlags {
interactivePrint :: Maybe String,
-- | Machine dependent flags (-m\<blah> stuff)
- sseVersion :: Maybe SseVersion,
+ sseAvxVersion :: Maybe SseAvxVersion,
bmiVersion :: Maybe BmiVersion,
- avx :: Bool,
- avx2 :: Bool,
avx512cd :: Bool, -- Enable AVX-512 Conflict Detection Instructions.
avx512er :: Bool, -- Enable AVX-512 Exponential and Reciprocal Instructions.
avx512f :: Bool, -- Enable AVX-512 instructions.
@@ -731,10 +729,8 @@ defaultDynFlags mySettings =
profAuto = NoProfAuto,
callerCcFilters = [],
interactivePrint = Nothing,
- sseVersion = Nothing,
+ sseAvxVersion = Nothing,
bmiVersion = Nothing,
- avx = False,
- avx2 = False,
avx512cd = False,
avx512er = False,
avx512f = False,
@@ -1548,22 +1544,28 @@ initPromotionTickContext dflags =
-- SSE, AVX, FMA
isSse3Enabled :: DynFlags -> Bool
-isSse3Enabled dflags = sseVersion dflags >= Just SSE3
+isSse3Enabled dflags = sseAvxVersion dflags >= Just SSE3 || isAvxEnabled dflags
isSsse3Enabled :: DynFlags -> Bool
-isSsse3Enabled dflags = sseVersion dflags >= Just SSSE3
+isSsse3Enabled dflags = sseAvxVersion dflags >= Just SSSE3 || isAvxEnabled dflags
isSse4_1Enabled :: DynFlags -> Bool
-isSse4_1Enabled dflags = sseVersion dflags >= Just SSE4
+isSse4_1Enabled dflags = sseAvxVersion dflags >= Just SSE4 || isAvxEnabled dflags
isSse4_2Enabled :: DynFlags -> Bool
-isSse4_2Enabled dflags = sseVersion dflags >= Just SSE42
+isSse4_2Enabled dflags = sseAvxVersion dflags >= Just SSE42 || isAvxEnabled dflags
isAvxEnabled :: DynFlags -> Bool
-isAvxEnabled dflags = avx dflags || avx2 dflags || avx512f dflags
+isAvxEnabled dflags = sseAvxVersion dflags >= Just AVX1 || (isX86 && fma dflags) || isAvx512fEnabled dflags
+ where
+ -- -mfma can be used on multiple platforms, but -mavx is x86-only
+ isX86 = case platformArch (targetPlatform dflags) of
+ ArchX86_64 -> True
+ ArchX86 -> True
+ _ -> False
isAvx2Enabled :: DynFlags -> Bool
-isAvx2Enabled dflags = avx2 dflags || avx512f dflags
+isAvx2Enabled dflags = sseAvxVersion dflags >= Just AVX2 || isAvx512fEnabled dflags
isAvx512cdEnabled :: DynFlags -> Bool
isAvx512cdEnabled dflags = avx512cd dflags
@@ -1572,13 +1574,49 @@ isAvx512erEnabled :: DynFlags -> Bool
isAvx512erEnabled dflags = avx512er dflags
isAvx512fEnabled :: DynFlags -> Bool
-isAvx512fEnabled dflags = avx512f dflags
+isAvx512fEnabled dflags = avx512f dflags || avx512cd dflags || avx512er dflags || avx512pf dflags
isAvx512pfEnabled :: DynFlags -> Bool
isAvx512pfEnabled dflags = avx512pf dflags
isFmaEnabled :: DynFlags -> Bool
-isFmaEnabled dflags = fma dflags
+isFmaEnabled dflags = fma dflags || (isX86 && isAvx512fEnabled dflags)
+ where
+ -- -mfma is used on multiple platforms, but -mavx512f is x86-only
+ isX86 = case platformArch (targetPlatform dflags) of
+ ArchX86_64 -> True
+ ArchX86 -> True
+ _ -> False
+
+{- Note [Implications between X86 CPU feature flags]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Many X86 CPU feature flags (such as -mavx, -mfma or -msse4) imply other
+feature flags. In particular, there are straightforward linear implication
+structures:
+
+ 1. AVX2 -> AVX -> SSE4.2 -> SSE4 -> SSSE3 -> SSE3 -> SSE2 -> SSE1
+ 2. BMI2 -> BMI1
+
+together with other implications such as
+
+ 3. FMA -> AVX
+ 4. AVX512{CD,ED,PF} -> AVX512F -> AVX2
+
+
+We handle this as follows:
+
+ A. When parsing command line options into `DynFlags`, we record:
+ - an `SseAvxVersion` which gives the SSE/AVX level supported in
+ the total order (1),
+ - a `BmiVersion` for (2),
+ - whether FMA is enabled,
+ - various AVX512 flags saying which AVX512 extensions are supported
+
+ B. When converting these "raw" `DynFlags` into a `CmmConfig` for use
+ in code generator backends, we handle the remaining implications (3) (4),
+ e.g. if the user passed -mavx512f then we also set the `SseAvxVersion`
+ to `AVX2`.
+-}
-- -----------------------------------------------------------------------------
-- BMI2
=====================================
compiler/GHC/Driver/Session.hs
=====================================
@@ -1684,25 +1684,28 @@ dynamic_flags_deps = [
(setDumpFlag Opt_D_dump_faststrings)
------ Machine dependent (-m<blah>) stuff ---------------------------
+ -- See Note [Implications between X86 CPU feature flags]
, make_ord_flag defGhcFlag "msse" (noArg (\d ->
- d { sseVersion = Just SSE1 }))
+ d { sseAvxVersion = max (Just SSE1) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "msse2" (noArg (\d ->
- d { sseVersion = Just SSE2 }))
+ d { sseAvxVersion = max (Just SSE2) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "msse3" (noArg (\d ->
- d { sseVersion = Just SSE3 }))
+ d { sseAvxVersion = max (Just SSE3) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "mssse3" (noArg (\d ->
- d { sseVersion = Just SSSE3 }))
+ d { sseAvxVersion = max (Just SSSE3) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "msse4" (noArg (\d ->
- d { sseVersion = Just SSE4 }))
+ d { sseAvxVersion = max (Just SSE4) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "msse4.2" (noArg (\d ->
- d { sseVersion = Just SSE42 }))
+ d { sseAvxVersion = max (Just SSE42) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "mbmi" (noArg (\d ->
- d { bmiVersion = Just BMI1 }))
+ d { bmiVersion = max (Just BMI1) (bmiVersion d) }))
, make_ord_flag defGhcFlag "mbmi2" (noArg (\d ->
d { bmiVersion = Just BMI2 }))
- , make_ord_flag defGhcFlag "mavx" (noArg (\d -> d { avx = True }))
- , make_ord_flag defGhcFlag "mavx2" (noArg (\d -> d { avx2 = True }))
+ , make_ord_flag defGhcFlag "mavx" (noArg (\d ->
+ d { sseAvxVersion = max (Just AVX1) (sseAvxVersion d) }))
+ , make_ord_flag defGhcFlag "mavx2" (noArg (\d ->
+ d { sseAvxVersion = max (Just AVX2) (sseAvxVersion d) }))
, make_ord_flag defGhcFlag "mavx512cd" (noArg (\d ->
d { avx512cd = True }))
, make_ord_flag defGhcFlag "mavx512er" (noArg (\d ->
=====================================
compiler/GHC/Platform.hs
=====================================
@@ -32,7 +32,7 @@ module GHC.Platform
, platformCConvNeedsExtension
, platformHasRTSLinker
, PlatformMisc(..)
- , SseVersion (..)
+ , SseAvxVersion (..)
, BmiVersion (..)
, wordAlignment
-- * SSE and AVX
@@ -264,14 +264,16 @@ platformHasRTSLinker p = case archOS_arch (platformArchOS p) of
-- Instruction sets
--------------------------------------------------
--- | x86 SSE instructions
-data SseVersion
+-- | x86 SSE and AVX instructions
+data SseAvxVersion
= SSE1
| SSE2
| SSE3
| SSSE3
| SSE4
| SSE42
+ | AVX1
+ | AVX2
deriving (Eq, Ord)
-- | x86 BMI (bit manipulation) instructions
=====================================
docs/users_guide/9.16.1-notes.rst
=====================================
@@ -28,6 +28,16 @@ Compiler
bound to variables. The very similar pattern ``Foo{bar = Bar{baz = 42}}``
will will not yet mark ``bar`` or ``baz`` as covered.
+- When multiple ``-msse*`` flags are given, the maximum version takes effect.
+ For example, ``-msse4.2 -msse2`` is now equivalent to ``-msse4.2``.
+ Previously, only the last flag took effect.
+
+- Some x86 architecture flags now imply other flags.
+ For example, :ghc-flag:`-mavx` now implies :ghc-flag:`-msse4.2`,
+ and :ghc-flag:`-mavx512f` now implies :ghc-flag:`-mfma`
+ in addition to :ghc-flag:`-mavx2`.
+ Refer to the users' guide for more details about each individual flag.
+
GHCi
~~~~
=====================================
docs/users_guide/using.rst
=====================================
@@ -1594,6 +1594,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-msse4.2`
+
(x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX instructions.
@@ -1602,6 +1604,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-mavx`
+
(x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX2 instructions.
@@ -1610,6 +1614,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-mavx512f`
+
(x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-CD instructions.
@@ -1618,6 +1624,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-mavx512f`
+
(x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-ER instructions.
@@ -1626,6 +1634,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-mavx2`, :ghc-flag:`-mfma`
+
(x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-F instructions.
@@ -1634,6 +1644,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-mavx512f`
+
(x86 only) This flag allows the code generator (whether the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`) to emit x86_64 AVX512-PF instructions.
@@ -1690,6 +1702,7 @@ Some flags only make sense for particular target platforms.
:category: platform-options
:since: 9.14.1
+ :implies: :ghc-flag:`-msse3`
(x86 only) Use the SSSE3 instruction set to
implement some vector operations
@@ -1701,6 +1714,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-mssse3`
+
(x86 only) Use the SSE4 instruction set to
implement some floating point and bit operations(whether using the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`).
@@ -1710,6 +1725,8 @@ Some flags only make sense for particular target platforms.
:type: dynamic
:category: platform-options
+ :implies: :ghc-flag:`-msse4`
+
(x86 only, added in GHC 7.4.1) Use the SSE4.2 instruction set to
implement some floating point and bit operations,
whether using the :ref:`native code generator <native-code-gen>`
@@ -1747,6 +1764,7 @@ Some flags only make sense for particular target platforms.
:default: off by default, except for Aarch64 where it's on by default.
:since: 9.8.1
+ :implies: (on x86) :ghc-flag:`-mavx`
Use native FMA instructions to implement the fused multiply-add floating-point
operations of the form ``x * y + z``.
=====================================
testsuite/tests/codeGen/should_gen_asm/all.T
=====================================
@@ -12,3 +12,8 @@ test('bytearray-memcpy-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True,
test('T18137', [when(opsys('darwin'), skip), only_ways(llvm_ways)], compile_grep_asm, ['hs', False, '-fllvm -split-sections'])
test('T24941', [only_ways(['optasm'])], compile, ['-fregs-graph'])
+
+test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip),
+ when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2'])
+test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip),
+ when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx'])
=====================================
testsuite/tests/codeGen/should_gen_asm/mavx-should-enable-popcnt.asm
=====================================
@@ -0,0 +1 @@
+popcnt(?![0-9])
\ No newline at end of file
=====================================
testsuite/tests/codeGen/should_gen_asm/mavx-should-enable-popcnt.hs
=====================================
@@ -0,0 +1,10 @@
+-- `-mavx` should imply `-msse4.2`.
+-- See https://gitlab.haskell.org/ghc/ghc/-/issues/24989
+import Data.Bits
+
+{-# NOINLINE foo #-}
+foo :: Int -> Int
+foo x = 1 + popCount x
+
+main :: IO ()
+main = print (foo 42)
=====================================
testsuite/tests/codeGen/should_gen_asm/msse-option-order.asm
=====================================
@@ -0,0 +1 @@
+popcnt(?![0-9])
\ No newline at end of file
=====================================
testsuite/tests/codeGen/should_gen_asm/msse-option-order.hs
=====================================
@@ -0,0 +1,10 @@
+-- `-msse2 -msse4.2` and `-msse4.2 -msse2` should have the same effect.
+-- See https://gitlab.haskell.org/ghc/ghc/-/issues/24989#note_587510
+import Data.Bits
+
+{-# NOINLINE foo #-}
+foo :: Int -> Int
+foo x = 1 + popCount x
+
+main :: IO ()
+main = print (foo 42)
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/d4a9d6d6ec73b1851dec36cbf04d607e...
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/d4a9d6d6ec73b1851dec36cbf04d607e...
You're receiving this email because of your account on gitlab.haskell.org.