Simon Jakobi pushed to branch wip/sjakobi/T25450-march-native at Glasgow Haskell Compiler / GHC

Commits:

16 changed files:

Changes:

  • changelog.d/march-native
    1
    +section: compiler
    
    2
    +synopsis: Add -march=native flag
    
    3
    +issues: #25450
    
    4
    +mrs: !16126
    
    5
    +
    
    6
    +description:
    
    7
    +  GHC now supports ``-march=native`` on x86 and x86_64. It probes the CPU of the
    
    8
    +  machine running GHC and enables all of the corresponding ``-m...`` CPU-feature
    
    9
    +  options automatically (such as ``-msse4.2``, ``-mavx2``, ``-mbmi2`` and
    
    10
    +  ``-mfma``), for both the native code generator and the LLVM backend. The
    
    11
    +  detected features are enabled in addition to any explicitly requested feature
    
    12
    +  flags. The flag is rejected for non-x86 targets and when cross-compiling.

  • changelog.d/print-enabled-cpu-features
    ... ... @@ -8,9 +8,11 @@ description:
    8 8
       prints a JSON object describing the CPU features currently enabled for code
    
    9 9
       generation, together with a set of ``-m...`` flags that reproduce the
    
    10 10
       effective feature set for the current target.
    
    11
    -  Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected. ::
    
    11
    +  Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected, so the flag
    
    12
    +  can also be used to inspect which features :ghc-flag:`-march=native` detected
    
    13
    +  and enabled. ::
    
    12 14
     
    
    13
    -    $ ghc -mavx2 --print-enabled-cpu-features
    
    15
    +    $ ghc -march=native --print-enabled-cpu-features
    
    14 16
         {"tag":"enabled-cpu-features","version":1,"target":"x86_64-linux-gnu",
    
    15
    -     "features":["SSE","SSE2","SSE3","SSSE3","SSE4.1","SSE4.2","AVX","AVX2"],
    
    16
    -     "as_m_flags":["-mavx2"]}
    17
    +     "features":["SSE","SSE2","SSE3","SSSE3","SSE4.1","SSE4.2","AVX","AVX2","BMI1","BMI2","FMA"],
    
    18
    +     "as_m_flags":["-mavx2","-mbmi2","-mfma"]}

  • compiler/GHC/Driver/CpuFeatures.hs
    1
    +{-# LANGUAGE CPP #-}
    
    2
    +
    
    3
    +module GHC.Driver.CpuFeatures
    
    4
    +  ( X86CpuFeature(..)
    
    5
    +  , cachedX86CpuFeatures
    
    6
    +  ) where
    
    7
    +
    
    8
    +import GHC.Prelude
    
    9
    +
    
    10
    +import Data.Word (Word64)
    
    11
    +import System.IO.Unsafe (unsafePerformIO)
    
    12
    +
    
    13
    +-- | x86 CPU features understood by GHC's native CPU feature probe.
    
    14
    +data X86CpuFeature
    
    15
    +  = SSE2
    
    16
    +  | SSE3
    
    17
    +  | SSSE3
    
    18
    +  | SSE4_1
    
    19
    +  | SSE4_2
    
    20
    +  | AVX
    
    21
    +  | AVX2
    
    22
    +  | AVX512F
    
    23
    +  | AVX512BW
    
    24
    +  | AVX512CD
    
    25
    +  | AVX512DQ
    
    26
    +  | AVX512VL
    
    27
    +  | BMI1
    
    28
    +  | BMI2
    
    29
    +  | FMA
    
    30
    +  | GFNI
    
    31
    +  deriving (Eq, Ord, Show)
    
    32
    +
    
    33
    +-- | Decode the bitmask returned by 'ghc_detect_x86_cpu_features'.
    
    34
    +--
    
    35
    +-- NOTE: Bit positions must match the enum in @compiler/cbits/cpu_features_x86.c@.
    
    36
    +decodeX86CpuFeatureMask :: Word64 -> [X86CpuFeature]
    
    37
    +decodeX86CpuFeatureMask mask =
    
    38
    +  [ feat
    
    39
    +  | (bit_ix, feat) <- cpuFeatureBitLayout
    
    40
    +  , testBit mask bit_ix
    
    41
    +  ]
    
    42
    +
    
    43
    +-- | Low-level FFI access to the C probe.
    
    44
    +detectX86CpuFeatureMask :: IO Word64
    
    45
    +#if defined(javascript_HOST_ARCH)
    
    46
    +detectX86CpuFeatureMask = pure 0
    
    47
    +#else
    
    48
    +detectX86CpuFeatureMask = c_ghc_detect_x86_cpu_features
    
    49
    +#endif
    
    50
    +
    
    51
    +-- | Probe host x86 CPU features and decode them into an ordered feature list.
    
    52
    +detectX86CpuFeatures :: IO [X86CpuFeature]
    
    53
    +detectX86CpuFeatures = decodeX86CpuFeatureMask <$> detectX86CpuFeatureMask
    
    54
    +
    
    55
    +-- | The host's x86 CPU features, probed once and memoized.
    
    56
    +--
    
    57
    +-- CPUID results are constant for the lifetime of the process, so probing more
    
    58
    +-- than once (e.g. once per @-march=native@ in a command line or file pragma)
    
    59
    +-- is wasteful. The 'unsafePerformIO' is safe here because the probe result never changes.
    
    60
    +cachedX86CpuFeatures :: [X86CpuFeature]
    
    61
    +cachedX86CpuFeatures = unsafePerformIO detectX86CpuFeatures
    
    62
    +{-# NOINLINE cachedX86CpuFeatures #-}
    
    63
    +
    
    64
    +cpuFeatureBitLayout :: [(Int, X86CpuFeature)]
    
    65
    +cpuFeatureBitLayout =
    
    66
    +  [ (0,  SSE2)
    
    67
    +  , (1,  SSE3)
    
    68
    +  , (2,  SSSE3)
    
    69
    +  , (3,  SSE4_1)
    
    70
    +  , (4,  SSE4_2)
    
    71
    +  , (5,  AVX)
    
    72
    +  , (6,  AVX2)
    
    73
    +  , (7,  AVX512F)
    
    74
    +  , (8,  AVX512BW)
    
    75
    +  , (9,  AVX512CD)
    
    76
    +  , (10, AVX512DQ)
    
    77
    +  , (11, AVX512VL)
    
    78
    +  , (12, BMI1)
    
    79
    +  , (13, BMI2)
    
    80
    +  , (14, FMA)
    
    81
    +  , (15, GFNI)
    
    82
    +  ]
    
    83
    +
    
    84
    +#if !defined(javascript_HOST_ARCH)
    
    85
    +foreign import ccall unsafe "ghc_detect_x86_cpu_features"
    
    86
    +  c_ghc_detect_x86_cpu_features :: IO Word64
    
    87
    +#endif

  • compiler/GHC/Driver/DynFlags.hs
    ... ... @@ -470,6 +470,8 @@ data DynFlags = DynFlags {
    470 470
       fma                   :: Bool, -- ^ Enable FMA instructions.
    
    471 471
       gfni                  :: Bool, -- ^ Enable GFNI Instructions.
    
    472 472
       la664                 :: Bool, -- ^ Enable LA664 instructions
    
    473
    +  marchNative           :: Bool, -- ^ @-march=native@ was requested; the host
    
    474
    +                                 -- CPU features are applied during flag parsing.
    
    473 475
     
    
    474 476
       -- Constants used to control the amount of optimization done.
    
    475 477
     
    
    ... ... @@ -760,6 +762,7 @@ defaultDynFlags mySettings =
    760 762
             gfni = False,
    
    761 763
             -- For LoongArch, la464 is used by default.
    
    762 764
             la664 = False,
    
    765
    +        marchNative = False,
    
    763 766
     
    
    764 767
             maxInlineAllocSize = 128,
    
    765 768
             maxInlineMemcpyInsns = 32,
    

  • compiler/GHC/Driver/Session.hs
    ... ... @@ -245,6 +245,8 @@ import GHC.Platform
    245 245
     import GHC.Platform.Ways
    
    246 246
     import GHC.Platform.Profile
    
    247 247
     import GHC.Platform.ArchOS
    
    248
    +import GHC.Platform.Host (hostPlatformArch)
    
    249
    +import qualified GHC.Driver.CpuFeatures as Cpu
    
    248 250
     
    
    249 251
     import GHC.Unit.Types
    
    250 252
     import GHC.Unit.Parser
    
    ... ... @@ -906,8 +908,12 @@ parseDynamicFlagsFull activeFlags cmdline logger dflags0 args = do
    906 908
       unless (null errs) $ liftIO $ throwGhcExceptionIO $ errorsToGhcException $
    
    907 909
         map ((rdr . ppr . getLoc &&& unLoc) . errMsg) $ errs
    
    908 910
     
    
    911
    +  -- Apply -march=native: probe the host CPU and enable the matching feature
    
    912
    +  -- flags. This needs IO (CPUID), so it cannot live in the pure flag handlers.
    
    913
    +  dflags1' <- applyMarchNative dflags1
    
    914
    +
    
    909 915
       -- check for disabled flags in safe haskell
    
    910
    -  let (dflags2, sh_warns) = safeFlagCheck cmdline dflags1
    
    916
    +  let (dflags2, sh_warns) = safeFlagCheck cmdline dflags1'
    
    911 917
           theWays = ways dflags2
    
    912 918
     
    
    913 919
       unless (allowed_combination theWays) $ liftIO $
    
    ... ... @@ -1743,6 +1749,7 @@ dynamic_flags_deps = [
    1743 1749
       , make_ord_flag defGhcFlag "mavx512vl"    (noArg (\d -> d { avx512vl = True }))
    
    1744 1750
       , make_ord_flag defGhcFlag "mfma"         (noArg (\d -> d { fma = True }))
    
    1745 1751
       , make_ord_flag defGhcFlag "mgfni"        (noArg (\d -> d { gfni = True }))
    
    1752
    +  , make_ord_flag defGhcFlag "march=native" (noArg (\d -> d { marchNative = True }))
    
    1746 1753
     
    
    1747 1754
     
    
    1748 1755
       , make_ord_flag defGhcFlag "mla664"       (noArg (\d -> d { la664 = True }))
    
    ... ... @@ -3806,6 +3813,59 @@ x86AsMFlags dflags =
    3806 3813
     
    
    3807 3814
         gfniFlags = [ "-mgfni" | gfni dflags ]
    
    3808 3815
     
    
    3816
    +-- | Apply a requested @-march=native@ by probing the host CPU and enabling the
    
    3817
    +-- matching CPU-feature flags.
    
    3818
    +--
    
    3819
    +-- This runs in 'parseDynamicFlagsFull' rather than in a flag handler because the
    
    3820
    +-- CPUID probe needs 'IO', whereas flag handlers are pure. The detected features
    
    3821
    +-- are folded into the existing CPU-feature fields of 'DynFlags' so that 'makeDynFlagsConsistent'
    
    3822
    +-- and the backends treat them exactly like the corresponding @-m...@ flags.
    
    3823
    +applyMarchNative :: MonadIO m => DynFlags -> m DynFlags
    
    3824
    +applyMarchNative dflags
    
    3825
    +  | not (marchNative dflags) = return dflags
    
    3826
    +  | otherwise = do
    
    3827
    +      let arch = platformArch (targetPlatform dflags)
    
    3828
    +      unless (arch == ArchX86 || arch == ArchX86_64) $ liftIO $
    
    3829
    +        throwGhcExceptionIO $ CmdLineError
    
    3830
    +          "-march=native is only supported on x86 and x86_64 targets"
    
    3831
    +      unless (arch == hostPlatformArch) $ liftIO $
    
    3832
    +        throwGhcExceptionIO $ CmdLineError
    
    3833
    +          "-march=native is not supported when cross-compiling"
    
    3834
    +      return (applyX86CpuFeatures Cpu.cachedX86CpuFeatures dflags)
    
    3835
    +
    
    3836
    +-- | Enable the 'DynFlags' CPU-feature fields corresponding to a probed set of
    
    3837
    +-- host x86 features. SSE/AVX and BMI levels are collapsed to their maximum,
    
    3838
    +-- since 'sseAvxVersion' and 'bmiVersion' each record a single level.
    
    3839
    +applyX86CpuFeatures :: [Cpu.X86CpuFeature] -> DynFlags -> DynFlags
    
    3840
    +applyX86CpuFeatures feats dflags = dflags
    
    3841
    +  { sseAvxVersion = foldr (max . Just) (sseAvxVersion dflags) sseLevels
    
    3842
    +  , bmiVersion    = foldr (max . Just) (bmiVersion dflags)    bmiLevels
    
    3843
    +  , avx512f  = avx512f dflags  || has Cpu.AVX512F
    
    3844
    +  , avx512bw = avx512bw dflags || has Cpu.AVX512BW
    
    3845
    +  , avx512cd = avx512cd dflags || has Cpu.AVX512CD
    
    3846
    +  , avx512dq = avx512dq dflags || has Cpu.AVX512DQ
    
    3847
    +  , avx512vl = avx512vl dflags || has Cpu.AVX512VL
    
    3848
    +  , fma      = fma dflags      || has Cpu.FMA
    
    3849
    +  , gfni     = gfni dflags     || has Cpu.GFNI
    
    3850
    +  }
    
    3851
    +  where
    
    3852
    +    has feat = feat `elem` feats
    
    3853
    +    sseLevels = [ lvl | feat <- feats, Just lvl <- [sseLevelOf feat] ]
    
    3854
    +    bmiLevels = [ lvl | feat <- feats, Just lvl <- [bmiLevelOf feat] ]
    
    3855
    +    sseLevelOf feat = case feat of
    
    3856
    +      Cpu.SSE2   -> Just SSE2
    
    3857
    +      Cpu.SSE3   -> Just SSE3
    
    3858
    +      Cpu.SSSE3  -> Just SSSE3
    
    3859
    +      Cpu.SSE4_1 -> Just SSE4
    
    3860
    +      Cpu.SSE4_2 -> Just SSE42
    
    3861
    +      Cpu.AVX    -> Just AVX1
    
    3862
    +      Cpu.AVX2   -> Just AVX2
    
    3863
    +      _          -> Nothing
    
    3864
    +    bmiLevelOf feat = case feat of
    
    3865
    +      Cpu.BMI1 -> Just BMI1
    
    3866
    +      Cpu.BMI2 -> Just BMI2
    
    3867
    +      _        -> Nothing
    
    3868
    +
    
    3809 3869
     -- | Query if the target RTS has the given 'Ways'. It's computed from
    
    3810 3870
     -- the @"RTS ways"@ field in the settings file.
    
    3811 3871
     targetHasRTSWays :: DynFlags -> Ways -> Bool
    

  • compiler/cbits/cpu_features_x86.c
    1
    +#include <HsFFI.h>
    
    2
    +#include <stdint.h>
    
    3
    +
    
    4
    +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    
    5
    +#include <immintrin.h>
    
    6
    +#include <intrin.h>
    
    7
    +#endif
    
    8
    +
    
    9
    +#if !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
    
    10
    +#include <cpuid.h>
    
    11
    +#endif
    
    12
    +
    
    13
    +#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
    
    14
    +#include <sys/sysctl.h>
    
    15
    +#endif
    
    16
    +
    
    17
    +enum {
    
    18
    +  GHC_X86_FEAT_SSE2 = 0,
    
    19
    +  GHC_X86_FEAT_SSE3,
    
    20
    +  GHC_X86_FEAT_SSSE3,
    
    21
    +  GHC_X86_FEAT_SSE4_1,
    
    22
    +  GHC_X86_FEAT_SSE4_2,
    
    23
    +  GHC_X86_FEAT_AVX,
    
    24
    +  GHC_X86_FEAT_AVX2,
    
    25
    +  GHC_X86_FEAT_AVX512F,
    
    26
    +  GHC_X86_FEAT_AVX512BW,
    
    27
    +  GHC_X86_FEAT_AVX512CD,
    
    28
    +  GHC_X86_FEAT_AVX512DQ,
    
    29
    +  GHC_X86_FEAT_AVX512VL,
    
    30
    +  GHC_X86_FEAT_BMI1,
    
    31
    +  GHC_X86_FEAT_BMI2,
    
    32
    +  GHC_X86_FEAT_FMA,
    
    33
    +  GHC_X86_FEAT_GFNI
    
    34
    +};
    
    35
    +
    
    36
    +#define SET_FEAT(mask, bit) ((mask) |= ((HsWord64)1ULL << (bit)))
    
    37
    +
    
    38
    +static int ghc_cpuid_count(uint32_t leaf, uint32_t subleaf,
    
    39
    +                           uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    
    40
    +{
    
    41
    +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    
    42
    +  int regs[4];
    
    43
    +  __cpuidex(regs, (int)leaf, (int)subleaf);
    
    44
    +  *a = (uint32_t)regs[0];
    
    45
    +  *b = (uint32_t)regs[1];
    
    46
    +  *c = (uint32_t)regs[2];
    
    47
    +  *d = (uint32_t)regs[3];
    
    48
    +  return 1;
    
    49
    +#elif defined(__i386__) || defined(__x86_64__)
    
    50
    +  return __get_cpuid_count(leaf, subleaf, a, b, c, d);
    
    51
    +#else
    
    52
    +  (void)leaf;
    
    53
    +  (void)subleaf;
    
    54
    +  (void)a;
    
    55
    +  (void)b;
    
    56
    +  (void)c;
    
    57
    +  (void)d;
    
    58
    +  return 0;
    
    59
    +#endif
    
    60
    +}
    
    61
    +
    
    62
    +static uint64_t ghc_xgetbv0(void)
    
    63
    +{
    
    64
    +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    
    65
    +  return (uint64_t)_xgetbv(0);
    
    66
    +#elif defined(__i386__) || defined(__x86_64__)
    
    67
    +  uint32_t eax, edx;
    
    68
    +  __asm__ volatile(".byte 0x0f, 0x01, 0xd0" /* xgetbv */
    
    69
    +                   : "=a"(eax), "=d"(edx)
    
    70
    +                   : "c"(0));
    
    71
    +  return ((uint64_t)edx << 32) | (uint64_t)eax;
    
    72
    +#else
    
    73
    +  return 0;
    
    74
    +#endif
    
    75
    +}
    
    76
    +
    
    77
    +#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
    
    78
    +/* Query a macOS CPU-capability sysctl, e.g. "hw.optional.avx512f". */
    
    79
    +static int ghc_macos_sysctl_flag(const char *name)
    
    80
    +{
    
    81
    +  int result = 0;
    
    82
    +  size_t len = sizeof(result);
    
    83
    +  if (sysctlbyname(name, &result, &len, NULL, 0) != 0) {
    
    84
    +    return 0;
    
    85
    +  }
    
    86
    +  return result != 0;
    
    87
    +}
    
    88
    +#endif
    
    89
    +
    
    90
    +HsWord64 ghc_detect_x86_cpu_features(void)
    
    91
    +{
    
    92
    +  HsWord64 feats = 0;
    
    93
    +
    
    94
    +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
    
    95
    +  uint32_t a, b, c, d;
    
    96
    +  uint32_t max_basic = 0;
    
    97
    +
    
    98
    +  if (!ghc_cpuid_count(0, 0, &a, &b, &c, &d)) {
    
    99
    +    return 0;
    
    100
    +  }
    
    101
    +  max_basic = a;
    
    102
    +  if (max_basic < 1) {
    
    103
    +    return 0;
    
    104
    +  }
    
    105
    +
    
    106
    +  ghc_cpuid_count(1, 0, &a, &b, &c, &d);
    
    107
    +
    
    108
    +  {
    
    109
    +    int has_sse2    = !!(d & (1u << 26));
    
    110
    +    int has_sse3    = !!(c & (1u << 0));
    
    111
    +    int has_ssse3   = !!(c & (1u << 9));
    
    112
    +    int has_sse4_1  = !!(c & (1u << 19));
    
    113
    +    int has_sse4_2  = !!(c & (1u << 20));
    
    114
    +    int has_fma_hw  = !!(c & (1u << 12));
    
    115
    +    int has_avx_hw  = !!(c & (1u << 28));
    
    116
    +    int has_osxsave = !!(c & (1u << 27));
    
    117
    +
    
    118
    +    int avx_usable = 0;
    
    119
    +    int avx512_usable = 0;
    
    120
    +
    
    121
    +    if (has_osxsave) {
    
    122
    +      uint64_t xcr0 = ghc_xgetbv0();
    
    123
    +      avx_usable = ((xcr0 & 0x6u) == 0x6u);      /* XMM + YMM state */
    
    124
    +      avx512_usable = ((xcr0 & 0xE6u) == 0xE6u); /* XMM+YMM+opmask+ZMM */
    
    125
    +    }
    
    126
    +
    
    127
    +#if defined(__APPLE__)
    
    128
    +    /* On x86_64 macOS the kernel enables AVX-512 XSAVE state lazily: XCR0
    
    129
    +       reads back with the opmask/ZMM bits clear until a process first faults
    
    130
    +       on an AVX-512 instruction, so the XCR0 check above is a false negative
    
    131
    +       on AVX-512-capable Macs. Use the OS feature query instead. Checking
    
    132
    +       AVX512F alone suffices here; the AVX-512 sub-features (BW/CD/DQ/VL) are
    
    133
    +       still decoded from CPUID leaf 7 below.
    
    134
    +
    
    135
    +       Refs:
    
    136
    +         https://zenn.dev/mod_poppo/articles/detect-processor-features-x86?locale=en#notes-on-detecting-avx-512-on-macos
    
    137
    +         https://github.com/minoki/haskell-cpu-features */
    
    138
    +    avx512_usable = ghc_macos_sysctl_flag("hw.optional.avx512f");
    
    139
    +#endif
    
    140
    +
    
    141
    +    if (has_sse2) {
    
    142
    +      SET_FEAT(feats, GHC_X86_FEAT_SSE2);
    
    143
    +    }
    
    144
    +    if (has_sse3) {
    
    145
    +      SET_FEAT(feats, GHC_X86_FEAT_SSE3);
    
    146
    +    }
    
    147
    +    if (has_ssse3) {
    
    148
    +      SET_FEAT(feats, GHC_X86_FEAT_SSSE3);
    
    149
    +    }
    
    150
    +    if (has_sse4_1) {
    
    151
    +      SET_FEAT(feats, GHC_X86_FEAT_SSE4_1);
    
    152
    +    }
    
    153
    +    if (has_sse4_2) {
    
    154
    +      SET_FEAT(feats, GHC_X86_FEAT_SSE4_2);
    
    155
    +    }
    
    156
    +    if (has_avx_hw && avx_usable) {
    
    157
    +      SET_FEAT(feats, GHC_X86_FEAT_AVX);
    
    158
    +    }
    
    159
    +    if (has_fma_hw && avx_usable) {
    
    160
    +      SET_FEAT(feats, GHC_X86_FEAT_FMA);
    
    161
    +    }
    
    162
    +
    
    163
    +    if (max_basic >= 7 && ghc_cpuid_count(7, 0, &a, &b, &c, &d)) {
    
    164
    +      int has_bmi1     = !!(b & (1u << 3));
    
    165
    +      int has_avx2_hw  = !!(b & (1u << 5));
    
    166
    +      int has_bmi2     = !!(b & (1u << 8));
    
    167
    +      int has_avx512f  = !!(b & (1u << 16));
    
    168
    +      int has_avx512dq = !!(b & (1u << 17));
    
    169
    +      int has_avx512cd = !!(b & (1u << 28));
    
    170
    +      int has_avx512bw = !!(b & (1u << 30));
    
    171
    +      int has_avx512vl = !!(b & (1u << 31));
    
    172
    +      int has_gfni     = !!(c & (1u << 8));
    
    173
    +
    
    174
    +      if (has_bmi1) {
    
    175
    +        SET_FEAT(feats, GHC_X86_FEAT_BMI1);
    
    176
    +      }
    
    177
    +      if (has_bmi2) {
    
    178
    +        SET_FEAT(feats, GHC_X86_FEAT_BMI2);
    
    179
    +      }
    
    180
    +      if (avx_usable && has_avx2_hw) {
    
    181
    +        SET_FEAT(feats, GHC_X86_FEAT_AVX2);
    
    182
    +      }
    
    183
    +
    
    184
    +      if (avx512_usable && has_avx512f) {
    
    185
    +        SET_FEAT(feats, GHC_X86_FEAT_AVX512F);
    
    186
    +        if (has_avx512bw) {
    
    187
    +          SET_FEAT(feats, GHC_X86_FEAT_AVX512BW);
    
    188
    +        }
    
    189
    +        if (has_avx512cd) {
    
    190
    +          SET_FEAT(feats, GHC_X86_FEAT_AVX512CD);
    
    191
    +        }
    
    192
    +        if (has_avx512dq) {
    
    193
    +          SET_FEAT(feats, GHC_X86_FEAT_AVX512DQ);
    
    194
    +        }
    
    195
    +        if (has_avx512vl) {
    
    196
    +          SET_FEAT(feats, GHC_X86_FEAT_AVX512VL);
    
    197
    +        }
    
    198
    +      }
    
    199
    +
    
    200
    +      if (has_gfni) {
    
    201
    +        SET_FEAT(feats, GHC_X86_FEAT_GFNI);
    
    202
    +      }
    
    203
    +    }
    
    204
    +  }
    
    205
    +#endif
    
    206
    +
    
    207
    +  return feats;
    
    208
    +}

  • compiler/ghc.cabal.in
    ... ... @@ -187,6 +187,7 @@ Library
    187 187
         else
    
    188 188
           c-sources:
    
    189 189
             cbits/cutils.c
    
    190
    +        cbits/cpu_features_x86.c
    
    190 191
             cbits/genSym.c
    
    191 192
             cbits/keepCAFsForGHCi.c
    
    192 193
     
    
    ... ... @@ -514,6 +515,7 @@ Library
    514 515
             GHC.Driver.Config.StgToCmm
    
    515 516
             GHC.Driver.Config.Tidy
    
    516 517
             GHC.Driver.Config.StgToJS
    
    518
    +        GHC.Driver.CpuFeatures
    
    517 519
             GHC.Driver.DynFlags
    
    518 520
             GHC.Driver.IncludeSpecs
    
    519 521
             GHC.Driver.Downsweep
    

  • docs/users_guide/expected-undocumented-flags.txt
    ... ... @@ -75,6 +75,7 @@
    75 75
     -instantiated-with
    
    76 76
     -keep-hi-file
    
    77 77
     -keep-o-file
    
    78
    +-march=native
    
    78 79
     -n
    
    79 80
     -no-keep-hi-file
    
    80 81
     -no-keep-o-file
    

  • docs/users_guide/using.rst
    ... ... @@ -496,7 +496,9 @@ The available mode flags are:
    496 496
         Print a JSON object describing the CPU features currently enabled for code
    
    497 497
         generation, together with a set of ``-m...`` flags that reproduce the
    
    498 498
         effective feature set for the current target.
    
    499
    -    Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected.
    
    499
    +    Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected, so this flag
    
    500
    +    can also be used to inspect which features :ghc-flag:`-march=native` detected
    
    501
    +    and enabled.
    
    500 502
     
    
    501 503
     .. ghc-flag:: --print-debug-on
    
    502 504
         :shortdesc: print whether GHC was built with ``-DDEBUG``
    
    ... ... @@ -1854,6 +1856,34 @@ Some flags only make sense for particular target platforms.
    1854 1856
         so this flag has no effect when used with the :ref:`native code generator <native-code-gen>`
    
    1855 1857
         or the :ref:`LLVM backend <llvm-code-gen>`.
    
    1856 1858
     
    
    1859
    +.. ghc-flag:: -march=native
    
    1860
    +    :shortdesc: (x86 only) Enable all CPU features supported by the host
    
    1861
    +    :type: dynamic
    
    1862
    +    :category: platform-options
    
    1863
    +
    
    1864
    +    (x86/x86_64 only) Probe the CPU of the machine running GHC and enable all of
    
    1865
    +    the corresponding ``-m...`` CPU-feature options automatically (for example
    
    1866
    +    ``-msse4.2``, ``-mavx2``, ``-mbmi2``, ``-mfma``). The detected features apply
    
    1867
    +    to both the :ref:`native code generator <native-code-gen>` and the
    
    1868
    +    :ref:`LLVM backend <llvm-code-gen>`.
    
    1869
    +
    
    1870
    +    The detected features are enabled *in addition* to any CPU-feature flags you
    
    1871
    +    pass explicitly, regardless of their order on the command line; ``-march=native``
    
    1872
    +    never disables a feature.
    
    1873
    +
    
    1874
    +    The features that were detected and enabled can be inspected with
    
    1875
    +    :ghc-flag:`--print-enabled-cpu-features`.
    
    1876
    +
    
    1877
    +    .. warning::
    
    1878
    +
    
    1879
    +        Code compiled with ``-march=native`` may use instructions that are not
    
    1880
    +        available on other CPUs, and is therefore not portable to a different
    
    1881
    +        machine.
    
    1882
    +
    
    1883
    +    Only x86 and x86_64 targets are supported so far; the flag is rejected on
    
    1884
    +    other targets. It is also rejected when cross-compiling, since the host CPU
    
    1885
    +    is then unrelated to the target.
    
    1886
    +
    
    1857 1887
     Haddock
    
    1858 1888
     -------
    
    1859 1889
     
    

  • testsuite/tests/codeGen/should_gen_asm/all.T
    ... ... @@ -17,6 +17,11 @@ test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip),
    17 17
                                when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2'])
    
    18 18
     test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip),
    
    19 19
                                        when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx'])
    
    20
    +# -march=native probes the host CPU, so gate on the host actually having SSE4.2
    
    21
    +# (have_cpu_feature reports nothing under cross, skipping the test there too).
    
    22
    +test('march-native-enables-popcnt',
    
    23
    +     [unless((arch('x86_64') or arch('i386')) and have_cpu_feature('sse4_2'), skip),
    
    24
    +      when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-march=native'])
    
    20 25
     test('avx512-int64-mul', [unless(arch('x86_64'), skip),
    
    21 26
                               when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512dq -mavx512vl'])
    
    22 27
     test('avx512-int64-minmax', [unless(arch('x86_64'), skip),
    

  • testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.asm
    1
    +popcnt(?![0-9])
    \ No newline at end of file

  • testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.hs
    1
    +-- `-march=native` enables the host's CPU features. On a host with SSE4.2
    
    2
    +-- (gated in all.T via have_cpu_feature) this makes popCount compile to a
    
    3
    +-- `popcnt` instruction rather than the SSE2-baseline software fallback.
    
    4
    +import Data.Bits
    
    5
    +
    
    6
    +{-# NOINLINE foo #-}
    
    7
    +foo :: Int -> Int
    
    8
    +foo x = 1 + popCount x
    
    9
    +
    
    10
    +main :: IO ()
    
    11
    +main = print (foo 42)

  • testsuite/tests/driver/all.T
    ... ... @@ -7,6 +7,12 @@ def normalise_unknown_flag(msg):
    7 7
         m = re.search(r'unrecognised flag: \S+', msg)
    
    8 8
         return m.group(0) + '\n' if m else msg
    
    9 9
     
    
    10
    +def normalise_march_native_error(msg):
    
    11
    +    # Keep only the stable '-march=native ...' diagnostic; the program-name
    
    12
    +    # prefix and any usage trailer vary across configurations.
    
    13
    +    m = re.search(r'-march=native is [^\n]+', msg)
    
    14
    +    return m.group(0) + '\n' if m else msg
    
    15
    +
    
    10 16
     test('driver011', [extra_files(['A011.hs'])], makefile_test, ['test011'])
    
    11 17
     
    
    12 18
     test('driver012', [extra_files(['A012.hs'])], makefile_test, ['test012'])
    
    ... ... @@ -265,6 +271,45 @@ test('print_enabled_cpu_features_unknown_flag',
    265 271
          run_command,
    
    266 272
          ['{compiler} -mavx22 --print-enabled-cpu-features'])
    
    267 273
     
    
    274
    +# -march=native enables at least the SSE2 baseline on x86 and x86_64. The full feature
    
    275
    +# set is host-dependent, so we only assert the always-present baseline.
    
    276
    +test('march_native',
    
    277
    +     [unless(arch('x86_64') or arch('i386'), skip)],
    
    278
    +     run_command,
    
    279
    +     ['{compiler} -march=native --print-enabled-cpu-features | grep -o SSE2'])
    
    280
    +
    
    281
    +# On non-x86 targets -march=native must be rejected.
    
    282
    +test('march_native_unsupported_arch',
    
    283
    +     [when(arch('x86_64') or arch('i386'), skip),
    
    284
    +      normalise_errmsg_fun(normalise_march_native_error), exit_code(1)],
    
    285
    +     run_command,
    
    286
    +     ['{compiler} -march=native --print-enabled-cpu-features'])
    
    287
    +
    
    288
    +# -march=native is additive: its feature set is a superset of the default set.
    
    289
    +# We extract the "features" arrays with and without the flag and assert that no
    
    290
    +# baseline feature is dropped: 'grep -vxF -f native.txt base.txt' prints any
    
    291
    +# baseline feature absent from the -march=native set, of which we expect none.
    
    292
    +# (grep exits 1 when it prints nothing, so '|| true' keeps the success case from
    
    293
    +# failing the test; the empty-stdout check is what enforces the assertion.)
    
    294
    +# This avoids hard-coding the host-specific feature set.
    
    295
    +test('march_native_superset',
    
    296
    +     [unless(arch('x86_64') or arch('i386'), skip)],
    
    297
    +     run_command,
    
    298
    +     ['{compiler} --print-enabled-cpu-features | '
    
    299
    +      'sed \'s/.*"features":\\[//;s/].*//;s/"//g\' | tr \',\' \'\\n\' > base.txt && '
    
    300
    +      '{compiler} -march=native --print-enabled-cpu-features | '
    
    301
    +      'sed \'s/.*"features":\\[//;s/].*//;s/"//g\' | tr \',\' \'\\n\' > native.txt && '
    
    302
    +      '(grep -vxF -f native.txt base.txt || true)'])
    
    303
    +
    
    304
    +# -march=native is additive with explicit -m flags, regardless of order: an
    
    305
    +# explicitly requested feature (here AVX2, forced on independently of the host) is
    
    306
    +# still present whether the flag comes before or after -march=native.
    
    307
    +test('march_native_additive',
    
    308
    +     [unless(arch('x86_64') or arch('i386'), skip)],
    
    309
    +     run_command,
    
    310
    +     ['{compiler} -mavx2 -march=native --print-enabled-cpu-features | grep -o AVX2 && '
    
    311
    +      '{compiler} -march=native -mavx2 --print-enabled-cpu-features | grep -o AVX2'])
    
    312
    +
    
    268 313
     test('T10219', normal, run_command,
    
    269 314
          # `-x hspp` in make mode should work.
    
    270 315
          # Note: need to specify `-x hspp` before the filename.
    

  • testsuite/tests/driver/march_native.stdout
    1
    +SSE2

  • testsuite/tests/driver/march_native_additive.stdout
    1
    +AVX2
    
    2
    +AVX2

  • testsuite/tests/driver/march_native_unsupported_arch.stderr
    1
    +-march=native is only supported on x86 and x86_64 targets