Simon Jakobi pushed to branch wip/sjakobi/T25450-march-native at Glasgow Haskell Compiler / GHC
Commits:
-
12b5f01f
by Simon Jakobi at 2026-06-09T11:28:55+02:00
16 changed files:
- + changelog.d/march-native
- changelog.d/print-enabled-cpu-features
- + compiler/GHC/Driver/CpuFeatures.hs
- compiler/GHC/Driver/DynFlags.hs
- compiler/GHC/Driver/Session.hs
- + compiler/cbits/cpu_features_x86.c
- compiler/ghc.cabal.in
- docs/users_guide/expected-undocumented-flags.txt
- docs/users_guide/using.rst
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.asm
- + testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.hs
- testsuite/tests/driver/all.T
- + testsuite/tests/driver/march_native.stdout
- + testsuite/tests/driver/march_native_additive.stdout
- + testsuite/tests/driver/march_native_unsupported_arch.stderr
Changes:
| 1 | +section: compiler
|
|
| 2 | +synopsis: Add -march=native flag
|
|
| 3 | +issues: #25450
|
|
| 4 | +mrs: !16126
|
|
| 5 | + |
|
| 6 | +description:
|
|
| 7 | + GHC now supports ``-march=native`` on x86 and x86_64. It probes the CPU of the
|
|
| 8 | + machine running GHC and enables all of the corresponding ``-m...`` CPU-feature
|
|
| 9 | + options automatically (such as ``-msse4.2``, ``-mavx2``, ``-mbmi2`` and
|
|
| 10 | + ``-mfma``), for both the native code generator and the LLVM backend. The
|
|
| 11 | + detected features are enabled in addition to any explicitly requested feature
|
|
| 12 | + flags. The flag is rejected for non-x86 targets and when cross-compiling. |
| ... | ... | @@ -8,9 +8,11 @@ description: |
| 8 | 8 | prints a JSON object describing the CPU features currently enabled for code
|
| 9 | 9 | generation, together with a set of ``-m...`` flags that reproduce the
|
| 10 | 10 | effective feature set for the current target.
|
| 11 | - Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected. ::
|
|
| 11 | + Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected, so the flag
|
|
| 12 | + can also be used to inspect which features :ghc-flag:`-march=native` detected
|
|
| 13 | + and enabled. ::
|
|
| 12 | 14 | |
| 13 | - $ ghc -mavx2 --print-enabled-cpu-features
|
|
| 15 | + $ ghc -march=native --print-enabled-cpu-features
|
|
| 14 | 16 | {"tag":"enabled-cpu-features","version":1,"target":"x86_64-linux-gnu",
|
| 15 | - "features":["SSE","SSE2","SSE3","SSSE3","SSE4.1","SSE4.2","AVX","AVX2"],
|
|
| 16 | - "as_m_flags":["-mavx2"]} |
|
| 17 | + "features":["SSE","SSE2","SSE3","SSSE3","SSE4.1","SSE4.2","AVX","AVX2","BMI1","BMI2","FMA"],
|
|
| 18 | + "as_m_flags":["-mavx2","-mbmi2","-mfma"]} |
| 1 | +{-# LANGUAGE CPP #-}
|
|
| 2 | + |
|
| 3 | +module GHC.Driver.CpuFeatures
|
|
| 4 | + ( X86CpuFeature(..)
|
|
| 5 | + , cachedX86CpuFeatures
|
|
| 6 | + ) where
|
|
| 7 | + |
|
| 8 | +import GHC.Prelude
|
|
| 9 | + |
|
| 10 | +import Data.Word (Word64)
|
|
| 11 | +import System.IO.Unsafe (unsafePerformIO)
|
|
| 12 | + |
|
-- | Instruction-set extensions that GHC's host CPU probe can report on x86.
--
-- The constructor order matters: the derived 'Ord' instance follows it.
data X86CpuFeature
  = SSE2 | SSE3 | SSSE3 | SSE4_1 | SSE4_2                  -- SSE levels
  | AVX | AVX2                                             -- AVX levels
  | AVX512F | AVX512BW | AVX512CD | AVX512DQ | AVX512VL    -- AVX-512 subsets
  | BMI1 | BMI2                                            -- BMI levels
  | FMA
  | GFNI
  deriving (Eq, Ord, Show)
|
|
| 32 | + |
|
-- | Turn the bitmask produced by 'ghc_detect_x86_cpu_features' into the list
-- of features whose bit is set, in the order of 'cpuFeatureBitLayout'.
--
-- NOTE: Bit positions must match the enum in @compiler/cbits/cpu_features_x86.c@.
decodeX86CpuFeatureMask :: Word64 -> [X86CpuFeature]
decodeX86CpuFeatureMask mask = map snd (filter isSet cpuFeatureBitLayout)
  where
    -- A layout entry is selected when its bit is set in the mask.
    isSet (ix, _) = testBit mask ix
|
|
| 42 | + |
|
-- | Fetch the raw feature bitmask from the C probe.
--
-- On a JavaScript host the foreign import is compiled out, so the mask is
-- always empty there.
detectX86CpuFeatureMask :: IO Word64
#if !defined(javascript_HOST_ARCH)
detectX86CpuFeatureMask = c_ghc_detect_x86_cpu_features
#else
detectX86CpuFeatureMask = return 0
#endif
|
|
| 50 | + |
|
-- | Run the host CPU probe and translate the resulting bitmask into a list of
-- features.
detectX86CpuFeatures :: IO [X86CpuFeature]
detectX86CpuFeatures = do
  mask <- detectX86CpuFeatureMask
  return (decodeX86CpuFeatureMask mask)
|
|
| 54 | + |
|
{-# NOINLINE cachedX86CpuFeatures #-}
-- | The x86 CPU features of the host, probed once and then shared.
--
-- Why 'unsafePerformIO' is acceptable here: CPUID results do not change during
-- the lifetime of the process, so the probe yields the same answer every time
-- and the value is referentially transparent despite the FFI call. Keeping it
-- as a single non-inlined top-level value also avoids re-probing, e.g. once
-- per @-march=native@ occurrence on a command line or in a file pragma.
cachedX86CpuFeatures :: [X86CpuFeature]
cachedX86CpuFeatures = unsafePerformIO probe
  where
    probe = detectX86CpuFeatures
|
|
| 63 | + |
|
-- | Bit index of every feature in the mask returned by the C probe.
--
-- The position in the list below is the bit number, starting at bit 0, so the
-- order must stay in step with the enum in
-- @compiler/cbits/cpu_features_x86.c@.
cpuFeatureBitLayout :: [(Int, X86CpuFeature)]
cpuFeatureBitLayout = zip [0 ..]
  [ SSE2
  , SSE3
  , SSSE3
  , SSE4_1
  , SSE4_2
  , AVX
  , AVX2
  , AVX512F
  , AVX512BW
  , AVX512CD
  , AVX512DQ
  , AVX512VL
  , BMI1
  , BMI2
  , FMA
  , GFNI
  ]
|
|
| 83 | + |
|
#if !defined(javascript_HOST_ARCH)
-- The C probe defined in @compiler/cbits/cpu_features_x86.c@; it returns the
-- detected features as a bitmask.
foreign import ccall unsafe "ghc_detect_x86_cpu_features" c_ghc_detect_x86_cpu_features
  :: IO Word64
#endif
| ... | ... | @@ -470,6 +470,8 @@ data DynFlags = DynFlags { |
| 470 | 470 | fma :: Bool, -- ^ Enable FMA instructions.
|
| 471 | 471 | gfni :: Bool, -- ^ Enable GFNI Instructions.
|
| 472 | 472 | la664 :: Bool, -- ^ Enable LA664 instructions
|
| 473 | + marchNative :: Bool, -- ^ @-march=native@ was requested; the host
|
|
| 474 | + -- CPU features are applied during flag parsing.
|
|
| 473 | 475 | |
| 474 | 476 | -- Constants used to control the amount of optimization done.
|
| 475 | 477 | |
| ... | ... | @@ -760,6 +762,7 @@ defaultDynFlags mySettings = |
| 760 | 762 | gfni = False,
|
| 761 | 763 | -- For LoongArch, la464 is used by default.
|
| 762 | 764 | la664 = False,
|
| 765 | + marchNative = False,
|
|
| 763 | 766 | |
| 764 | 767 | maxInlineAllocSize = 128,
|
| 765 | 768 | maxInlineMemcpyInsns = 32,
|
| ... | ... | @@ -245,6 +245,8 @@ import GHC.Platform |
| 245 | 245 | import GHC.Platform.Ways
|
| 246 | 246 | import GHC.Platform.Profile
|
| 247 | 247 | import GHC.Platform.ArchOS
|
| 248 | +import GHC.Platform.Host (hostPlatformArch)
|
|
| 249 | +import qualified GHC.Driver.CpuFeatures as Cpu
|
|
| 248 | 250 | |
| 249 | 251 | import GHC.Unit.Types
|
| 250 | 252 | import GHC.Unit.Parser
|
| ... | ... | @@ -906,8 +908,12 @@ parseDynamicFlagsFull activeFlags cmdline logger dflags0 args = do |
| 906 | 908 | unless (null errs) $ liftIO $ throwGhcExceptionIO $ errorsToGhcException $
|
| 907 | 909 | map ((rdr . ppr . getLoc &&& unLoc) . errMsg) $ errs
|
| 908 | 910 | |
| 911 | + -- Apply -march=native: probe the host CPU and enable the matching feature
|
|
| 912 | + -- flags. This needs IO (CPUID), so it cannot live in the pure flag handlers.
|
|
| 913 | + dflags1' <- applyMarchNative dflags1
|
|
| 914 | + |
|
| 909 | 915 | -- check for disabled flags in safe haskell
|
| 910 | - let (dflags2, sh_warns) = safeFlagCheck cmdline dflags1
|
|
| 916 | + let (dflags2, sh_warns) = safeFlagCheck cmdline dflags1'
|
|
| 911 | 917 | theWays = ways dflags2
|
| 912 | 918 | |
| 913 | 919 | unless (allowed_combination theWays) $ liftIO $
|
| ... | ... | @@ -1743,6 +1749,7 @@ dynamic_flags_deps = [ |
| 1743 | 1749 | , make_ord_flag defGhcFlag "mavx512vl" (noArg (\d -> d { avx512vl = True }))
|
| 1744 | 1750 | , make_ord_flag defGhcFlag "mfma" (noArg (\d -> d { fma = True }))
|
| 1745 | 1751 | , make_ord_flag defGhcFlag "mgfni" (noArg (\d -> d { gfni = True }))
|
| 1752 | + , make_ord_flag defGhcFlag "march=native" (noArg (\d -> d { marchNative = True }))
|
|
| 1746 | 1753 | |
| 1747 | 1754 | |
| 1748 | 1755 | , make_ord_flag defGhcFlag "mla664" (noArg (\d -> d { la664 = True }))
|
| ... | ... | @@ -3806,6 +3813,59 @@ x86AsMFlags dflags = |
| 3806 | 3813 | |
| 3807 | 3814 | gfniFlags = [ "-mgfni" | gfni dflags ]
|
| 3808 | 3815 | |
-- | Honour a requested @-march=native@: check that the target is supported,
-- then switch on the CPU-feature flags matching the probed host CPU.
--
-- The work happens in 'parseDynamicFlagsFull' instead of a flag handler
-- because flag handlers are pure while the CPUID probe needs 'IO'. Detected
-- features are merged into the ordinary feature fields of 'DynFlags', so
-- 'makeDynFlagsConsistent' and the backends see them exactly as if the
-- corresponding @-m...@ flags had been given.
--
-- Throws a 'CmdLineError' when the target is not x86/x86_64, or when the
-- target architecture differs from the host architecture.
applyMarchNative :: MonadIO m => DynFlags -> m DynFlags
applyMarchNative dflags =
  if marchNative dflags
    then do
      unless (targetArch `elem` [ArchX86, ArchX86_64]) $
        reject "-march=native is only supported on x86 and x86_64 targets"
      unless (targetArch == hostPlatformArch) $
        reject "-march=native is not supported when cross-compiling"
      return (applyX86CpuFeatures Cpu.cachedX86CpuFeatures dflags)
    else return dflags
  where
    targetArch = platformArch (targetPlatform dflags)
    -- Abort flag parsing with a command-line error.
    reject msg = liftIO (throwGhcExceptionIO (CmdLineError msg))
|
|
| 3835 | + |
|
-- | Switch on the 'DynFlags' CPU-feature fields that correspond to a probed
-- set of host x86 features.
--
-- 'sseAvxVersion' and 'bmiVersion' each hold a single level, so the detected
-- SSE/AVX and BMI features are reduced to the highest level seen, and that is
-- combined with whatever level was already requested. The boolean fields are
-- only ever turned on, never off.
applyX86CpuFeatures :: [Cpu.X86CpuFeature] -> DynFlags -> DynFlags
applyX86CpuFeatures feats dflags = dflags
  { sseAvxVersion = newSseAvx
  , bmiVersion    = newBmi
  , avx512f       = detected Cpu.AVX512F  || avx512f dflags
  , avx512bw      = detected Cpu.AVX512BW || avx512bw dflags
  , avx512cd      = detected Cpu.AVX512CD || avx512cd dflags
  , avx512dq      = detected Cpu.AVX512DQ || avx512dq dflags
  , avx512vl      = detected Cpu.AVX512VL || avx512vl dflags
  , fma           = detected Cpu.FMA      || fma dflags
  , gfni          = detected Cpu.GFNI     || gfni dflags
  }
  where
    detected feat = feat `elem` feats

    -- Highest of the already-requested level and every detected level.
    newSseAvx =
      foldr (\lvl best -> max (Just lvl) best) (sseAvxVersion dflags)
            [ lvl | Just lvl <- map sseLevel feats ]
    newBmi =
      foldr (\lvl best -> max (Just lvl) best) (bmiVersion dflags)
            [ lvl | Just lvl <- map bmiLevel feats ]

    -- Probed feature -> SSE/AVX level, where there is one.
    sseLevel Cpu.SSE2   = Just SSE2
    sseLevel Cpu.SSE3   = Just SSE3
    sseLevel Cpu.SSSE3  = Just SSSE3
    sseLevel Cpu.SSE4_1 = Just SSE4
    sseLevel Cpu.SSE4_2 = Just SSE42
    sseLevel Cpu.AVX    = Just AVX1
    sseLevel Cpu.AVX2   = Just AVX2
    sseLevel _          = Nothing

    -- Probed feature -> BMI level, where there is one.
    bmiLevel Cpu.BMI1 = Just BMI1
    bmiLevel Cpu.BMI2 = Just BMI2
    bmiLevel _        = Nothing
|
|
| 3868 | + |
|
| 3809 | 3869 | -- | Query if the target RTS has the given 'Ways'. It's computed from
|
| 3810 | 3870 | -- the @"RTS ways"@ field in the settings file.
|
| 3811 | 3871 | targetHasRTSWays :: DynFlags -> Ways -> Bool
|
| 1 | +#include <HsFFI.h>
|
|
| 2 | +#include <stdint.h>
|
|
| 3 | + |
|
| 4 | +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
|
|
| 5 | +#include <immintrin.h>
|
|
| 6 | +#include <intrin.h>
|
|
| 7 | +#endif
|
|
| 8 | + |
|
| 9 | +#if !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
|
|
| 10 | +#include <cpuid.h>
|
|
| 11 | +#endif
|
|
| 12 | + |
|
| 13 | +#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
|
|
| 14 | +#include <sys/sysctl.h>
|
|
| 15 | +#endif
|
|
| 16 | + |
|
/* Bit positions inside the mask returned by ghc_detect_x86_cpu_features().
 *
 * NOTE: these values must stay in step with the bit layout decoded in
 * compiler/GHC/Driver/CpuFeatures.hs.
 */
enum {
    GHC_X86_FEAT_SSE2     = 0,
    GHC_X86_FEAT_SSE3     = 1,
    GHC_X86_FEAT_SSSE3    = 2,
    GHC_X86_FEAT_SSE4_1   = 3,
    GHC_X86_FEAT_SSE4_2   = 4,
    GHC_X86_FEAT_AVX      = 5,
    GHC_X86_FEAT_AVX2     = 6,
    GHC_X86_FEAT_AVX512F  = 7,
    GHC_X86_FEAT_AVX512BW = 8,
    GHC_X86_FEAT_AVX512CD = 9,
    GHC_X86_FEAT_AVX512DQ = 10,
    GHC_X86_FEAT_AVX512VL = 11,
    GHC_X86_FEAT_BMI1     = 12,
    GHC_X86_FEAT_BMI2     = 13,
    GHC_X86_FEAT_FMA      = 14,
    GHC_X86_FEAT_GFNI     = 15
};

/* Set bit number `bit` in the 64-bit feature mask `mask`. */
#define SET_FEAT(mask, bit) ((mask) |= (((HsWord64)1) << (bit)))
|
|
| 37 | + |
|
/* Execute CPUID for the given leaf/subleaf and store EAX..EDX in *a..*d.
 *
 * Returns nonzero when the registers were filled in. Returns 0 when the query
 * could not be made; in particular it always returns 0 when this file is not
 * compiled for x86, in which case the outputs are left untouched.
 */
static int ghc_cpuid_count(uint32_t leaf, uint32_t subleaf,
                           uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    int ok = 0;

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    /* The MSVC intrinsic has no failure indication. */
    int out[4];
    __cpuidex(out, (int)leaf, (int)subleaf);
    *a = (uint32_t)out[0];
    *b = (uint32_t)out[1];
    *c = (uint32_t)out[2];
    *d = (uint32_t)out[3];
    ok = 1;
#elif defined(__i386__) || defined(__x86_64__)
    ok = __get_cpuid_count(leaf, subleaf, a, b, c, d);
#else
    (void)leaf;
    (void)subleaf;
    (void)a;
    (void)b;
    (void)c;
    (void)d;
#endif

    return ok;
}
|
|
| 61 | + |
|
/* Read extended control register 0 (XCR0) with XGETBV.
 *
 * Returns 0 when this file is not compiled for x86. The caller in this file
 * only invokes it after CPUID has reported OSXSAVE.
 */
static uint64_t ghc_xgetbv0(void)
{
    uint64_t xcr0 = 0;

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    xcr0 = (uint64_t)_xgetbv(0);
#elif defined(__i386__) || defined(__x86_64__)
    uint32_t lo, hi;
    /* The instruction is emitted as raw bytes; ECX = 0 selects XCR0 and the
       result comes back in EDX:EAX. */
    __asm__ volatile(".byte 0x0f, 0x01, 0xd0" /* xgetbv */
                     : "=a"(lo), "=d"(hi)
                     : "c"(0));
    xcr0 = ((uint64_t)hi << 32) | (uint64_t)lo;
#endif

    return xcr0;
}
|
|
| 76 | + |
|
#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
/* Look up an integer macOS sysctl by name, e.g. "hw.optional.avx512f".
 *
 * Returns 1 when the lookup succeeds and the value is nonzero, 0 otherwise
 * (including when the lookup itself fails).
 */
static int ghc_macos_sysctl_flag(const char *name)
{
    int value = 0;
    size_t size = sizeof(value);
    int rc = sysctlbyname(name, &value, &size, NULL, 0);

    return (rc == 0) && (value != 0);
}
#endif
|
|
| 89 | + |
|
| 90 | +HsWord64 ghc_detect_x86_cpu_features(void)
|
|
| 91 | +{
|
|
| 92 | + HsWord64 feats = 0;
|
|
| 93 | + |
|
| 94 | +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
|
|
| 95 | + uint32_t a, b, c, d;
|
|
| 96 | + uint32_t max_basic = 0;
|
|
| 97 | + |
|
| 98 | + if (!ghc_cpuid_count(0, 0, &a, &b, &c, &d)) {
|
|
| 99 | + return 0;
|
|
| 100 | + }
|
|
| 101 | + max_basic = a;
|
|
| 102 | + if (max_basic < 1) {
|
|
| 103 | + return 0;
|
|
| 104 | + }
|
|
| 105 | + |
|
| 106 | + ghc_cpuid_count(1, 0, &a, &b, &c, &d);
|
|
| 107 | + |
|
| 108 | + {
|
|
| 109 | + int has_sse2 = !!(d & (1u << 26));
|
|
| 110 | + int has_sse3 = !!(c & (1u << 0));
|
|
| 111 | + int has_ssse3 = !!(c & (1u << 9));
|
|
| 112 | + int has_sse4_1 = !!(c & (1u << 19));
|
|
| 113 | + int has_sse4_2 = !!(c & (1u << 20));
|
|
| 114 | + int has_fma_hw = !!(c & (1u << 12));
|
|
| 115 | + int has_avx_hw = !!(c & (1u << 28));
|
|
| 116 | + int has_osxsave = !!(c & (1u << 27));
|
|
| 117 | + |
|
| 118 | + int avx_usable = 0;
|
|
| 119 | + int avx512_usable = 0;
|
|
| 120 | + |
|
| 121 | + if (has_osxsave) {
|
|
| 122 | + uint64_t xcr0 = ghc_xgetbv0();
|
|
| 123 | + avx_usable = ((xcr0 & 0x6u) == 0x6u); /* XMM + YMM state */
|
|
| 124 | + avx512_usable = ((xcr0 & 0xE6u) == 0xE6u); /* XMM+YMM+opmask+ZMM */
|
|
| 125 | + }
|
|
| 126 | + |
|
| 127 | +#if defined(__APPLE__)
|
|
| 128 | + /* On x86_64 macOS the kernel enables AVX-512 XSAVE state lazily: XCR0
|
|
| 129 | + reads back with the opmask/ZMM bits clear until a process first faults
|
|
| 130 | + on an AVX-512 instruction, so the XCR0 check above is a false negative
|
|
| 131 | + on AVX-512-capable Macs. Use the OS feature query instead. Checking
|
|
| 132 | + AVX512F alone suffices here; the AVX-512 sub-features (BW/CD/DQ/VL) are
|
|
| 133 | + still decoded from CPUID leaf 7 below.
|
|
| 134 | + |
|
| 135 | + Refs:
|
|
| 136 | + https://zenn.dev/mod_poppo/articles/detect-processor-features-x86?locale=en#notes-on-detecting-avx-512-on-macos
|
|
| 137 | + https://github.com/minoki/haskell-cpu-features */
|
|
| 138 | + avx512_usable = ghc_macos_sysctl_flag("hw.optional.avx512f");
|
|
| 139 | +#endif
|
|
| 140 | + |
|
| 141 | + if (has_sse2) {
|
|
| 142 | + SET_FEAT(feats, GHC_X86_FEAT_SSE2);
|
|
| 143 | + }
|
|
| 144 | + if (has_sse3) {
|
|
| 145 | + SET_FEAT(feats, GHC_X86_FEAT_SSE3);
|
|
| 146 | + }
|
|
| 147 | + if (has_ssse3) {
|
|
| 148 | + SET_FEAT(feats, GHC_X86_FEAT_SSSE3);
|
|
| 149 | + }
|
|
| 150 | + if (has_sse4_1) {
|
|
| 151 | + SET_FEAT(feats, GHC_X86_FEAT_SSE4_1);
|
|
| 152 | + }
|
|
| 153 | + if (has_sse4_2) {
|
|
| 154 | + SET_FEAT(feats, GHC_X86_FEAT_SSE4_2);
|
|
| 155 | + }
|
|
| 156 | + if (has_avx_hw && avx_usable) {
|
|
| 157 | + SET_FEAT(feats, GHC_X86_FEAT_AVX);
|
|
| 158 | + }
|
|
| 159 | + if (has_fma_hw && avx_usable) {
|
|
| 160 | + SET_FEAT(feats, GHC_X86_FEAT_FMA);
|
|
| 161 | + }
|
|
| 162 | + |
|
| 163 | + if (max_basic >= 7 && ghc_cpuid_count(7, 0, &a, &b, &c, &d)) {
|
|
| 164 | + int has_bmi1 = !!(b & (1u << 3));
|
|
| 165 | + int has_avx2_hw = !!(b & (1u << 5));
|
|
| 166 | + int has_bmi2 = !!(b & (1u << 8));
|
|
| 167 | + int has_avx512f = !!(b & (1u << 16));
|
|
| 168 | + int has_avx512dq = !!(b & (1u << 17));
|
|
| 169 | + int has_avx512cd = !!(b & (1u << 28));
|
|
| 170 | + int has_avx512bw = !!(b & (1u << 30));
|
|
| 171 | + int has_avx512vl = !!(b & (1u << 31));
|
|
| 172 | + int has_gfni = !!(c & (1u << 8));
|
|
| 173 | + |
|
| 174 | + if (has_bmi1) {
|
|
| 175 | + SET_FEAT(feats, GHC_X86_FEAT_BMI1);
|
|
| 176 | + }
|
|
| 177 | + if (has_bmi2) {
|
|
| 178 | + SET_FEAT(feats, GHC_X86_FEAT_BMI2);
|
|
| 179 | + }
|
|
| 180 | + if (avx_usable && has_avx2_hw) {
|
|
| 181 | + SET_FEAT(feats, GHC_X86_FEAT_AVX2);
|
|
| 182 | + }
|
|
| 183 | + |
|
| 184 | + if (avx512_usable && has_avx512f) {
|
|
| 185 | + SET_FEAT(feats, GHC_X86_FEAT_AVX512F);
|
|
| 186 | + if (has_avx512bw) {
|
|
| 187 | + SET_FEAT(feats, GHC_X86_FEAT_AVX512BW);
|
|
| 188 | + }
|
|
| 189 | + if (has_avx512cd) {
|
|
| 190 | + SET_FEAT(feats, GHC_X86_FEAT_AVX512CD);
|
|
| 191 | + }
|
|
| 192 | + if (has_avx512dq) {
|
|
| 193 | + SET_FEAT(feats, GHC_X86_FEAT_AVX512DQ);
|
|
| 194 | + }
|
|
| 195 | + if (has_avx512vl) {
|
|
| 196 | + SET_FEAT(feats, GHC_X86_FEAT_AVX512VL);
|
|
| 197 | + }
|
|
| 198 | + }
|
|
| 199 | + |
|
| 200 | + if (has_gfni) {
|
|
| 201 | + SET_FEAT(feats, GHC_X86_FEAT_GFNI);
|
|
| 202 | + }
|
|
| 203 | + }
|
|
| 204 | + }
|
|
| 205 | +#endif
|
|
| 206 | + |
|
| 207 | + return feats;
|
|
| 208 | +} |
| ... | ... | @@ -187,6 +187,7 @@ Library |
| 187 | 187 | else
|
| 188 | 188 | c-sources:
|
| 189 | 189 | cbits/cutils.c
|
| 190 | + cbits/cpu_features_x86.c
|
|
| 190 | 191 | cbits/genSym.c
|
| 191 | 192 | cbits/keepCAFsForGHCi.c
|
| 192 | 193 | |
| ... | ... | @@ -514,6 +515,7 @@ Library |
| 514 | 515 | GHC.Driver.Config.StgToCmm
|
| 515 | 516 | GHC.Driver.Config.Tidy
|
| 516 | 517 | GHC.Driver.Config.StgToJS
|
| 518 | + GHC.Driver.CpuFeatures
|
|
| 517 | 519 | GHC.Driver.DynFlags
|
| 518 | 520 | GHC.Driver.IncludeSpecs
|
| 519 | 521 | GHC.Driver.Downsweep
|
| ... | ... | @@ -75,6 +75,7 @@ |
| 75 | 75 | -instantiated-with
|
| 76 | 76 | -keep-hi-file
|
| 77 | 77 | -keep-o-file
|
| 78 | +-march=native
|
|
| 78 | 79 | -n
|
| 79 | 80 | -no-keep-hi-file
|
| 80 | 81 | -no-keep-o-file
|
| ... | ... | @@ -496,7 +496,9 @@ The available mode flags are: |
| 496 | 496 | Print a JSON object describing the CPU features currently enabled for code
|
| 497 | 497 | generation, together with a set of ``-m...`` flags that reproduce the
|
| 498 | 498 | effective feature set for the current target.
|
| 499 | - Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected.
|
|
| 499 | + Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected, so this flag
|
|
| 500 | + can also be used to inspect which features :ghc-flag:`-march=native` detected
|
|
| 501 | + and enabled.
|
|
| 500 | 502 | |
| 501 | 503 | .. ghc-flag:: --print-debug-on
|
| 502 | 504 | :shortdesc: print whether GHC was built with ``-DDEBUG``
|
| ... | ... | @@ -1854,6 +1856,34 @@ Some flags only make sense for particular target platforms. |
| 1854 | 1856 | so this flag has no effect when used with the :ref:`native code generator <native-code-gen>`
|
| 1855 | 1857 | or the :ref:`LLVM backend <llvm-code-gen>`.
|
| 1856 | 1858 | |
| 1859 | +.. ghc-flag:: -march=native
|
|
| 1860 | + :shortdesc: (x86 only) Enable all CPU features supported by the host
|
|
| 1861 | + :type: dynamic
|
|
| 1862 | + :category: platform-options
|
|
| 1863 | + |
|
| 1864 | + (x86/x86_64 only) Probe the CPU of the machine running GHC and enable all of
|
|
| 1865 | + the corresponding ``-m...`` CPU-feature options automatically (for example
|
|
| 1866 | + ``-msse4.2``, ``-mavx2``, ``-mbmi2``, ``-mfma``). The detected features apply
|
|
| 1867 | + to both the :ref:`native code generator <native-code-gen>` and the
|
|
| 1868 | + :ref:`LLVM backend <llvm-code-gen>`.
|
|
| 1869 | + |
|
| 1870 | + The detected features are enabled *in addition* to any CPU-feature flags you
|
|
| 1871 | + pass explicitly, regardless of their order on the command line; ``-march=native``
|
|
| 1872 | + never disables a feature.
|
|
| 1873 | + |
|
| 1874 | + The features that were detected and enabled can be inspected with
|
|
| 1875 | + :ghc-flag:`--print-enabled-cpu-features`.
|
|
| 1876 | + |
|
| 1877 | + .. warning::
|
|
| 1878 | + |
|
| 1879 | + Code compiled with ``-march=native`` may use instructions that are not
|
|
| 1880 | + available on other CPUs, and is therefore not portable to a different
|
|
| 1881 | + machine.
|
|
| 1882 | + |
|
| 1883 | + Only x86 and x86_64 targets are supported so far; the flag is rejected on
|
|
| 1884 | + other targets. It is also rejected when cross-compiling, since the host CPU
|
|
| 1885 | + is then unrelated to the target.
|
|
| 1886 | + |
|
| 1857 | 1887 | Haddock
|
| 1858 | 1888 | -------
|
| 1859 | 1889 |
| ... | ... | @@ -17,6 +17,11 @@ test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip), |
| 17 | 17 | when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2'])
|
| 18 | 18 | test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip),
|
| 19 | 19 | when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx'])
|
| 20 | +# -march=native probes the host CPU, so gate on the host actually having SSE4.2
|
|
| 21 | +# (have_cpu_feature reports nothing under cross, skipping the test there too).
|
|
| 22 | +test('march-native-enables-popcnt',
|
|
| 23 | + [unless((arch('x86_64') or arch('i386')) and have_cpu_feature('sse4_2'), skip),
|
|
| 24 | + when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-march=native'])
|
|
| 20 | 25 | test('avx512-int64-mul', [unless(arch('x86_64'), skip),
|
| 21 | 26 | when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512dq -mavx512vl'])
|
| 22 | 27 | test('avx512-int64-minmax', [unless(arch('x86_64'), skip),
|
| 1 | +popcnt(?![0-9]) |
|
| \ No newline at end of file |
| 1 | +-- `-march=native` enables the host's CPU features. On a host with SSE4.2
|
|
| 2 | +-- (gated in all.T via have_cpu_feature) this makes popCount compile to a
|
|
| 3 | +-- `popcnt` instruction rather than the SSE2-baseline software fallback.
|
|
| 4 | +import Data.Bits
|
|
| 5 | + |
|
-- | One more than the number of set bits in the argument.
--
-- NOINLINE: presumably so that the 'popCount' call survives as real code in
-- the generated assembly that the test greps, rather than being folded away
-- at the call site in 'main'.
{-# NOINLINE foo #-}
foo :: Int -> Int
foo x = popCount x + 1
|
|
| 9 | + |
|
-- | Print the result of 'foo' for a fixed input.
main :: IO ()
main = print $ foo 42
| ... | ... | @@ -7,6 +7,12 @@ def normalise_unknown_flag(msg): |
| 7 | 7 | m = re.search(r'unrecognised flag: \S+', msg)
|
| 8 | 8 | return m.group(0) + '\n' if m else msg
|
| 9 | 9 | |
def normalise_march_native_error(msg):
    # Reduce stderr to the stable '-march=native is ...' diagnostic line. The
    # program-name prefix and any usage trailer vary across configurations.
    # Output without that diagnostic is returned unchanged.
    found = re.search(r'-march=native is [^\n]+', msg)
    if found is None:
        return msg
    return found.group(0) + '\n'
|
|
| 15 | + |
|
| 10 | 16 | test('driver011', [extra_files(['A011.hs'])], makefile_test, ['test011'])
|
| 11 | 17 | |
| 12 | 18 | test('driver012', [extra_files(['A012.hs'])], makefile_test, ['test012'])
|
| ... | ... | @@ -265,6 +271,45 @@ test('print_enabled_cpu_features_unknown_flag', |
| 265 | 271 | run_command,
|
| 266 | 272 | ['{compiler} -mavx22 --print-enabled-cpu-features'])
|
| 267 | 273 | |
| 274 | +# -march=native enables at least the x86_64 baseline (SSE2). The full feature
|
|
| 275 | +# set is host-dependent, so we only assert the always-present baseline.
|
|
| 276 | +test('march_native',
|
|
| 277 | + [unless(arch('x86_64') or arch('i386'), skip)],
|
|
| 278 | + run_command,
|
|
| 279 | + ['{compiler} -march=native --print-enabled-cpu-features | grep -o SSE2'])
|
|
| 280 | + |
|
| 281 | +# On non-x86 targets -march=native must be rejected.
|
|
| 282 | +test('march_native_unsupported_arch',
|
|
| 283 | + [when(arch('x86_64') or arch('i386'), skip),
|
|
| 284 | + normalise_errmsg_fun(normalise_march_native_error), exit_code(1)],
|
|
| 285 | + run_command,
|
|
| 286 | + ['{compiler} -march=native --print-enabled-cpu-features'])
|
|
| 287 | + |
|
| 288 | +# -march=native is additive: its feature set is a superset of the default set.
|
|
| 289 | +# We extract the "features" arrays with and without the flag and assert that no
|
|
| 290 | +# baseline feature is dropped: 'grep -vxF -f native.txt base.txt' prints any
|
|
| 291 | +# baseline feature absent from the -march=native set, of which we expect none.
|
|
| 292 | +# (grep exits 1 when it prints nothing, so '|| true' keeps the success case from
|
|
| 293 | +# failing the test; the empty-stdout check is what enforces the assertion.)
|
|
| 294 | +# This avoids hard-coding the host-specific feature set.
|
|
| 295 | +test('march_native_superset',
|
|
| 296 | + [unless(arch('x86_64') or arch('i386'), skip)],
|
|
| 297 | + run_command,
|
|
| 298 | + ['{compiler} --print-enabled-cpu-features | '
|
|
| 299 | + 'sed \'s/.*"features":\\[//;s/].*//;s/"//g\' | tr \',\' \'\\n\' > base.txt && '
|
|
| 300 | + '{compiler} -march=native --print-enabled-cpu-features | '
|
|
| 301 | + 'sed \'s/.*"features":\\[//;s/].*//;s/"//g\' | tr \',\' \'\\n\' > native.txt && '
|
|
| 302 | + '(grep -vxF -f native.txt base.txt || true)'])
|
|
| 303 | + |
|
| 304 | +# -march=native is additive with explicit -m flags, regardless of order: an
|
|
| 305 | +# explicitly requested feature (here AVX2, forced on independent of the host) is
|
|
| 306 | +# still present whether the flag comes before or after -march=native.
|
|
| 307 | +test('march_native_additive',
|
|
| 308 | + [unless(arch('x86_64') or arch('i386'), skip)],
|
|
| 309 | + run_command,
|
|
| 310 | + ['{compiler} -mavx2 -march=native --print-enabled-cpu-features | grep -o AVX2 && '
|
|
| 311 | + '{compiler} -march=native -mavx2 --print-enabled-cpu-features | grep -o AVX2'])
|
|
| 312 | + |
|
| 268 | 313 | test('T10219', normal, run_command,
|
| 269 | 314 | # `-x hspp` in make mode should work.
|
| 270 | 315 | # Note: need to specify `-x hspp` before the filename.
|
| 1 | +SSE2 |
| 1 | +AVX2
|
|
| 2 | +AVX2 |
| 1 | +-march=native is only supported on x86 and x86_64 targets |