Simon Jakobi pushed to branch wip/sjakobi/T25450-march-native at Glasgow Haskell Compiler / GHC
Commits:
12b5f01f by Simon Jakobi at 2026-06-09T11:28:55+02:00
Implement -march=native for x86/x86_64
Add a -march=native flag that probes the host CPU at parse time via an
in-process CPUID/XGETBV helper and enables the matching CPU-feature
DynFlags, so the effect applies to both the NCG and LLVM backends.
The flag handler only records a marker, since flag parsing is pure; the
probe and feature application run in parseDynamicFlagsFull, guarded
against non-x86 targets and cross-compilation. Detected SSE/AVX and BMI
levels are collapsed to their maximum and folded additively into the
existing feature flags, so explicit -m... options are never disabled.
The probe is memoized for the lifetime of the process.
On x86_64 macOS the kernel enables AVX-512 XSAVE state lazily, so XCR0
reads back with the opmask/ZMM bits clear until a process first faults on
an AVX-512 instruction. To avoid a false negative there, AVX512F is
queried via sysctlbyname(hw.optional.avx512f); the sub-features
(BW/CD/DQ/VL) are still decoded from CPUID leaf 7, and AVX/AVX2/FMA stay
on the XCR0 path, which is correct on macOS.
The flag is registered in expected-undocumented-flags.txt due to #27321.
Thanks to @aratamizuki for help with AVX-512 detection on macOS.
Closes #25450
Assisted-by: Claude Opus 4.8
- - - - -
16 changed files:
- + changelog.d/march-native
- changelog.d/print-enabled-cpu-features
- + compiler/GHC/Driver/CpuFeatures.hs
- compiler/GHC/Driver/DynFlags.hs
- compiler/GHC/Driver/Session.hs
- + compiler/cbits/cpu_features_x86.c
- compiler/ghc.cabal.in
- docs/users_guide/expected-undocumented-flags.txt
- docs/users_guide/using.rst
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.asm
- + testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.hs
- testsuite/tests/driver/all.T
- + testsuite/tests/driver/march_native.stdout
- + testsuite/tests/driver/march_native_additive.stdout
- + testsuite/tests/driver/march_native_unsupported_arch.stderr
Changes:
=====================================
changelog.d/march-native
=====================================
@@ -0,0 +1,12 @@
+section: compiler
+synopsis: Add -march=native flag
+issues: #25450
+mrs: !16126
+
+description:
+ GHC now supports ``-march=native`` on x86 and x86_64. It probes the CPU of the
+ machine running GHC and enables all of the corresponding ``-m...`` CPU-feature
+ options automatically (such as ``-msse4.2``, ``-mavx2``, ``-mbmi2`` and
+ ``-mfma``), for both the native code generator and the LLVM backend. The
+ detected features are enabled in addition to any explicitly requested feature
+ flags. The flag is rejected for non-x86 targets and when cross-compiling.
=====================================
changelog.d/print-enabled-cpu-features
=====================================
@@ -8,9 +8,11 @@ description:
prints a JSON object describing the CPU features currently enabled for code
generation, together with a set of ``-m...`` flags that reproduce the
effective feature set for the current target.
- Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected. ::
+ Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected, so the flag
+ can also be used to inspect which features :ghc-flag:`-march=native` detected
+ and enabled. ::
- $ ghc -mavx2 --print-enabled-cpu-features
+ $ ghc -march=native --print-enabled-cpu-features
{"tag":"enabled-cpu-features","version":1,"target":"x86_64-linux-gnu",
- "features":["SSE","SSE2","SSE3","SSSE3","SSE4.1","SSE4.2","AVX","AVX2"],
- "as_m_flags":["-mavx2"]}
+ "features":["SSE","SSE2","SSE3","SSSE3","SSE4.1","SSE4.2","AVX","AVX2","BMI1","BMI2","FMA"],
+ "as_m_flags":["-mavx2","-mbmi2","-mfma"]}
=====================================
compiler/GHC/Driver/CpuFeatures.hs
=====================================
@@ -0,0 +1,87 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.Driver.CpuFeatures
+ ( X86CpuFeature(..)
+ , cachedX86CpuFeatures
+ ) where
+
+import GHC.Prelude
+
+import Data.Word (Word64)
+import System.IO.Unsafe (unsafePerformIO)
+
+-- | x86 CPU features understood by GHC's native CPU feature probe.
+--
+-- Each constructor corresponds to one bit of the mask returned by the C probe;
+-- the bit assignment lives in 'cpuFeatureBitLayout'. The derived 'Ord'
+-- instance follows the declaration order below.
+data X86CpuFeature
+ = SSE2
+ | SSE3
+ | SSSE3
+ | SSE4_1
+ | SSE4_2
+ | AVX
+ | AVX2
+ | AVX512F
+ | AVX512BW
+ | AVX512CD
+ | AVX512DQ
+ | AVX512VL
+ | BMI1
+ | BMI2
+ | FMA
+ | GFNI
+ deriving (Eq, Ord, Show)
+
+-- | Turn the raw bitmask returned by 'ghc_detect_x86_cpu_features' into the
+-- list of features whose bits are set, in 'cpuFeatureBitLayout' order.
+--
+-- NOTE: Bit positions must match the enum in @compiler/cbits/cpu_features_x86.c@.
+decodeX86CpuFeatureMask :: Word64 -> [X86CpuFeature]
+decodeX86CpuFeatureMask mask =
+  map snd (filter (testBit mask . fst) cpuFeatureBitLayout)
+
+-- | Run the C probe and return its raw feature bitmask.
+--
+-- When GHC itself is built for the JavaScript host the C helper is not
+-- imported, so an empty mask is reported instead.
+detectX86CpuFeatureMask :: IO Word64
+detectX86CpuFeatureMask =
+#if defined(javascript_HOST_ARCH)
+  return 0
+#else
+  c_ghc_detect_x86_cpu_features
+#endif
+
+-- | Query the host through the C probe and decode the resulting mask into a
+-- feature list ordered as in 'cpuFeatureBitLayout'.
+detectX86CpuFeatures :: IO [X86CpuFeature]
+detectX86CpuFeatures = fmap decodeX86CpuFeatureMask detectX86CpuFeatureMask
+
+-- | Host x86 CPU features, probed on first use and shared afterwards.
+--
+-- The CPUID answer is constant for the lifetime of the process, so repeating
+-- the probe (e.g. once per @-march=native@ on a command line or in a file
+-- pragma) would be wasted work. For the same reason wrapping the FFI call in
+-- 'unsafePerformIO' is referentially transparent; the NOINLINE pragma keeps
+-- this a single top-level value rather than a copy per use site.
+cachedX86CpuFeatures :: [X86CpuFeature]
+{-# NOINLINE cachedX86CpuFeatures #-}
+cachedX86CpuFeatures = unsafePerformIO detectX86CpuFeatures
+
+-- | Bit index of every feature in the probe's mask.
+--
+-- Features are listed in bit order starting at bit 0. The order must stay in
+-- sync with the enum in @compiler/cbits/cpu_features_x86.c@.
+cpuFeatureBitLayout :: [(Int, X86CpuFeature)]
+cpuFeatureBitLayout = zip [0 ..]
+  [ SSE2, SSE3, SSSE3, SSE4_1, SSE4_2
+  , AVX, AVX2
+  , AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL
+  , BMI1, BMI2
+  , FMA
+  , GFNI
+  ]
+
+#if !defined(javascript_HOST_ARCH)
+-- | The C probe from @compiler/cbits/cpu_features_x86.c@. It returns one bit
+-- per detected feature (see 'cpuFeatureBitLayout'). It is not imported when
+-- GHC is built for the JavaScript host, where 'detectX86CpuFeatureMask'
+-- returns 0 instead.
+foreign import ccall unsafe "ghc_detect_x86_cpu_features"
+ c_ghc_detect_x86_cpu_features :: IO Word64
+#endif
=====================================
compiler/GHC/Driver/DynFlags.hs
=====================================
@@ -470,6 +470,8 @@ data DynFlags = DynFlags {
fma :: Bool, -- ^ Enable FMA instructions.
gfni :: Bool, -- ^ Enable GFNI Instructions.
la664 :: Bool, -- ^ Enable LA664 instructions
+ marchNative :: Bool, -- ^ @-march=native@ was requested; the host
+ -- CPU features are applied during flag parsing.
-- Constants used to control the amount of optimization done.
@@ -760,6 +762,7 @@ defaultDynFlags mySettings =
gfni = False,
-- For LoongArch, la464 is used by default.
la664 = False,
+ marchNative = False,
maxInlineAllocSize = 128,
maxInlineMemcpyInsns = 32,
=====================================
compiler/GHC/Driver/Session.hs
=====================================
@@ -245,6 +245,8 @@ import GHC.Platform
import GHC.Platform.Ways
import GHC.Platform.Profile
import GHC.Platform.ArchOS
+import GHC.Platform.Host (hostPlatformArch)
+import qualified GHC.Driver.CpuFeatures as Cpu
import GHC.Unit.Types
import GHC.Unit.Parser
@@ -906,8 +908,12 @@ parseDynamicFlagsFull activeFlags cmdline logger dflags0 args = do
unless (null errs) $ liftIO $ throwGhcExceptionIO $ errorsToGhcException $
map ((rdr . ppr . getLoc &&& unLoc) . errMsg) $ errs
+ -- Apply -march=native: probe the host CPU and enable the matching feature
+ -- flags. This needs IO (CPUID), so it cannot live in the pure flag handlers.
+ dflags1' <- applyMarchNative dflags1
+
-- check for disabled flags in safe haskell
- let (dflags2, sh_warns) = safeFlagCheck cmdline dflags1
+ let (dflags2, sh_warns) = safeFlagCheck cmdline dflags1'
theWays = ways dflags2
unless (allowed_combination theWays) $ liftIO $
@@ -1743,6 +1749,7 @@ dynamic_flags_deps = [
, make_ord_flag defGhcFlag "mavx512vl" (noArg (\d -> d { avx512vl = True }))
, make_ord_flag defGhcFlag "mfma" (noArg (\d -> d { fma = True }))
, make_ord_flag defGhcFlag "mgfni" (noArg (\d -> d { gfni = True }))
+ , make_ord_flag defGhcFlag "march=native" (noArg (\d -> d { marchNative = True }))
, make_ord_flag defGhcFlag "mla664" (noArg (\d -> d { la664 = True }))
@@ -3806,6 +3813,59 @@ x86AsMFlags dflags =
gfniFlags = [ "-mgfni" | gfni dflags ]
+-- | Honour a requested @-march=native@: probe the host CPU and switch on the
+-- matching CPU-feature flags. Without the request the flags are returned
+-- unchanged.
+--
+-- This lives in 'parseDynamicFlagsFull' rather than in a flag handler because
+-- the CPUID probe needs 'IO', whereas flag handlers are pure. The detected
+-- features are merged into the existing feature fields of 'DynFlags' so that
+-- 'makeDynFlagsConsistent' and the backends treat them exactly like the
+-- corresponding @-m...@ flags.
+--
+-- Throws a 'CmdLineError' when the target is not x86/x86_64, or when the
+-- target architecture differs from 'hostPlatformArch' (cross-compilation).
+applyMarchNative :: MonadIO m => DynFlags -> m DynFlags
+applyMarchNative dflags
+  | marchNative dflags = do
+      unless targetIsX86 $
+        reject "-march=native is only supported on x86 and x86_64 targets"
+      unless (targetArch == hostPlatformArch) $
+        reject "-march=native is not supported when cross-compiling"
+      return (applyX86CpuFeatures Cpu.cachedX86CpuFeatures dflags)
+  | otherwise = return dflags
+  where
+    targetArch  = platformArch (targetPlatform dflags)
+    targetIsX86 = targetArch == ArchX86 || targetArch == ArchX86_64
+    -- Abort flag parsing with a command-line error.
+    reject msg  = liftIO $ throwGhcExceptionIO $ CmdLineError msg
+
+-- | Switch on the CPU-feature fields of 'DynFlags' for every feature in a
+-- probed host feature list.
+--
+-- 'sseAvxVersion' and 'bmiVersion' each hold a single level, so the detected
+-- SSE/AVX and BMI features are collapsed to the highest level, and that level
+-- only replaces the current one when it is higher. The boolean fields are
+-- OR-ed with the probe result, so nothing that was already enabled is ever
+-- switched off.
+applyX86CpuFeatures :: [Cpu.X86CpuFeature] -> DynFlags -> DynFlags
+applyX86CpuFeatures feats dflags = dflags
+  { sseAvxVersion = foldr (max . Just) (sseAvxVersion dflags) sseLevels
+  , bmiVersion    = foldr (max . Just) (bmiVersion dflags) bmiLevels
+  , avx512f       = detected Cpu.AVX512F  || avx512f dflags
+  , avx512bw      = detected Cpu.AVX512BW || avx512bw dflags
+  , avx512cd      = detected Cpu.AVX512CD || avx512cd dflags
+  , avx512dq      = detected Cpu.AVX512DQ || avx512dq dflags
+  , avx512vl      = detected Cpu.AVX512VL || avx512vl dflags
+  , fma           = detected Cpu.FMA      || fma dflags
+  , gfni          = detected Cpu.GFNI     || gfni dflags
+  }
+  where
+    detected feat = feat `elem` feats
+
+    -- Levels contributed by the probed features; features without a
+    -- corresponding level are dropped.
+    sseLevels = [ lvl | Just lvl <- map sseLevelOf feats ]
+    bmiLevels = [ lvl | Just lvl <- map bmiLevelOf feats ]
+
+    sseLevelOf Cpu.SSE2   = Just SSE2
+    sseLevelOf Cpu.SSE3   = Just SSE3
+    sseLevelOf Cpu.SSSE3  = Just SSSE3
+    sseLevelOf Cpu.SSE4_1 = Just SSE4
+    sseLevelOf Cpu.SSE4_2 = Just SSE42
+    sseLevelOf Cpu.AVX    = Just AVX1
+    sseLevelOf Cpu.AVX2   = Just AVX2
+    sseLevelOf _          = Nothing
+
+    bmiLevelOf Cpu.BMI1 = Just BMI1
+    bmiLevelOf Cpu.BMI2 = Just BMI2
+    bmiLevelOf _        = Nothing
+
-- | Query if the target RTS has the given 'Ways'. It's computed from
-- the @"RTS ways"@ field in the settings file.
targetHasRTSWays :: DynFlags -> Ways -> Bool
=====================================
compiler/cbits/cpu_features_x86.c
=====================================
@@ -0,0 +1,208 @@
+#include <HsFFI.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+#include <immintrin.h>
+#endif
+
+#if !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#endif
+
+#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
+#include <sys/sysctl.h>
+#endif
+
+/* Bit index of each feature in the mask returned by
+ ghc_detect_x86_cpu_features. The Haskell decoder (cpuFeatureBitLayout in
+ compiler/GHC/Driver/CpuFeatures.hs) hard-codes the same indices, so the
+ two lists must be kept in sync. */
+enum {
+ GHC_X86_FEAT_SSE2 = 0,
+ GHC_X86_FEAT_SSE3,
+ GHC_X86_FEAT_SSSE3,
+ GHC_X86_FEAT_SSE4_1,
+ GHC_X86_FEAT_SSE4_2,
+ GHC_X86_FEAT_AVX,
+ GHC_X86_FEAT_AVX2,
+ GHC_X86_FEAT_AVX512F,
+ GHC_X86_FEAT_AVX512BW,
+ GHC_X86_FEAT_AVX512CD,
+ GHC_X86_FEAT_AVX512DQ,
+ GHC_X86_FEAT_AVX512VL,
+ GHC_X86_FEAT_BMI1,
+ GHC_X86_FEAT_BMI2,
+ GHC_X86_FEAT_FMA,
+ GHC_X86_FEAT_GFNI
+};
+
+/* Set bit number `bit` in the 64-bit feature mask `mask` (modifies `mask`). */
+#define SET_FEAT(mask, bit) ((mask) |= ((HsWord64)1ULL << (bit)))
+
+/* Execute CPUID for the given leaf/subleaf and store EAX, EBX, ECX, EDX in
+ *a, *b, *c, *d. The return value is nonzero on success:
+ - MSVC: __cpuidex is called and success is always reported;
+ - GCC/Clang on x86: the result of __get_cpuid_count is forwarded
+ (presumably 0 when the leaf is not supported -- confirm against the
+ compiler's cpuid header);
+ - any other build: the outputs are left untouched and 0 is returned. */
+static int ghc_cpuid_count(uint32_t leaf, uint32_t subleaf,
+ uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
+{
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+ int regs[4];
+ __cpuidex(regs, (int)leaf, (int)subleaf);
+ *a = (uint32_t)regs[0];
+ *b = (uint32_t)regs[1];
+ *c = (uint32_t)regs[2];
+ *d = (uint32_t)regs[3];
+ return 1;
+#elif defined(__i386__) || defined(__x86_64__)
+ return __get_cpuid_count(leaf, subleaf, a, b, c, d);
+#else
+ /* Not an x86 build: silence unused-parameter warnings and report failure. */
+ (void)leaf;
+ (void)subleaf;
+ (void)a;
+ (void)b;
+ (void)c;
+ (void)d;
+ return 0;
+#endif
+}
+
+/* Read extended control register 0 (XGETBV with ECX = 0) and return it as a
+ 64-bit value assembled from EDX:EAX. Returns 0 on non-x86 builds. The only
+ caller in this file invokes it after checking the CPUID OSXSAVE bit. */
+static uint64_t ghc_xgetbv0(void)
+{
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+ return (uint64_t)_xgetbv(0);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t eax, edx;
+ /* The instruction is emitted as raw bytes, presumably so that assemblers
+ without the xgetbv mnemonic still accept it. */
+ __asm__ volatile(".byte 0x0f, 0x01, 0xd0" /* xgetbv */
+ : "=a"(eax), "=d"(edx)
+ : "c"(0));
+ return ((uint64_t)edx << 32) | (uint64_t)eax;
+#else
+ return 0;
+#endif
+}
+
+#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__))
+/* Read a boolean macOS CPU-capability sysctl such as "hw.optional.avx512f".
+   Returns 1 when the lookup succeeds and the value is nonzero, and 0
+   otherwise (including when the lookup itself fails). */
+static int ghc_macos_sysctl_flag(const char *name)
+{
+    int value = 0;
+    size_t value_size = sizeof(value);
+    int rc = sysctlbyname(name, &value, &value_size, NULL, 0);
+
+    return (rc == 0 && value != 0) ? 1 : 0;
+}
+#endif
+
+/* Probe the host CPU and return a bitmask of the detected x86 features, using
+   the GHC_X86_FEAT_* bit indices. Returns 0 on non-x86 builds and whenever
+   CPUID leaf 0 or leaf 1 cannot be queried.
+
+   SSE-level features, BMI1/BMI2 and GFNI are reported straight from CPUID.
+   AVX, AVX2 and FMA additionally require the OS to have enabled XMM and YMM
+   state in XCR0. The AVX-512 features require opmask and ZMM state as well,
+   except on macOS, where the hw.optional.avx512f sysctl is consulted instead. */
+HsWord64 ghc_detect_x86_cpu_features(void)
+{
+    HsWord64 feats = 0;
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+    uint32_t a, b, c, d;
+    uint32_t max_basic;
+
+    /* Leaf 0: EAX is taken as the highest supported basic leaf. */
+    if (!ghc_cpuid_count(0, 0, &a, &b, &c, &d)) {
+        return 0;
+    }
+    max_basic = a;
+    if (max_basic < 1) {
+        return 0;
+    }
+
+    /* Leaf 1 carries the SSE/AVX/FMA/OSXSAVE bits. If the query fails the
+       registers may still hold the leaf-0 answer, which must not be decoded
+       as feature bits. */
+    if (!ghc_cpuid_count(1, 0, &a, &b, &c, &d)) {
+        return 0;
+    }
+
+    {
+        int has_sse2 = !!(d & (1u << 26));
+        int has_sse3 = !!(c & (1u << 0));
+        int has_ssse3 = !!(c & (1u << 9));
+        int has_sse4_1 = !!(c & (1u << 19));
+        int has_sse4_2 = !!(c & (1u << 20));
+        int has_fma_hw = !!(c & (1u << 12));
+        int has_avx_hw = !!(c & (1u << 28));
+        int has_osxsave = !!(c & (1u << 27));
+
+        int avx_usable = 0;
+        int avx512_usable = 0;
+
+        /* XCR0 may only be read once the OS has enabled XSAVE. */
+        if (has_osxsave) {
+            uint64_t xcr0 = ghc_xgetbv0();
+            avx_usable = ((xcr0 & 0x6u) == 0x6u); /* XMM + YMM state */
+            avx512_usable = ((xcr0 & 0xE6u) == 0xE6u); /* XMM+YMM+opmask+ZMM */
+        }
+
+#if defined(__APPLE__)
+        /* On x86_64 macOS the kernel enables AVX-512 XSAVE state lazily: XCR0
+           reads back with the opmask/ZMM bits clear until a process first faults
+           on an AVX-512 instruction, so the XCR0 check above is a false negative
+           on AVX-512-capable Macs. Use the OS feature query instead. Checking
+           AVX512F alone suffices here; the AVX-512 sub-features (BW/CD/DQ/VL) are
+           still decoded from CPUID leaf 7 below.
+
+           Refs:
+           https://zenn.dev/mod_poppo/articles/detect-processor-features-x86?locale=en#...
+           https://github.com/minoki/haskell-cpu-features */
+        avx512_usable = ghc_macos_sysctl_flag("hw.optional.avx512f");
+#endif
+
+        if (has_sse2) {
+            SET_FEAT(feats, GHC_X86_FEAT_SSE2);
+        }
+        if (has_sse3) {
+            SET_FEAT(feats, GHC_X86_FEAT_SSE3);
+        }
+        if (has_ssse3) {
+            SET_FEAT(feats, GHC_X86_FEAT_SSSE3);
+        }
+        if (has_sse4_1) {
+            SET_FEAT(feats, GHC_X86_FEAT_SSE4_1);
+        }
+        if (has_sse4_2) {
+            SET_FEAT(feats, GHC_X86_FEAT_SSE4_2);
+        }
+        if (has_avx_hw && avx_usable) {
+            SET_FEAT(feats, GHC_X86_FEAT_AVX);
+        }
+        if (has_fma_hw && avx_usable) {
+            SET_FEAT(feats, GHC_X86_FEAT_FMA);
+        }
+
+        /* Leaf 7 (subleaf 0) is only queried when leaf 0 advertised it. */
+        if (max_basic >= 7 && ghc_cpuid_count(7, 0, &a, &b, &c, &d)) {
+            int has_bmi1 = !!(b & (1u << 3));
+            int has_avx2_hw = !!(b & (1u << 5));
+            int has_bmi2 = !!(b & (1u << 8));
+            int has_avx512f = !!(b & (1u << 16));
+            int has_avx512dq = !!(b & (1u << 17));
+            int has_avx512cd = !!(b & (1u << 28));
+            int has_avx512bw = !!(b & (1u << 30));
+            int has_avx512vl = !!(b & (1u << 31));
+            int has_gfni = !!(c & (1u << 8));
+
+            if (has_bmi1) {
+                SET_FEAT(feats, GHC_X86_FEAT_BMI1);
+            }
+            if (has_bmi2) {
+                SET_FEAT(feats, GHC_X86_FEAT_BMI2);
+            }
+            if (avx_usable && has_avx2_hw) {
+                SET_FEAT(feats, GHC_X86_FEAT_AVX2);
+            }
+
+            /* The AVX-512 sub-features are only reported together with the
+               AVX512F foundation. */
+            if (avx512_usable && has_avx512f) {
+                SET_FEAT(feats, GHC_X86_FEAT_AVX512F);
+                if (has_avx512bw) {
+                    SET_FEAT(feats, GHC_X86_FEAT_AVX512BW);
+                }
+                if (has_avx512cd) {
+                    SET_FEAT(feats, GHC_X86_FEAT_AVX512CD);
+                }
+                if (has_avx512dq) {
+                    SET_FEAT(feats, GHC_X86_FEAT_AVX512DQ);
+                }
+                if (has_avx512vl) {
+                    SET_FEAT(feats, GHC_X86_FEAT_AVX512VL);
+                }
+            }
+
+            if (has_gfni) {
+                SET_FEAT(feats, GHC_X86_FEAT_GFNI);
+            }
+        }
+    }
+#endif
+
+    return feats;
+}
=====================================
compiler/ghc.cabal.in
=====================================
@@ -187,6 +187,7 @@ Library
else
c-sources:
cbits/cutils.c
+ cbits/cpu_features_x86.c
cbits/genSym.c
cbits/keepCAFsForGHCi.c
@@ -514,6 +515,7 @@ Library
GHC.Driver.Config.StgToCmm
GHC.Driver.Config.Tidy
GHC.Driver.Config.StgToJS
+ GHC.Driver.CpuFeatures
GHC.Driver.DynFlags
GHC.Driver.IncludeSpecs
GHC.Driver.Downsweep
=====================================
docs/users_guide/expected-undocumented-flags.txt
=====================================
@@ -75,6 +75,7 @@
-instantiated-with
-keep-hi-file
-keep-o-file
+-march=native
-n
-no-keep-hi-file
-no-keep-o-file
=====================================
docs/users_guide/using.rst
=====================================
@@ -496,7 +496,9 @@ The available mode flags are:
Print a JSON object describing the CPU features currently enabled for code
generation, together with a set of ``-m...`` flags that reproduce the
effective feature set for the current target.
- Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected.
+ Dynamic options such as ``-mavx2`` and ``-mbmi2`` are respected, so this flag
+ can also be used to inspect which features :ghc-flag:`-march=native` detected
+ and enabled.
.. ghc-flag:: --print-debug-on
:shortdesc: print whether GHC was built with ``-DDEBUG``
@@ -1854,6 +1856,34 @@ Some flags only make sense for particular target platforms.
so this flag has no effect when used with the :ref:`native code generator <native-code-gen>`
or the :ref:`LLVM backend <llvm-code-gen>`.
+.. ghc-flag:: -march=native
+ :shortdesc: (x86 only) Enable all CPU features supported by the host
+ :type: dynamic
+ :category: platform-options
+
+ (x86/x86_64 only) Probe the CPU of the machine running GHC and enable all of
+ the corresponding ``-m...`` CPU-feature options automatically (for example
+ ``-msse4.2``, ``-mavx2``, ``-mbmi2``, ``-mfma``). The detected features apply
+ to both the :ref:`native code generator <native-code-gen>` and the
+ :ref:`LLVM backend <llvm-code-gen>`.
+
+ The detected features are enabled *in addition* to any CPU-feature flags you
+ pass explicitly, regardless of their order on the command line; ``-march=native``
+ never disables a feature.
+
+ The features that were detected and enabled can be inspected with
+ :ghc-flag:`--print-enabled-cpu-features`.
+
+ .. warning::
+
+ Code compiled with ``-march=native`` may use instructions that are not
+ available on other CPUs, and is therefore not portable to a different
+ machine.
+
+ Only x86 and x86_64 targets are supported so far; the flag is rejected on
+ other targets. It is also rejected when cross-compiling, since the host CPU
+ is then unrelated to the target.
+
Haddock
-------
=====================================
testsuite/tests/codeGen/should_gen_asm/all.T
=====================================
@@ -17,6 +17,11 @@ test('msse-option-order', [unless(arch('x86_64') or arch('i386'), skip),
when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-msse4.2 -msse2'])
test('mavx-should-enable-popcnt', [unless(arch('x86_64') or arch('i386'), skip),
when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-mavx'])
+# -march=native probes the host CPU, so gate on the host actually having SSE4.2
+# (have_cpu_feature reports nothing under cross, skipping the test there too).
+# The expected pattern in march-native-enables-popcnt.asm is a `popcnt`
+# mnemonic that is not followed by a digit.
+test('march-native-enables-popcnt',
+ [unless((arch('x86_64') or arch('i386')) and have_cpu_feature('sse4_2'), skip),
+ when(unregisterised(), skip)], compile_grep_asm, ['hs', False, '-march=native'])
test('avx512-int64-mul', [unless(arch('x86_64'), skip),
when(unregisterised(), skip)], compile_grep_asm, ['hs', True, '-mavx512dq -mavx512vl'])
test('avx512-int64-minmax', [unless(arch('x86_64'), skip),
=====================================
testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.asm
=====================================
@@ -0,0 +1 @@
+popcnt(?![0-9])
\ No newline at end of file
=====================================
testsuite/tests/codeGen/should_gen_asm/march-native-enables-popcnt.hs
=====================================
@@ -0,0 +1,11 @@
+-- `-march=native` enables the host's CPU features. On a host with SSE4.2
+-- (gated in all.T via have_cpu_feature) this makes popCount compile to a
+-- `popcnt` instruction rather than the SSE2-baseline software fallback.
+--
+-- The generated assembly is matched against march-native-enables-popcnt.asm.
+import Data.Bits
+
+-- NOINLINE presumably keeps `foo` a separate function so that the popCount is
+-- not folded away at the constant call site in `main` -- TODO confirm.
+{-# NOINLINE foo #-}
+foo :: Int -> Int
+foo x = 1 + popCount x
+
+main :: IO ()
+main = print (foo 42)
=====================================
testsuite/tests/driver/all.T
=====================================
@@ -7,6 +7,12 @@ def normalise_unknown_flag(msg):
m = re.search(r'unrecognised flag: \S+', msg)
return m.group(0) + '\n' if m else msg
+def normalise_march_native_error(msg):
+ # Keep only the stable '-march=native ...' diagnostic; the program-name
+ # prefix and any usage trailer vary across configurations.
+ m = re.search(r'-march=native is [^\n]+', msg)
+ return m.group(0) + '\n' if m else msg
+
test('driver011', [extra_files(['A011.hs'])], makefile_test, ['test011'])
test('driver012', [extra_files(['A012.hs'])], makefile_test, ['test012'])
@@ -265,6 +271,45 @@ test('print_enabled_cpu_features_unknown_flag',
run_command,
['{compiler} -mavx22 --print-enabled-cpu-features'])
+# -march=native enables at least the x86_64 baseline (SSE2). The full feature
+# set is host-dependent, so we only assert the always-present baseline.
+#
+# NOTE(review): unlike march-native-enables-popcnt in codeGen/should_gen_asm,
+# the x86-only tests below are gated on the target arch alone. Presumably they
+# would fail for an x86-targeting cross-compiler on a non-x86 host, where
+# -march=native is rejected -- confirm whether a host/cross gate is needed.
+test('march_native',
+ [unless(arch('x86_64') or arch('i386'), skip)],
+ run_command,
+ ['{compiler} -march=native --print-enabled-cpu-features | grep -o SSE2'])
+
+# On non-x86 targets -march=native must be rejected.
+# The error output is reduced to the diagnostic line by
+# normalise_march_native_error before comparison with the expected stderr.
+test('march_native_unsupported_arch',
+ [when(arch('x86_64') or arch('i386'), skip),
+ normalise_errmsg_fun(normalise_march_native_error), exit_code(1)],
+ run_command,
+ ['{compiler} -march=native --print-enabled-cpu-features'])
+
+# -march=native is additive: its feature set is a superset of the default set.
+# We extract the "features" arrays with and without the flag and assert that no
+# baseline feature is dropped: 'grep -vxF -f native.txt base.txt' prints any
+# baseline feature absent from the -march=native set, of which we expect none.
+# (grep exits 1 when it prints nothing, so '|| true' keeps the success case from
+# failing the test; the empty-stdout check is what enforces the assertion.)
+# This avoids hard-coding the host-specific feature set.
+test('march_native_superset',
+ [unless(arch('x86_64') or arch('i386'), skip)],
+ run_command,
+ ['{compiler} --print-enabled-cpu-features | '
+ 'sed \'s/.*"features":\\[//;s/].*//;s/"//g\' | tr \',\' \'\\n\' > base.txt && '
+ '{compiler} -march=native --print-enabled-cpu-features | '
+ 'sed \'s/.*"features":\\[//;s/].*//;s/"//g\' | tr \',\' \'\\n\' > native.txt && '
+ '(grep -vxF -f native.txt base.txt || true)'])
+
+# -march=native is additive with explicit -m flags, regardless of order: an
+# explicitly requested feature (here AVX2, forced on independent of the host) is
+# still present whether the flag comes before or after -march=native.
+# Each of the two commands prints one 'AVX2' line, matching the two lines in
+# march_native_additive.stdout.
+test('march_native_additive',
+ [unless(arch('x86_64') or arch('i386'), skip)],
+ run_command,
+ ['{compiler} -mavx2 -march=native --print-enabled-cpu-features | grep -o AVX2 && '
+ '{compiler} -march=native -mavx2 --print-enabled-cpu-features | grep -o AVX2'])
+
test('T10219', normal, run_command,
# `-x hspp` in make mode should work.
# Note: need to specify `-x hspp` before the filename.
=====================================
testsuite/tests/driver/march_native.stdout
=====================================
@@ -0,0 +1 @@
+SSE2
=====================================
testsuite/tests/driver/march_native_additive.stdout
=====================================
@@ -0,0 +1,2 @@
+AVX2
+AVX2
=====================================
testsuite/tests/driver/march_native_unsupported_arch.stderr
=====================================
@@ -0,0 +1 @@
+-march=native is only supported on x86 and x86_64 targets
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/12b5f01f8e7b3aebe2393c60fd09ec4a...
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/12b5f01f8e7b3aebe2393c60fd09ec4a...
You're receiving this email because of your account on gitlab.haskell.org.