GitLab

Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC

Commits:

72d6dc74

by aparker at 2026-04-20T20:15:44-04:00

NCG: Implement constant folding for vector simd ops (Issue #25030)

b9cab907

by sheaf at 2026-04-20T20:15:44-04:00

Mark some SIMD tests as broken on i386 optllvm

As seen in #25498, several SIMD tests are broken on i386 in the optllvm
way. This commit marks them as "expect_broken".

7 changed files:

+ changelog.d/simd_constant_folding
compiler/GHC/Cmm/Opt.hs
compiler/GHC/Utils/Misc.hs
+ testsuite/tests/simd/should_run/Makefile
+ testsuite/tests/simd/should_run/T25030.hs
+ testsuite/tests/simd/should_run/T25030.stdout
testsuite/tests/simd/should_run/all.T

Changes:

changelog.d/simd_constant_folding

 +section: codegen
 +synopsis: Implement Cmm constant folding for some SIMD vector instructions
 +issues: #25030 #26915
 +mrs: !15512
++
 +description: {
 +The Cmm constant folding pass now handles the following vector operations:
++
 +- insert and extract (broadcast was already supported)
 +- integer arithmetic operations: negation, addition, subtraction, multiplication,
 +  minimum, maximum
 +- logical operations: and, or, xor
 +}
++

compiler/GHC/Cmm/Opt.hs

@@ -24,6 +24,7 @@ import GHC.Platform
  import GHC.Types.Literal.Floating
  import Data.Maybe
 +import Control.Monad (zipWithM, guard)
  import GHC.Float
@@ -47,7 +48,6 @@ cmmMachOpFold
      -> MachOp       -- The operation from an CmmMachOp
      -> [CmmExpr]    -- The optimized arguments
      -> CmmExpr
+-
  cmmMachOpFold platform op args = fromMaybe (CmmMachOp op args) (cmmMachOpFoldM platform op args)
  -- Returns Nothing if no changes, useful for Hoopl, also reduces
@@ -65,6 +65,30 @@ cmmMachOpFoldM _ (MO_VF_Broadcast lg _w) exprs =
    case exprs of
      [CmmLit l] -> Just $! CmmLit (CmmVec $ replicate lg l)
      _ -> Nothing
++
 +-- Extracting an element at a constant, in-range index: if the contents of that
 +-- element are known (see 'vectorElements_maybe'), the extract folds to it.
 +-- The element is looked up with a total pattern match rather than '(!!)', so
 +-- a vector with fewer known elements than the MachOp's length yields
 +-- 'Nothing' instead of a crash.
 +cmmMachOpFoldM plat (MO_V_Extract l _)  [v, (CmmLit (CmmInt idx W32))]
 +  | idx >= 0, idx < fromIntegral l
 +  = do
 +    es <- vectorElements_maybe plat v
 +    case drop (fromInteger idx) es of
 +      e : _ -> e
 +      []    -> Nothing
++
 +-- Same as above, for the floating-point extract MachOp.
 +cmmMachOpFoldM plat (MO_VF_Extract l _) [v, (CmmLit (CmmInt idx W32))]
 +  | idx >= 0, idx < fromIntegral l
 +  = do
 +    es <- vectorElements_maybe plat v
 +    case drop (fromInteger idx) es of
 +      e : _ -> e
 +      []    -> Nothing
++
 +-- Inserting a literal at a constant, in-range index folds to a vector literal,
 +-- provided every remaining element of the vector is itself a known literal.
 +cmmMachOpFoldM plat op [v, newval@(CmmLit _), CmmLit (CmmInt idx W32)]
 +  | MO_V_Insert  len _ <- op = insertLit len
 +  | MO_VF_Insert len _ <- op = insertLit len
 +  where
 +    insertLit len = do
 +      guard (idx >= 0 && idx < fromIntegral len)
 +      known <- vectorElements_maybe plat v
 +      -- Overwrite the element first: the one being replaced may be unknown.
 +      lits <- mapM litOf (replaceAt (fromIntegral idx) (Just newval) known)
 +      Just $! CmmLit (CmmVec lits)
++
 +    -- Succeeds only for an element that is known and is a literal.
 +    litOf (Just (CmmLit lit)) = Just lit
 +    litOf _                   = Nothing
++
  cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
    | MO_WF_Bitcast width <- op = case width of
        W32 | res <- castWord32ToFloat (fromInteger x)
@@ -457,6 +481,64 @@ cmmMachOpFoldM platform mop [x, (CmmLit (CmmInt n _w))]
          x2 = if p == 1 then x1 else
               CmmMachOp (MO_And rep) [x1, CmmLit (CmmInt (n-1) rep)]
 +-- Many vector MachOps are simply element-wise scalar MachOps. For these, we reduce
 +-- to the scalar case using 'vectorMachOpToScalarMachOp_maybe' and 'vectorElements_maybe'.
++
 +-- Unary vector MachOps: fold element by element with the corresponding scalar
 +-- MachOp. This succeeds only if every element folds to a literal.
 +cmmMachOpFoldM plat op [v]
 +  | Just scalar_op <- vectorMachOpToScalarMachOp_maybe op
 +  = do elems  <- vectorElements_maybe plat v
 +       folded <- mapM (foldElem scalar_op) elems
 +       Just $! CmmLit $ CmmVec folded
++
 +  where
 +    -- An unknown element, or one that does not fold to a literal, gives 'Nothing'.
 +    foldElem mop mb_elem = do
 +      e <- mb_elem
 +      case cmmMachOpFoldM plat mop [e] of
 +        Just (CmmLit lit) -> Just lit
 +        _                 -> Nothing
++
 +-- Binary vector MachOps.
 +cmmMachOpFoldM plat op [v1, v2]
 +  -- Element-wise lift of a scalar MachOp: fold each pair of elements with it.
 +  | Just scalar_op <- vectorMachOpToScalarMachOp_maybe op
 +  = zipElems (foldViaScalar scalar_op)
 +  -- Integer MIN/MAX are not mapped to a scalar MachOp by
 +  -- 'vectorMachOpToScalarMachOp_maybe', so they are folded by hand.
 +  | MO_VS_Max _ w <- op = zipElems (foldInts (narrowS w) max)
 +  | MO_VU_Max _ w <- op = zipElems (foldInts (narrowU w) max)
 +  | MO_VS_Min _ w <- op = zipElems (foldInts (narrowS w) min)
 +  | MO_VU_Min _ w <- op = zipElems (foldInts (narrowU w) min)
++
 +  where
 +    -- Combine corresponding elements of both operands; the fold succeeds only
 +    -- if both operands are recognised as vectors and every pair combines to
 +    -- a literal.
 +    zipElems combine = do
 +      es1 <- vectorElements_maybe plat v1
 +      es2 <- vectorElements_maybe plat v2
 +      ls  <- zipWithM combine es1 es2
 +      Just $! CmmLit $ CmmVec ls
++
 +    -- Fold one pair of known elements with a scalar MachOp.
 +    foldViaScalar mop (Just a1) (Just a2)
 +      | Just (CmmLit l) <- cmmMachOpFoldM plat mop [a1, a2] = Just l
 +    foldViaScalar _ _ _ = Nothing
++
 +    -- Fold one pair of integer literals: narrow both, then combine them.
 +    -- The result carries the width of the first literal.
 +    foldInts do_narrow f
 +      (Just (CmmLit (CmmInt x rep)))
 +      (Just (CmmLit (CmmInt y _)))
 +        = Just $! CmmInt (do_narrow x `f` do_narrow y) rep
 +    foldInts _ _ _ _ = Nothing
++
++
  -- ToDo (#7116): optimise floating-point multiplication, e.g. x*2.0 -> x+x
  -- Unfortunately this needs a unique supply because x might not be a
  -- register.  See #2253 (program 6) for an example.
@@ -473,6 +555,59 @@ validOffsetRep :: Width -> Bool
  validOffsetRep rep = widthInBits rep <= finiteBitSize (undefined :: Int)
 +-- | Map a vector 'MachOp' that acts element-wise to the scalar 'MachOp' that
 +-- is applied to each element; the vector length is dropped and the element
 +-- width is kept.
 +--
 +-- Returns 'Nothing' for every 'MachOp' not listed here.
 +vectorMachOpToScalarMachOp_maybe :: MachOp -> Maybe MachOp
 +vectorMachOpToScalarMachOp_maybe = scalarOf
 +  where
 +    -- Negation.
 +    scalarOf (MO_VS_Neg _ w) = Just (MO_S_Neg w)
 +    scalarOf (MO_VF_Neg _ w) = Just (MO_F_Neg w)
 +    -- Integer arithmetic.
 +    scalarOf (MO_V_Add  _ w) = Just (MO_Add w)
 +    scalarOf (MO_V_Sub  _ w) = Just (MO_Sub w)
 +    scalarOf (MO_V_Mul  _ w) = Just (MO_Mul w)
 +    -- Floating-point arithmetic, minimum and maximum.
 +    scalarOf (MO_VF_Add _ w) = Just (MO_F_Add w)
 +    scalarOf (MO_VF_Sub _ w) = Just (MO_F_Sub w)
 +    scalarOf (MO_VF_Mul _ w) = Just (MO_F_Mul w)
 +    scalarOf (MO_VF_Min _ w) = Just (MO_F_Min w)
 +    scalarOf (MO_VF_Max _ w) = Just (MO_F_Max w)
 +    -- Bitwise logic.
 +    scalarOf (MO_V_And  _ w) = Just (MO_And w)
 +    scalarOf (MO_V_Or   _ w) = Just (MO_Or w)
 +    scalarOf (MO_V_Xor  _ w) = Just (MO_Xor w)
 +    scalarOf _               = Nothing
++
++
 +-- | Describe what is statically known about each element of a vector expression.
 +--
 +-- The result is 'Nothing' when the expression is not recognised as a vector.
 +-- Otherwise it is a list with one entry per element: @Just e@ where the
 +-- element is known to be the expression @e@, and 'Nothing' where it is unknown.
 +vectorElements_maybe :: Platform -> CmmExpr -> Maybe [Maybe CmmExpr]
 +vectorElements_maybe plat expr = case expr of
 +  -- A vector literal: every element is a known literal.
 +  CmmLit (CmmVec lits) -> Just $! map (Just . CmmLit) lits
++
 +  CmmMachOp mop args
 +    -- Broadcast of a literal: every element is that literal.
 +    | Just len <- broadcastLength mop
 +    , [CmmLit x] <- args
 +    -> Just $! replicate len (Just $! CmmLit x)
++
 +    -- Insert at a constant index into a vector we know something about:
 +    -- that element becomes the inserted expression ('replaceAt' leaves the
 +    -- list untouched when the index is out of range).
 +    | isInsert mop
 +    , [v, e, CmmLit (CmmInt i _w)] <- args
 +    , Just es <- vectorElements_maybe plat v
 +    -> Just $! replaceAt (fromInteger i) (Just $! e) es
++
 +    -- Any other MachOp with a vector result: all elements are unknown.
 +    | isVecType result_type
 +    -> Just $! replicate (vecLength result_type) Nothing
 +    where
 +      -- NOTE(review): the result type is computed from an empty list of
 +      -- argument types; presumably this relies on the MachOps reaching this
 +      -- point not needing them. Confirm against 'machOpResultType'.
 +      result_type = machOpResultType plat mop []
++
 +      broadcastLength (MO_V_Broadcast  l _) = Just l
 +      broadcastLength (MO_VF_Broadcast l _) = Just l
 +      broadcastLength _                     = Nothing
++
 +      isInsert (MO_V_Insert  _ _) = True
 +      isInsert (MO_VF_Insert _ _) = True
 +      isInsert _                  = False
++
 +  -- A register of vector type: all elements are unknown.
 +  CmmReg reg
 +    | isVecType reg_type
 +    -> Just $! replicate (vecLength reg_type) Nothing
 +    where reg_type = cmmRegType reg
++
 +  _ -> Nothing
++
++
  {- Note [Comparison operators]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  If we have

compiler/GHC/Utils/Misc.hs

@@ -56,7 +56,7 @@ module GHC.Utils.Misc (
          -- * List operations controlled by another list
          takeList, dropList, splitAtList, split,
 -        dropTail, capitalise,
 +        replaceAt, dropTail, capitalise,
          -- * Sorting
          sortWith, minWith, nubSort, ordNub, ordNubOn,
@@ -718,6 +718,14 @@ splitAtList xs ys = go 0# xs ys
        go n  []     bs     = (take (I# n) ys, bs) -- = splitAt n ys
        go n  (_:as) (_:bs) = go (n +# 1#) as bs
 +-- | @replaceAt n y xs@ replaces the element at (0-based) index @n@ of @xs@
 +-- with @y@. If @n@ is negative or is not a valid index into @xs@, the list
 +-- is returned unchanged.
 +--
 +-- The list is only traversed as far as index @n@ (there is no call to
 +-- 'length'), so this also works on infinite lists.
 +replaceAt :: Int -> a -> [a] -> [a]
 +replaceAt n y xs
 +  | n < 0 = xs
 +  | otherwise = case splitAt n xs of
 +      (before, _ : after) -> before ++ (y : after)
 +      -- Fewer than n+1 elements: nothing to replace.
 +      _                   -> xs
++
  -- | drop from the end of a list
  dropTail :: Int -> [a] -> [a]
  -- Specification: dropTail n = reverse . drop n . reverse

testsuite/tests/simd/should_run/Makefile

 +TOP=../../..
 +include $(TOP)/mk/boilerplate.mk
 +include $(TOP)/mk/test.mk
++
 +# Compile T25030.hs at -O1, dump its Cmm into T25030.cmm, then grep the dump:
 +# folded constants must be present, and the operands they were computed from
 +# must be absent (an absent operand is reported via the "Does not appear" echo).
 +# The output of this recipe is presumably compared against T25030.stdout.
 +# NOTE(review): testFoldMin in T25030.hs (474747 / 474748) has no check below.
 +T25030:
 +	'$(TEST_HC)' $(TEST_HC_OPTS) T25030.hs -v0 -O1 -fforce-recomp -ddump-cmm > T25030.cmm 2>&1
++
 +	# testFoldPlus: 111111+121212=232323, 121212+131313=252525 should be folded
 +	grep -m 1 -o "232323" T25030.cmm
 +	grep -m 1 -o "252525" T25030.cmm
 +	# operands should not appear in the output
 +	grep -o "111111" T25030.cmm || echo "Does not appear: 111111"
 +	grep -o "121212" T25030.cmm || echo "Does not appear: 121212"
 +	grep -o "131313" T25030.cmm || echo "Does not appear: 131313"
++
 +	# testFoldMax: max(333333,333332)=333333 should be folded
 +	grep -m 1 -o "333333" T25030.cmm
 +	# lesser operand should not appear
 +	grep -o "333332" T25030.cmm || echo "Does not appear: 333332"
++
 +	# testNeg: negate(343434)=-343434 should be folded
 +	grep -m 1 -o -- "-343434" T25030.cmm
++
 +	# testInserts: insert 363636 into broadcast(353535) and extract it;
 +	# should fold to constant 363636
 +	grep -m 1 -o "363636" T25030.cmm
 +	# broadcast operand should not appear
 +	grep -o "353535" T25030.cmm || echo "Does not appear: 353535"
++
 +	# testInserts2: 383838+393939=777777 should be folded
 +	grep -m 1 -o "777777" T25030.cmm
 +	# addends should not appear
 +	grep -o "383838" T25030.cmm || echo "Does not appear: 383838"
++
 +	# testOverwrite: inserting 404040,404041 into broadcast(414141) should fold to <404040,404041>
 +	grep -m 1 -o "404040" T25030.cmm
 +	grep -m 1 -o "404041" T25030.cmm
 +	# original broadcast value should not appear
 +	grep -o "414141" T25030.cmm || echo "Does not appear: 414141"
++
 +	# testExtractFromInsert: extract(insert(unknown_v, 454545, 3), 3) should fold to 454545
 +	grep -m 1 -o "454545" T25030.cmm

testsuite/tests/simd/should_run/T25030.hs

 +{-# LANGUAGE MagicHash, UnboxedTuples, LexicalNegation, ExtendedLiterals #-}
++
 +import GHC.Prim
 +import GHC.Int
++
 +-- Cmm constant folding tests for vector operations
 +--
 +-- Each test uses distinctive numeric constants so that the accompanying
 +-- Makefile can grep the Cmm dump for the folded results (and for the absence
 +-- of the operands they were computed from).
++
 +-- Boxed wrappers around the unboxed vector types, so that they can be given
 +-- 'Show' instances and printed.
 +data IntX2 = IX2# Int64X2#
 +data IntX4 = IX4# Int32X4#
++
 +instance Show IntX2 where
 +  show (IX2# d) = case (unpackInt64X2# d) of
 +    (# a, b #) -> show ((I64# a), (I64# b))
++
 +instance Show IntX4 where
 +  show (IX4# v) = case (unpackInt32X4# v) of
 +    (# a, b, c, d #) -> show ((I32# a), (I32# b), (I32# c), (I32# d))
++
 +-- Element-wise addition of two vector literals.
 +testFoldPlus = do
 +  let v1    = packInt64X2# (# 111111#Int64,  121212#Int64 #)
 +  let v2    = packInt64X2# (# 121212#Int64,  131313#Int64 #)
 +  print $ IX2# $ plusInt64X2# v1 v2 -- expect to see 232323 and 252525 here,
 +                                    -- and not 111111, 121212, or 131313
++
 +-- Element-wise maximum of two broadcast vectors.
 +testFoldMax = do
 +  let v1    = broadcastInt32X4# 333333#Int32
 +  let v2    = broadcastInt32X4# 333332#Int32
 +  print $ IX4# $ maxInt32X4# v1 v2 -- expect to see 333333 here and not 333332
++
 +-- Element-wise minimum of two broadcast vectors.
 +-- NOTE(review): the Makefile does not grep for 474747 / 474748, so this
 +-- expectation is currently not checked.
 +testFoldMin = do
 +  let v1 = broadcastInt32X4# 474747#Int32
 +  let v2 = broadcastInt32X4# 474748#Int32
 +  print $ IX4# $ minInt32X4# v1 v2 -- expect to see 474747 here and not 474748
++
 +-- Element-wise negation of a broadcast vector.
 +testNeg = do
 +  let v1 = broadcastInt32X4# 343434#Int32
 +  print $ IX4# $ negateInt32X4# v1 -- expect to see -343434 here, not positive 343434
++
++
 +-- Insert into a broadcast vector, then extract the inserted element.
 +testInserts = do
 +  let v1 = broadcastInt32X4# 353535#Int32
 +  let v2 = insertInt32X4# v1 363636#Int32 0#
 +  let (# a, _, _, _ #) = unpackInt32X4# v2
 +  print $ (I32# a) -- expect to see 363636 here, not 353535
++
++
 +-- Insert, add a broadcast vector, then extract the affected element.
 +testInserts2 = do
 +  let v1 = broadcastInt32X4# 373737#Int32
 +  let v2 = insertInt32X4# v1 383838#Int32 0#
 +  let v3 = plusInt32X4# v2 (broadcastInt32X4# 393939#Int32)
 +  let (# a, _, _, _ #) = unpackInt32X4# v3
 +  print $ (I32# a) -- expect to see 777777 == 383838+393939 here, and not 373737, 383838, or 393939
++
 +-- Overwrite both elements of the argument vector.
 +-- Presumably INLINE so that the broadcast argument passed by 'main' is visible
 +-- at the insert sites.
 +{-# INLINE testOverwrite #-}
 +testOverwrite :: Int64X2# -> IO ()
 +testOverwrite v = do
 +  let v1 = insertInt64X2# v 404040#Int64 0#
 +  let v2 = insertInt64X2# v1 404041#Int64 1#
 +  print $ IX2# v2 -- expect <404040, 404041> to appear in the cmm as a single assignment,
 +                  -- rather than a series of inserts
++
 +-- Extract the element that was just inserted into an otherwise unknown vector.
 +-- Presumably NOINLINE so that 'v' stays a runtime value inside this function.
 +{-# NOINLINE testExtractFromInsert #-}
 +testExtractFromInsert :: Int32X4# -> IO ()
 +testExtractFromInsert v = do
 +  let v2 = insertInt32X4# v 454545#Int32 3#
 +  let (# _, _, _, d #) = unpackInt32X4# v2
 +  print (I32# d) -- 454545 should fold as a constant even though v is a runtime value
++
++
 +-- Run every test in sequence.
 +main = do
 +  testFoldPlus
 +  testFoldMax
 +  testFoldMin
 +  testNeg
 +  testInserts
 +  testInserts2
 +  testOverwrite (broadcastInt64X2# 414141#Int64)
 +  testExtractFromInsert (broadcastInt32X4# 464646#Int32)
++

testsuite/tests/simd/should_run/T25030.stdout

 +232323
 +252525
 +Does not appear: 111111
 +Does not appear: 121212
 +Does not appear: 131313
 +333333
 +333333
 +333333
 +Does not appear: 333332
 +-343434
 +-343434
 +-343434
 +363636
 +Does not appear: 353535
 +777777
 +Does not appear: 383838
 +404040
 +404041
 +Does not appear: 414141
 +454545

testsuite/tests/simd/should_run/all.T

@@ -49,6 +49,8 @@ test('int16x8_shuffle_baseline', [], compile_and_run, [''])
  test('int32x4_shuffle_baseline', [], compile_and_run, [''])
  test('int64x2_shuffle_baseline', [], compile_and_run, [''])
 +test('T25030', [when(arch('i386'), expect_broken_for(25498, ['optllvm']))], makefile_test, [])
++
  test('T25658', [], compile_and_run, ['']) # #25658 is a bug with SSE2 code generation
  test('T25659', [], compile_and_run, [''])
@@ -83,6 +85,7 @@ test('simd007', [], compile_and_run, [''])
  test('simd008', [], compile_and_run, [''])
  test('simd009', [ req_th
                  , extra_files(['Simd009b.hs', 'Simd009c.hs'])
 +                , when(arch('i386'), expect_broken_for(25498, ['optllvm']))
+                 ]
                , multimod_compile_and_run, ['simd009', ''])
  test('simd010', [], compile_and_run, [''])
@@ -174,7 +177,7 @@ test('T25062_V64'
      , compile_and_run if have_cpu_feature('avx512f') else compile
      , [''])
 -test('T25169', [], compile_and_run, [''])
 +test('T25169', [when(arch('i386'), expect_broken_for(25498, ['optllvm']))], compile_and_run, [''])
  test('T25455', [], compile_and_run, [''])
  test('T25486', [], compile_and_run, [''])