Magnus pushed to branch wip/mangoiv/ghc-9.12-bp at Glasgow Haskell Compiler / GHC Commits: 82c521c6 by Ian Duncan at 2026-05-25T13:39:24+02:00 AArch64: use ASR not LSR for MO_U_Shr at W8/W16 The unsigned right shift (MO_U_Shr) for sub-word widths (W8, W16) with a variable shift amount was emitting ASR (arithmetic/signed shift right) after zero-extending with UXTB/UXTH. This should be LSR (logical/unsigned shift right). After zero-extension the upper bits happen to be 0 so ASR produces the same result, but it is semantically wrong and would break if the zero-extension were ever optimized away. Includes assembly output test (grep for lsr) and runtime test verifying unsigned right shift of Word8 and Word16 values. (cherry picked from commit 50188615342098345fc2822ea223ab23791bbf49) - - - - - 12 changed files: - + changelog.d/T26979 - compiler/GHC/CmmToAsm/AArch64/CodeGen.hs - + testsuite/tests/codeGen/should_gen_asm/aarch64-shl-subword.asm - + testsuite/tests/codeGen/should_gen_asm/aarch64-shl-subword.hs - + testsuite/tests/codeGen/should_gen_asm/aarch64-ushr-subword.asm - + testsuite/tests/codeGen/should_gen_asm/aarch64-ushr-subword.hs - testsuite/tests/codeGen/should_gen_asm/all.T - + testsuite/tests/codeGen/should_run/aarch64-subword-ops.hs - + testsuite/tests/codeGen/should_run/aarch64-subword-ops.stdout - + testsuite/tests/codeGen/should_run/aarch64-ushr-subword-run.hs - + testsuite/tests/codeGen/should_run/aarch64-ushr-subword-run.stdout - testsuite/tests/codeGen/should_run/all.T Changes: ===================================== changelog.d/T26979 ===================================== @@ -0,0 +1,11 @@ +section: compiler +issues: #26979 +mrs: !15620 +synopsis: + On AArch64, use an arithmetic instead of a logical right shift when + sign extending at 8/16 bit word size. +description: + The unsigned right shift ``MO_U_Shr`` for sub-word widths (``W8``, ``W16``) + with a variable shift amount now correctly emits ``LSR`` instead of ``ASR``. + ``ASR`` was semantically wrong, and would break if the zero-extension were + ever optimized away. ===================================== compiler/GHC/CmmToAsm/AArch64/CodeGen.hs ===================================== @@ -525,6 +525,15 @@ opRegWidth w = pprPanic "opRegWidth" (text "Unsupported width" <+> ppr w) -- sub-word-size value always contains the zero-extended form of that value -- in between operations. -- +-- IMPORTANT: this invariant only holds within a single expression tree as +-- generated by the NCG (via truncateReg after each sub-word operation). It +-- does NOT hold at function entry points or across basic block boundaries, +-- because the GHC calling convention does not guarantee that callers +-- zero-extend sub-word arguments. Therefore, any operation that is sensitive +-- to the upper bits of its input (e.g. unsigned right shift, unsigned +-- division) must explicitly zero- or sign-extend its operands rather than +-- assuming they are already extended. +-- -- For instance, consider the program, -- -- test(bits64 buffer) @@ -543,7 +552,7 @@ opRegWidth w = pprPanic "opRegWidth" (text "Unsupported width" <+> ppr w) -- Next we compute `c`: The `%not` requires no extension of its operands, but -- we must still truncate the result back down to 8-bits. Finally the `%shrl` -- requires no extension and no truncate since we can assume that --- `c` is zero-extended. +-- `c` is zero-extended (it was produced by a truncateReg in the same block). -- -- TODO: -- Don't use Width in Operands @@ -877,17 +886,30 @@ getRegister' config plat expr CmmMachOp (MO_U_Quot w) [x, y] | w == W8 -> do (reg_x, _format_x, code_x) <- getSomeReg x (reg_y, _format_y, code_y) <- getSomeReg y - return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTB (OpReg w reg_x) (OpReg w reg_x)) `snocOL` - (UXTB (OpReg w reg_y) (OpReg w reg_y)) `snocOL` - (UDIV (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))) + tmp_x <- getNewRegNat (intFormat w) + tmp_y <- getNewRegNat (intFormat w) + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTB (OpReg w tmp_x) (OpReg w reg_x)) `snocOL` + (UXTB (OpReg w tmp_y) (OpReg w reg_y)) `snocOL` + (UDIV (OpReg w dst) (OpReg w tmp_x) (OpReg w tmp_y))) CmmMachOp (MO_U_Quot w) [x, y] | w == W16 -> do (reg_x, _format_x, code_x) <- getSomeReg x (reg_y, _format_y, code_y) <- getSomeReg y - return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTH (OpReg w reg_x) (OpReg w reg_x)) `snocOL` - (UXTH (OpReg w reg_y) (OpReg w reg_y)) `snocOL` - (UDIV (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))) + tmp_x <- getNewRegNat (intFormat w) + tmp_y <- getNewRegNat (intFormat w) + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTH (OpReg w tmp_x) (OpReg w reg_x)) `snocOL` + (UXTH (OpReg w tmp_y) (OpReg w reg_y)) `snocOL` + (UDIV (OpReg w dst) (OpReg w tmp_x) (OpReg w tmp_y))) -- 2. Shifts. x << n, x >> n. + -- Sub-word left shifts by a constant: use UBFM (UBFIZ alias) to shift + -- and mask in a single instruction. See Note [Signed arithmetic on AArch64]. + CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W8, 0 <= n, n < 8 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (UBFM (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger ((32 - n) `mod` 32))) (OpImm (ImmInteger (7 - n))))) + CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W16, 0 <= n, n < 16 -> do + (reg_x, _format_x, code_x) <- getSomeReg x + return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (UBFM (OpReg w dst) (OpReg w reg_x) (OpImm (ImmInteger ((32 - n) `mod` 32))) (OpImm (ImmInteger (15 - n))))) + CmmMachOp (MO_Shl w) [x, (CmmLit (CmmInt n _))] | w == W32 || w == W64 , 0 <= n, n < fromIntegral (widthInBits w) -> do @@ -901,8 +923,11 @@ getRegister' config plat expr CmmMachOp (MO_S_Shr w) [x, y] | w == W8 -> do (reg_x, _format_x, code_x) <- getSomeReg x (reg_y, _format_y, code_y) <- getSomeReg y - return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (SXTB (OpReg w reg_x) (OpReg w reg_x)) `snocOL` - (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)) `snocOL` + -- Use a temporary register to avoid sign-extending reg_x in-place, + -- as other operations may use reg_x. + tmp <- getNewRegNat (intFormat w) + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (SXTB (OpReg w tmp) (OpReg w reg_x)) `snocOL` + (ASR (OpReg w dst) (OpReg w tmp) (OpReg w reg_y)) `snocOL` (UXTB (OpReg w dst) (OpReg w dst))) -- See Note [Signed arithmetic on AArch64] CmmMachOp (MO_S_Shr w) [x, (CmmLit (CmmInt n _))] | w == W16, 0 <= n, n < 16 -> do @@ -912,8 +937,11 @@ getRegister' config plat expr CmmMachOp (MO_S_Shr w) [x, y] | w == W16 -> do (reg_x, _format_x, code_x) <- getSomeReg x (reg_y, _format_y, code_y) <- getSomeReg y - return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (SXTH (OpReg w reg_x) (OpReg w reg_x)) `snocOL` - (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y)) `snocOL` + -- Use a temporary register to avoid sign-extending reg_x in-place, + -- as other operations may use reg_x. + tmp <- getNewRegNat (intFormat w) + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (SXTH (OpReg w tmp) (OpReg w reg_x)) `snocOL` + (ASR (OpReg w dst) (OpReg w tmp) (OpReg w reg_y)) `snocOL` (UXTH (OpReg w dst) (OpReg w dst))) -- See Note [Signed arithmetic on AArch64] CmmMachOp (MO_S_Shr w) [x, (CmmLit (CmmInt n _))] @@ -928,8 +956,8 @@ getRegister' config plat expr CmmMachOp (MO_U_Shr w) [x, y] | w == W8 -> do (reg_x, _format_x, code_x) <- getSomeReg x (reg_y, _format_y, code_y) <- getSomeReg y - return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTB (OpReg w reg_x) (OpReg w reg_x)) `snocOL` - (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))) + tmp <- getNewRegNat (intFormat w) + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` UXTB (OpReg w tmp) (OpReg w reg_x) `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w tmp) (OpReg w reg_y))) CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W16, 0 <= n, n < 16 -> do (reg_x, _format_x, code_x) <- getSomeReg x @@ -937,8 +965,8 @@ getRegister' config plat expr CmmMachOp (MO_U_Shr w) [x, y] | w == W16 -> do (reg_x, _format_x, code_x) <- getSomeReg x (reg_y, _format_y, code_y) <- getSomeReg y - return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` annExpr expr (UXTH (OpReg w reg_x) (OpReg w reg_x)) - `snocOL` (ASR (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))) + tmp <- getNewRegNat (intFormat w) + return $ Any (intFormat w) (\dst -> code_x `appOL` code_y `snocOL` UXTH (OpReg w tmp) (OpReg w reg_x) `snocOL` annExpr expr (LSR (OpReg w dst) (OpReg w tmp) (OpReg w reg_y))) CmmMachOp (MO_U_Shr w) [x, (CmmLit (CmmInt n _))] | w == W32 || w == W64 ===================================== testsuite/tests/codeGen/should_gen_asm/aarch64-shl-subword.asm ===================================== @@ -0,0 +1 @@ +ubfm ===================================== testsuite/tests/codeGen/should_gen_asm/aarch64-shl-subword.hs ===================================== @@ -0,0 +1,8 @@ +{-# LANGUAGE MagicHash #-} +module ShlSubWord (shlW8) where + +import GHC.Exts +import GHC.Word + +shlW8 :: Word8 -> Word8 +shlW8 (W8# w) = W8# (uncheckedShiftLWord8# w 4#) ===================================== testsuite/tests/codeGen/should_gen_asm/aarch64-ushr-subword.asm ===================================== @@ -0,0 +1 @@ +lsr ===================================== testsuite/tests/codeGen/should_gen_asm/aarch64-ushr-subword.hs ===================================== @@ -0,0 +1,9 @@ +{-# LANGUAGE MagicHash #-} +module UShrSubWord (ushrW8) where + +import GHC.Exts +import GHC.Word + +ushrW8 :: Word8 -> Int -> Word8 +ushrW8 x n = x `shiftR` n + where shiftR (W8# w) (I# i) = W8# (wordToWord8# (word8ToWord# w `uncheckedShiftRL#` i)) ===================================== testsuite/tests/codeGen/should_gen_asm/all.T ===================================== @@ -12,3 +12,11 @@ test('bytearray-memcpy-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, test('T18137', [when(opsys('darwin'), skip), only_ways(llvm_ways)], compile_grep_asm, ['hs', False, '-fllvm -split-sections']) test('T24941', [only_ways(['optasm'])], compile, ['-fregs-graph']) +is_aarch64_codegen = [ + unless(arch('aarch64'), skip), + when(unregisterised(), skip), +] + +# AArch64-specific tests +test('aarch64-ushr-subword', is_aarch64_codegen, compile_grep_asm, ['hs', True, '-O']) +test('aarch64-shl-subword', is_aarch64_codegen, compile_grep_asm, ['hs', True, '-O']) ===================================== testsuite/tests/codeGen/should_run/aarch64-subword-ops.hs ===================================== @@ -0,0 +1,115 @@ +{-# LANGUAGE MagicHash #-} +module Main where + +import GHC.Exts +import GHC.Word +import GHC.Int + +-- Uses sub-word primops directly so that the NCG sees MO_Shl W8, +-- MO_U_Shr W8, MO_S_Shr W8 etc. (the Bits class widens to Word#/Int#). + +-- NOINLINE to prevent constant folding. + +-- MO_U_Shr W8/W16 variable shift +{-# NOINLINE ushrW8 #-} +ushrW8 :: Word8 -> Int -> Word8 +ushrW8 (W8# w) (I# i) = W8# (uncheckedShiftRLWord8# w i) + +{-# NOINLINE ushrW16 #-} +ushrW16 :: Word16 -> Int -> Word16 +ushrW16 (W16# w) (I# i) = W16# (uncheckedShiftRLWord16# w i) + +-- MO_S_Shr W8/W16 variable shift +{-# NOINLINE sshrI8 #-} +sshrI8 :: Int8 -> Int -> Int8 +sshrI8 (I8# x) (I# i) = I8# (uncheckedShiftRAInt8# x i) + +{-# NOINLINE sshrI16 #-} +sshrI16 :: Int16 -> Int -> Int16 +sshrI16 (I16# x) (I# i) = I16# (uncheckedShiftRAInt16# x i) + +-- MO_Shl W8/W16 variable shift +{-# NOINLINE shlW8 #-} +shlW8 :: Word8 -> Int -> Word8 +shlW8 (W8# w) (I# i) = W8# (uncheckedShiftLWord8# w i) + +{-# NOINLINE shlW16 #-} +shlW16 :: Word16 -> Int -> Word16 +shlW16 (W16# w) (I# i) = W16# (uncheckedShiftLWord16# w i) + +-- quot exercising MO_U_Quot W8/W16 +{-# NOINLINE quotW8 #-} +quotW8 :: Word8 -> Word8 -> Word8 +quotW8 (W8# x) (W8# y) = W8# (quotWord8# x y) + +{-# NOINLINE quotW16 #-} +quotW16 :: Word16 -> Word16 -> Word16 +quotW16 (W16# x) (W16# y) = W16# (quotWord16# x y) + +-- Register clobbering: use a value both in a shift/quot and afterward. +-- If the sign/zero extension clobbers the source register, the second +-- use sees the wrong value. + +{-# NOINLINE sshrAndAdd8 #-} +sshrAndAdd8 :: Int8 -> Int -> Int8 +sshrAndAdd8 a n = sshrI8 a n + a + +{-# NOINLINE sshrAndAdd16 #-} +sshrAndAdd16 :: Int16 -> Int -> Int16 +sshrAndAdd16 a n = sshrI16 a n + a + +{-# NOINLINE quotAndAdd8 #-} +quotAndAdd8 :: Word8 -> Word8 -> Word8 +quotAndAdd8 a b = quotW8 a b + a + b + +{-# NOINLINE quotAndAdd16 #-} +quotAndAdd16 :: Word16 -> Word16 -> Word16 +quotAndAdd16 a b = quotW16 a b + a + b + +main :: IO () +main = do + putStrLn "-- MO_U_Shr variable shift" + print (ushrW8 0x80 1) -- 64 + print (ushrW8 0xFF 4) -- 15 + print (ushrW8 0x42 0) -- 66 + print (ushrW16 0x8000 1) -- 16384 + print (ushrW16 0xFFFF 8) -- 255 + print (ushrW16 0x1234 0) -- 4660 + + putStrLn "-- MO_S_Shr variable shift" + print (sshrI8 (-1) 1) -- -1 + print (sshrI8 (-128) 1) -- -64 + print (sshrI8 127 1) -- 63 + print (sshrI8 0x42 3) -- 8 + print (sshrI16 (-1) 1) -- -1 + print (sshrI16 (-32768) 1) -- -16384 + print (sshrI16 32767 8) -- 127 + + putStrLn "-- MO_Shl variable shift" + print (shlW8 0x01 0) -- 1 + print (shlW8 0x01 4) -- 16 + print (shlW8 0xFF 1) -- 254 + print (shlW8 0x42 3) -- 16 + print (shlW16 0x0001 0) -- 1 + print (shlW16 0x0001 8) -- 256 + print (shlW16 0xFFFF 1) -- 65534 + print (shlW16 0x1234 4) -- 9024 + + putStrLn "-- MO_U_Quot" + print (quotW8 255 10) -- 25 + print (quotW8 200 7) -- 28 + print (quotW8 1 1) -- 1 + print (quotW16 65535 256) -- 255 + print (quotW16 1000 3) -- 333 + + putStrLn "-- register clobbering: shift + reuse" + print (sshrAndAdd8 (-128) 1) -- 64 (wraps: -64 + -128 = -192 = 64 as Int8) + print (sshrAndAdd8 0x42 1) -- 99 + print (sshrAndAdd16 (-32768) 1) -- 16384 (wraps) + print (sshrAndAdd16 0x1234 4) -- 4951 + + putStrLn "-- register clobbering: quot + reuse" + print (quotAndAdd8 200 7) -- 235 + print (quotAndAdd8 255 10) -- 34 (wraps: 290 mod 256) + print (quotAndAdd16 1000 3) -- 1336 + print (quotAndAdd16 65535 256) -- 510 (wraps: 66046 mod 65536) ===================================== testsuite/tests/codeGen/should_run/aarch64-subword-ops.stdout ===================================== @@ -0,0 +1,40 @@ +-- MO_U_Shr variable shift +64 +15 +66 +16384 +255 +4660 +-- MO_S_Shr variable shift +-1 +-64 +63 +8 +-1 +-16384 +127 +-- MO_Shl variable shift +1 +16 +254 +16 +1 +256 +65534 +9024 +-- MO_U_Quot +25 +28 +1 +255 +333 +-- register clobbering: shift + reuse +64 +99 +16384 +4951 +-- register clobbering: quot + reuse +235 +34 +1336 +510 ===================================== testsuite/tests/codeGen/should_run/aarch64-ushr-subword-run.hs ===================================== @@ -0,0 +1,9 @@ +import Data.Bits (shiftR) +import Data.Word (Word8, Word16) + +main :: IO () +main = do + print (shiftR (0x80 :: Word8) 1) + print (shiftR (0xFF :: Word8) 4) + print (shiftR (0x8000 :: Word16) 1) + print (shiftR (0xFFFF :: Word16) 8) ===================================== testsuite/tests/codeGen/should_run/aarch64-ushr-subword-run.stdout ===================================== @@ -0,0 +1,4 @@ +64 +15 +16384 +255 ===================================== testsuite/tests/codeGen/should_run/all.T ===================================== @@ -275,3 +275,6 @@ test('T27072d', [req_c, only_ways(['dyn']), when(not opsys('darwin'), skip), # Skipped on Darwin (Apple linker doesn't support --wrap). test('T27072w', [req_c, js_skip, when(opsys('darwin'), skip)], compile_and_run, ['T27072w_c.c -no-hs-main -optl-Wl,--wrap=hs_spt_remove']) +# AArch64-specific runtime tests +test('aarch64-ushr-subword-run', [unless(arch('aarch64'), skip)], compile_and_run, ['-O']) +test('aarch64-subword-ops', [unless(arch('aarch64'), skip)], compile_and_run, ['-O']) View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/82c521c62b1842672e6f411750df4c61... -- View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/82c521c62b1842672e6f411750df4c61... You're receiving this email because of your account on gitlab.haskell.org.
participants (1)
-
Magnus (@MangoIV)