[Git][ghc/ghc][wip/marge_bot_batch_merge_job] 4 commits: Fix lexing "\^\" (#25937)

Marge Bot pushed to branch wip/marge_bot_batch_merge_job at Glasgow Haskell Compiler / GHC Commits: 6467d61e by Brandon Chinn at 2025-04-29T18:36:03-04:00 Fix lexing "\^\" (#25937) This broke in the refactor in !13128, where the old code parsed escape codes and collapsed string gaps at the same time, but the new code collapsed gaps first, then resolved escape codes. The new code used a naive heuristic to skip escaped backslashes, but didn't account for "\^\". - - - - - 99868a86 by Jens Petersen at 2025-04-29T18:36:44-04:00 hadrian: default selftest to disabled - - - - - 840d64c3 by Zubin Duggal at 2025-04-29T19:09:31-04:00 get-win32-tarballs.py: List tarball files to be downloaded if we cannot find them Fixes #25929 - - - - - 3787f533 by Ben Gamari at 2025-04-29T19:09:31-04:00 Move Data ModuleName instance to Language.Haskell.Syntax.Module.Name Fixes #25968. - - - - - 11 changed files: - compiler/GHC/Parser/Errors/Types.hs - compiler/GHC/Parser/String.hs - compiler/GHC/Unit/Types.hs - compiler/Language/Haskell/Syntax/Module/Name.hs - hadrian/README.md - hadrian/hadrian.cabal - m4/fp_setup_windows_toolchain.m4 - + testsuite/tests/parser/should_run/T25937.hs - + testsuite/tests/parser/should_run/T25937.stdout - testsuite/tests/parser/should_run/all.T - + testsuite/tests/parser/should_run/parser_unit_tests.hs Changes: ===================================== compiler/GHC/Parser/Errors/Types.hs ===================================== @@ -611,6 +611,7 @@ data LexErr | LexUnterminatedComment -- ^ Unterminated `{-' | LexUnterminatedOptions -- ^ Unterminated OPTIONS pragma | LexUnterminatedQQ -- ^ Unterminated quasiquotation + deriving (Show,Eq,Ord) -- | Errors from the Cmm parser data CmmParserError ===================================== compiler/GHC/Parser/String.hs ===================================== @@ -36,6 +36,7 @@ import GHC.Utils.Panic (panic) type BufPos = Int data StringLexError = StringLexError LexErr BufPos + deriving (Show, Eq) lexString :: Int -> StringBuffer -> Either StringLexError String lexString = lexStringWith processChars processChars @@ -122,20 +123,49 @@ bufferLocatedChars initialBuf len = go initialBuf -- ----------------------------------------------------------------------------- -- Lexing phases +-- | Collapse all string gaps in the given input. +-- +-- Iterates through the input in `go` until we encounter a backslash. The +-- @stringchar Alex regex only allows backslashes in two places: escape codes +-- and string gaps. +-- +-- * If the next character is a space, it has to be the start of a string gap +-- AND it must end, since the @gap Alex regex will only match if it ends. +-- Collapse the gap and continue the main iteration loop. +-- +-- * Otherwise, this is an escape code. If it's an escape code, there are +-- ONLY three possibilities (see the @escape Alex regex): +-- 1. The escape code is "\\" +-- 2. The escape code is "\^\" +-- 3. The escape code does not have a backslash, other than the initial +-- backslash +-- +-- In the first two possibilities, just skip them and continue the main +-- iteration loop ("skip" as in "keep in the list as-is"). In the last one, +-- we can just skip the backslash, then continue the main iteration loop. +-- the rest of the escape code will be skipped as normal characters in the +-- string; no need to fully parse a proper escape code. collapseGaps :: HasChar c => [c] -> [c] collapseGaps = go where go = \case - c1@(Char '\\') : c2@(Char c) : cs - -- #25784: string gaps are semantically equivalent to "\&" + -- Match the start of a string gap + drop gap + -- #25784: string gaps are semantically equivalent to "\&" + c1@(Char '\\') : Char c : cs | is_space c -> c1 : setChar '&' c1 : go (dropGap cs) - | otherwise -> c1 : c2 : go cs + -- Match all possible escape characters that include a backslash + c1@(Char '\\') : c2@(Char '\\') : cs + -> c1 : c2 : go cs + c1@(Char '\\') : c2@(Char '^') : c3@(Char '\\') : cs + -> c1 : c2 : c3 : go cs + -- Otherwise, just keep looping c : cs -> c : go cs [] -> [] dropGap = \case Char '\\' : cs -> cs _ : cs -> dropGap cs + -- Unreachable since gaps must end; see docstring [] -> panic "gap unexpectedly ended" resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c] ===================================== compiler/GHC/Unit/Types.hs ===================================== @@ -1,5 +1,3 @@ -{-# OPTIONS_GHC -Wno-orphans #-} -- instance Data ModuleName - {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveTraversable #-} @@ -117,13 +115,6 @@ data GenModule unit = Module } deriving (Eq,Ord,Data,Functor) --- TODO: should be moved back into Language.Haskell.Syntax.Module.Name -instance Data ModuleName where - -- don't traverse? - toConstr _ = abstractConstr "ModuleName" - gunfold _ _ = error "gunfold" - dataTypeOf _ = mkNoRepType "ModuleName" - -- | A Module is a pair of a 'Unit' and a 'ModuleName'. type Module = GenModule Unit ===================================== compiler/Language/Haskell/Syntax/Module/Name.hs ===================================== @@ -3,6 +3,7 @@ module Language.Haskell.Syntax.Module.Name where import Prelude import Data.Char (isAlphaNum) +import Data.Data import Control.DeepSeq import qualified Text.ParserCombinators.ReadP as Parse import System.FilePath @@ -12,6 +13,14 @@ import GHC.Data.FastString -- | A ModuleName is essentially a simple string, e.g. @Data.List@. newtype ModuleName = ModuleName FastString deriving (Show, Eq) +instance Data ModuleName where + -- don't traverse? + toConstr x = constr + where + constr = mkConstr (dataTypeOf x) "{abstract:ModuleName}" [] Prefix + gunfold _ _ = error "gunfold" + dataTypeOf _ = mkNoRepType "ModuleName" + instance Ord ModuleName where nm1 `compare` nm2 = stableModuleNameCmp nm1 nm2 ===================================== hadrian/README.md ===================================== @@ -226,8 +226,8 @@ tested in CI. If you use an untested flavour such as "Quick" then you run the risk that not all tests will pass. In particular you can rely on the `validate` and `perf` flavours being tested but no others. -`build selftest` runs tests of the build system. The current test coverage -is close to zero (see [#197][test-issue]). +`build selftest` (no longer enabled by default) runs tests of the build system. +The current test coverage is close to zero (see [#197][test-issue]). #### Running linters ===================================== hadrian/hadrian.cabal ===================================== @@ -27,7 +27,7 @@ flag threaded -- See also #21913 flag selftest manual: True - default: True + default: False description: Enables the hadrian selftest rules which require QuickCheck. Disabling it thus saves on a few dependencies which can be problematic when bootstrapping. ===================================== m4/fp_setup_windows_toolchain.m4 ===================================== @@ -17,12 +17,13 @@ AC_DEFUN([FP_INSTALL_WINDOWS_TOOLCHAIN],[ else action="download" fi - $PYTHON mk/get-win32-tarballs.py $action $mingw_arch > missing-win32-tarballs + $PYTHON mk/get-win32-tarballs.py $action $mingw_arch case $? in 0) rm missing-win32-tarballs ;; 2) + $PYTHON mk/get-win32-tarballs.py list $mingw_arch > missing-win32-tarballs echo echo "Error:" echo "Needed msys2 tarballs are missing. You have a few options to get them," ===================================== testsuite/tests/parser/should_run/T25937.hs ===================================== @@ -0,0 +1,2 @@ +main :: IO () +main = print "\^\ " ===================================== testsuite/tests/parser/should_run/T25937.stdout ===================================== @@ -0,0 +1 @@ +"\FS " ===================================== testsuite/tests/parser/should_run/all.T ===================================== @@ -1,3 +1,8 @@ +test('parser_unit_tests', + normal, + compile_and_run, + ['-package ghc']) + test('readRun001', normal, compile_and_run, ['']) test('readRun002', normal, compile_and_run, ['']) test('readRun003', normal, compile_and_run, ['']) @@ -21,6 +26,7 @@ test('RecordDotSyntax3', [extra_files(['RecordDotSyntaxA.hs'])], multimod_compil test('RecordDotSyntax4', [extra_files(['RecordDotSyntaxA.hs'])], multimod_compile_and_run, ['RecordDotSyntax4', '']) test('RecordDotSyntax5', normal, compile_and_run, ['']) test('ListTuplePunsConstraints', extra_files(['ListTuplePunsConstraints.hs']), ghci_script, ['ListTuplePunsConstraints.script']) +test('T25937', normal, compile_and_run, ['']) # Multiline strings test('MultilineStrings', normal, compile_and_run, ['']) ===================================== testsuite/tests/parser/should_run/parser_unit_tests.hs ===================================== @@ -0,0 +1,82 @@ +import GHC.Data.StringBuffer (stringToStringBuffer) +import qualified GHC.Data.StringBuffer as StringBuffer (StringBuffer (..)) +import GHC.Parser.String (lexString, lexMultilineString) + +import qualified Control.Exception as E +import Control.Monad (forM_, unless) + +main :: IO () +main = do + forM_ tests $ \(testName, test) -> do + result <- E.try test + case result of + Right () -> pure () + Left (e :: E.SomeException) + | Just e' <- E.asyncExceptionFromException e -> do + E.throwIO (e' :: E.AsyncException) + | otherwise -> do + putStrLn $ ">>> FAIL: " ++ testName + putStrLn $ E.displayException e + +{----- Test infrastructure -----} + +data TestFailure = TestFailure String + deriving (Show) + +instance E.Exception TestFailure where + displayException (TestFailure msg) = "Test failure:\n" ++ msg + +testFailure :: String -> IO a +testFailure = E.throwIO . TestFailure + +shouldBe :: (Eq a, Show a) => a -> a -> IO () +shouldBe actual expected = + unless (actual == expected) $ + testFailure $ + "Got: " ++ show actual ++ "\n" ++ + "Expected: " ++ show expected + +type TestCase = (String, IO ()) + +testCase :: String -> IO () -> TestCase +testCase = (,) + +{----- Tests -----} + +tests :: [TestCase] +tests = concat + [ stringTests + ] + +-- | Unit tests for GHC.Parser.String +stringTests :: [TestCase] +stringTests = concat + [ escapedBackslashTests + ] + where + withBuf f s = let buf = stringToStringBuffer s in f (StringBuffer.len buf) buf + + -- Test all situations where backslashes can appear in escape characters (#25937) + escapedBackslashTests = + [ testCase label $ do + withBuf lexStr input `shouldBe` Right output + | (lexLabel, lexStr) <- [("strings", lexString), ("multiline strings", lexMultilineString)] + , (label, input, output) <- + [ ( "escaped backslashes in " ++ lexLabel ++ " not mistaken for string gaps" + , [' ', '\\', '\\', ' ', '\\', '\\', ' '] + , " \\ \\ " + ) + , ( "escaped \\FS in " ++ lexLabel ++ " not mistaken for beginning of string gap" + , ['\\', '^', '\\'] + , "\FS" + ) + , ( "escaped \\FS in " ++ lexLabel ++ " not mistaken for unterminated string gap" + , ['\\', '^', '\\', ' '] + , "\FS " + ) + , ( "escaped \\FS in " ++ lexLabel ++ " does not collapse mistaken string gap" + , ['\\', '^', '\\', ' ', '\\', 'n'] + , "\FS \n" + ) + ] + ] View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/9f68f02d88638486313decb87228f63... -- View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/9f68f02d88638486313decb87228f63... You're receiving this email because of your account on gitlab.haskell.org.
participants (1)
-
Marge Bot (@marge-bot)