-------------------------------------------------------------------------------- -- $Id: RegexParser.hs,v 1.2 2003/10/24 15:37:38 graham Exp $ -- -- Copyright (c) 2003, G. KLYNE. All rights reserved. -- See end of this file for licence information. -------------------------------------------------------------------------------- -- | -- Module : RegexParser -- Copyright : (c) 2003, Graham Klyne -- License : GPL V2 -- -- Maintainer : Graham Klyne -- Stability : provisional -- Portability : H98 -- -- This Module implements a simple regular expression parser, yielding -- a regular expression matcher based on the Dfa library by Andrew Bromage -- (cf. http://cvs.sourceforge.net/viewcvs.py/haskell-libs/libs/text/) -- -- Uses the Parsec monadic parser library (but not the Parsec tokenizer) -- -------------------------------------------------------------------------------- module RegexParser ( parseDfaFromString ) where import Parsec import Dfa ( Re(..) , matchRe ) ---------------------------------------------------------------------- -- Define parser state and helper functions ---------------------------------------------------------------------- -- N3 parser state type RegexState = () {- data RegexState = RegexState { foo :: () -- Null state? } -} ---------------------------------------------------------------------- -- Define top-level parser function: -- accepts a string and returns a graph or error ---------------------------------------------------------------------- -- |Parse a string representation of a regular expression, returning -- Either: -- Left -> a string describing a syntax error in theregular expression, -- Right -> a Dfa regular expression matcher -- -- To use the regular expression matcher, apply function matchRe -- (or matchRe2) from the Dfa module. -- -- Regular expression syntaxd recognized: -- -- regex = reterm* -- -- reterm = reprim reterm1 -- -- reterm1 = "|" reprim -- | reprim "*" -- | reprim "+" -- | reprim "?" -- | () -- -- reprim = "(" regex ")" -- | "[" crange* "]" -- | cmatch -- -- crange = cmatch crange1 -- -- crange1 = "-" cmatch -- | () -- -- cmatch = "\" ch -- | anych -- | ch -- -- anych = "." -- -- ch = any character not including "(", ")", "[", "]", "\", "." -- -- The syntax is interpreted such that special treatment of -- characters takes precendence over the ch production. -- -- The parsed regex is returned as a value of type Re Char, -- where: -- -- data Re t = ReOr [Re t] -- | ReCat [Re t] -- | ReStar (Re t) -- | RePlus (Re t) -- | ReOpt (Re t) -- | ReTerm [t] -- parseDfaFromString :: String -> (Either String (Re Char)) parseDfaFromString input = let pstate = () result = runParser regex pstate "" input in case result of Left er -> Left (show er) Right re -> Right re ---------------------------------------------------------------------- -- Syntax productions ---------------------------------------------------------------------- type RegexParser a = GenParser Char RegexState a regex :: RegexParser (Re Char) regex = do { rs <- many reterm ; return $ ReCat rs } reterm :: RegexParser (Re Char) reterm = do { r1 <- reprim ; rt <- reterm1 r1 ; return rt } reterm1 :: (Re Char) -> RegexParser (Re Char) reterm1 r1 = do { char '|' ; r2 <- reprim ; return $ ReOr [r1,r2] } <|> do { char '*' ; return $ ReStar r1 } <|> do { char '+' ; return $ RePlus r1 } <|> do { char '?' ; return $ ReOpt r1 } <|> return r1 "reterm (Regular expression term)" reprim :: RegexParser (Re Char) reprim = do { char '(' ; rp <- regex ; char ')' ; return rp } <|> do { char '[' ; cr <- many crange ; char ']' ; return $ ReOr (concat cr) } <|> do { c1 <- cmatch ; return $ ReTerm c1 } "reprim (Regular expression primary value)" crange :: RegexParser [(Re Char)] crange = do { c1 <- cmatch ; cr <- crange1 c1 ; return cr } crange1 :: [Char] -> RegexParser [(Re Char)] crange1 c1 = do { char '-' ; c2 <- cmatch ; return $ map ReTerm $ map (:[]) [head c1..head c2] } <|> return (map ReTerm [c1]) "crange (Regular expression character or character range)" cmatch :: RegexParser [Char] cmatch = do { char '\\' ; c <- satisfy (const True) ; return [c] } <|> do { char '.' ; return ['\032'..'\254'] -- mumble } <|> do { c <- satisfy $ not . (`elem` "()[]\\.") ; return [c] } "cmatch (escape, '.' or any character other than '(', ')', '[' or ']')" {- Test cases, not exhaustive (Right rei) = parseDfaFromString "(-|+)?[0123456789]+" testrei1 = matchRe rei "123" -- True testrei2 = matchRe rei "+456" -- True testrei3 = matchRe rei "-789" -- True testrei4 = matchRe rei "00123" -- True testrei5 = matchRe rei "+00456" -- True testrei6 = matchRe rei "-00789" -- True testrei7 = matchRe rei " 123" -- False testrei8 = matchRe rei "123 " -- False testrei9 = matchRe rei "123x" -- False testrei = and [ testrei1, testrei2, testrei3, testrei4, testrei5 , testrei6 , not testrei7, not testrei8, not testrei9 ] (Right ref) = parseDfaFromString "(-|+)?[0123456789]*(\\.[0123456789]+)?([eE](-|+)?[0123456789]+)?" testref1 = matchRe ref "123" testref2 = matchRe ref "+456" testref3 = matchRe ref "-789" testref4 = matchRe ref "123.456" testref5 = matchRe ref "+456.789E23" testref6 = matchRe ref "-00789.3400e+12" testref7 = matchRe ref "-00789.3400e-12" testref8 = matchRe ref ".1234" testref9 = matchRe ref "123." -- False testref = and [ testref1, testref2, testref3, testref4, testref5 , testref6, testref7, testref8 , not testref9 ] -} -------------------------------------------------------------------------------- -- -- Copyright (c) 2003, G. KLYNE. All rights reserved. -- -- This is free software; you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation; either version 2 of the License, or -- (at your option) any later version. -- -- Swish is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with Swish; if not, write to: -- The Free Software Foundation, Inc., -- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -- -------------------------------------------------------------------------------- -- $Source: /file/cvsdev/HaskellRDF/RegexParser.hs,v $ -- $Author: graham $ -- $Revision: 1.2 $ -- $Log: RegexParser.hs,v $ -- Revision 1.2 2003/10/24 15:37:38 graham -- Tidy up and add some more test cases -- -- Revision 1.1 2003/10/24 15:21:25 graham -- Add Dfa regular expression parser --