[Haskell-cafe] Irrefutable pattern love and new regex engine.

22 Jan 2008

      I've been using the creation of a regular expression engine as an
ongoing project to learn Haskell.  Last night I created the newest
iteration.

My love for the irrefutable pattern can be found in the definition of
the rexn ( apply repetition to extracted nodes ) function below.
/snip
      rexn ns pps = let ( ~( xs , rps ) ,
                          ~( ~( nxs ) ,
                             ~( rxs , rrps ) ) ) = ( exn nxs pps ,
                                                       case rps of
                                                         ('?':'?':rr)
-> ( ( ns ) ,

    ( ns ++ xs , rr ) )
                                                         ('?':rr)
-> ( ( ns ) ,

    ( xs ++ ns , rr ) )
/snip

Beautiful, beautiful lazy execution and irrefutable pattern matching.
The data flow is able to easily twist in and out of functions, even as
an argument to the function call that creates it.

Is this kind of usage normal?

Any and all comments, suggestions, refutations, and criticisms are welcome.

-- regular expression engine -- (c) 2008 michael speer

import Char ( isSpace )
-- import Debug.Trace ( trace )

xor :: Bool -> Bool -> Bool
xor True  a = not a
xor False a = a

data RxToken = RxStart     -- matchable start of target string
             | RxChar Char -- a literal character to match
             | RxBound     -- inserted wherever alphanums touch whitespace
             | RxEnd       -- matchable end of target string
             | RxEOF       -- an additional token to push through to
catch anything trying for RxEnd
                           -- RxEOF is never matched.
               deriving ( Show )

rxTokenize tts = RxStart : case tts of
                             []        -> RxEnd : RxEOF : []
                             tts@(t:_) -> case not $ isSpace t of
                                            True  -> RxBound : rxt tts
                                            False -> rxt tts
    where
      rxt (t:[]) | not $ isSpace t = RxChar t : RxBound : RxEnd : RxEOF : []
                 | otherwise       = RxChar t : RxEnd : RxEOF : []
      rxt (t:ts@(t':_)) | isSpace t `xor` isSpace t' = RxChar t :
RxBound : rxt ts
                        | otherwise                  = RxChar t : rxt ts

data RxTransform = RxTransform ( RxNode -> RxToken -> [ RxNode ] )
                 | RxNullTransform

data RxNode = RxActive { rxTransforms :: [RxTransform] ,
                         rxMatched    :: String ,
                         rxNumSubs    :: Integer ,
                         rxSubExprs   :: [ String ] }
            | RxComplete { rxMatched :: String ,
                           rxNumSubs :: Integer ,
                           rxSubExprs :: [ String ] }

instance Show RxNode where
    show (RxComplete matched _ _) = ""

data RxDepth = RxTop | RxSub deriving ( Show )

rxCompile pps = let ( xs , rps ) = oexn [success] RxTop pps
                in case length rps of
                     0 -> RxActive { rxTransforms = xs ,
                                     rxMatched    = [] ,
                                     rxNumSubs    =  0 ,
                                     rxSubExprs   = [] }
                     _ -> error $ "Not all of pattern consumed :
remains : " ++ rps
    where
      -- or together different expression sections -- (a|b|c)
      oexn ns  RxTop [] = ( ns , [] )
      oexn _  RxSub  [] = error "Pattern ended while still in sub expression"
      oexn ns     d pps = let ( ~( xs  , rps  ) ,
                                ~( nxs , nrps ) ) = ( aexn ns pps ,
                                                      case rps of
                                                        ('|':rr) ->
let ( inxs , irps ) = oexn ns d rr
                                                                    in
( xs ++ inxs , irps )
                                                        (')':rr) -> case d of

RxTop -> error "Erroneous close parenthesis in pattern "

RxSub -> ( xs , rr )
                                                        []       -> case d of

RxTop -> ( xs , [] )

RxSub -> error "End of pattern while still in sub expression" )
                          in ( nxs , nrps )
      -- and together extracted nodes in a given expression segment -- abd?dfs
      aexn ns pps = let ( ~( xs  , rps  ) ,
                          ~( nxs , nrps ) ) = ( rexn nxs pps ,
                                                case rps of
                                                  ('|':_) -> ( ns , rps )
                                                  (')':_) -> ( ns , rps )
                                                  []      -> ( ns , rps )
                                                  _       -> aexn ns rps )
                    in ( xs , nrps )
      -- replication application - weee!
      rexn ns pps = let ( ~( xs , rps ) ,
                          ~( ~( nxs ) ,
                             ~( rxs , rrps ) ) ) = ( exn nxs pps ,
                                                       case rps of
                                                         ('?':'?':rr)
-> ( ( ns ) ,

    ( ns ++ xs , rr ) )
                                                         ('?':rr)
-> ( ( ns ) ,

    ( xs ++ ns , rr ) )
                                                         ('*':'?':rr)
-> ( ( ns ++ xs ) ,

    ( ns ++ xs , rr ) )
                                                         ('*':rr)
-> ( ( xs ++ ns ) ,

    ( xs ++ ns , rr ) )
                                                         ('+':'?':rr)
-> ( ( ns ++ xs ) ,

    ( xs , rr ) )
                                                         ('+':rr)
-> ( ( xs ++ ns ) ,

    ( xs , rr ) )
                                                         _
-> ( ( ns ) ,

    ( xs , rps ) )
                                                     )
                    in
                      ( rxs , rrps )
      -- extract node ( including an entire subexpression as a single node )
      exn _  ('?':_)  = error "Bad question mark operator"
      exn _  ('*':_)  = error "Bad splat operator"
      exn _  ('+':_)  = error "Bad plus sign operator"
      exn ns ('(':ps) = oexn ns RxSub ps
      exn ns (p:ps)   = ( [ RxTransform ( \rxn k -> case k of
                                                      (RxChar c) -> if c == p
                                                                    then

 [ RxActive { rxTransforms = ns ,

              rxMatched    = c : rxMatched rxn ,

              rxNumSubs    = rxNumSubs rxn ,

              rxSubExprs   = rxSubExprs rxn } ]
                                                                    else
                                                                        []
                                                      (RxStart)  -> [rxn]
                                                      (RxBound)  -> [rxn]
                                                      _          -> []
                                        ) ] ,
                          ps )

      exn ns []       = error "can this be reached?"

      success = RxTransform ( \ rxn k -> [ RxComplete { rxMatched  =
reverse $ rxMatched rxn ,
                                                        rxNumSubs  =
rxNumSubs rxn ,
                                                        rxSubExprs =
map reverse $ rxSubExprs rxn } ] )

rxExec n tts = iexec [n] (rxTokenize tts)
    where
      iexec (win@(RxComplete _ _ _ ):_) _      = Just win
      iexec []                          _      = Nothing
      iexec nns                         (k:ks) = iexec ( concatMap (
\n -> case n of

      (RxComplete _ _ _)   -> [n]

      a@(RxActive _ _ _ _) -> concatMap (\xf -> case xf of

                                                  (RxTransform f)   ->
f a k

                                                  (RxNullTransform) ->
[]

                                        ) (rxTransforms a) ) nns ) ks

main = do
  print $ rxTokenize "this is a test"
  print $ rxExec (rxCompile "hello|world") "hello"
  print $ rxExec (rxCompile "hello|world") "world"
  print $ rxExec (rxCompile "abcde|ab")    "abcd"
  print $ rxExec (rxCompile "ab?c")        "ac"
  print $ rxExec (rxCompile "ab?c")        "abc"
  print $ rxExec (rxCompile "ab*c")        "ac"
  print $ rxExec (rxCompile "ab*c")        "abc"
  print $ rxExec (rxCompile "ab*c")        "abbbc"
  print $ rxExec (rxCompile "ab+c")        "ac"
  print $ rxExec (rxCompile "ab+c")        "abc"
  print $ rxExec (rxCompile "ab+c")        "abbbbbc"
  print $ rxExec (rxCompile "(a|b)+")      "aaabbbabaaababaaabbbabbaba"
  print $ rxExec (rxCompile "abc|")        "zyx"

[Haskell-cafe] Irrefutable pattern love and new regex engine.

Michael Speer