diff -ur regex-pcre-0.94.4.orig/Text/Regex/PCRE/String.hs regex-pcre-0.94.4/Text/Regex/PCRE/String.hs --- regex-pcre-0.94.4.orig/Text/Regex/PCRE/String.hs 2012-05-30 18:44:14.000000000 -0300 +++ regex-pcre-0.94.4/Text/Regex/PCRE/String.hs 2012-08-23 17:22:14.114641657 -0300 @@ -46,11 +46,16 @@ ) where import Text.Regex.PCRE.Wrap -- all -import Foreign.C.String(withCStringLen,withCString) -import Data.Array(Array,listArray) +import Foreign.C.String(CStringLen,withCStringLen,withCString) +import Foreign.Storable(peekByteOff) +import Data.Word(Word8) +import Data.Array.IO(IOUArray,newArray,readArray,writeArray) +import Data.Array(Array,listArray,bounds,elems) import System.IO.Unsafe(unsafePerformIO) -import Text.Regex.Base.RegexLike(RegexMaker(..),RegexLike(..),RegexContext(..),MatchLength,MatchOffset) +import Text.Regex.Base.RegexLike(RegexMaker(..),RegexLike(..),RegexContext(..),MatchLength,MatchOffset,MatchArray) import Text.Regex.Base.Impl(polymatch,polymatchM) +import GHC.IO.Encoding(getForeignEncoding,textEncodingName) +import Control.Monad(forM) instance RegexContext Regex String String where match = polymatch @@ -72,7 +77,7 @@ matchOnce regex str = unsafePerformIO $ execute regex str >>= unwrap matchAll regex str = unsafePerformIO $ - withCStringLen str (wrapMatchAll regex) >>= unwrap + withCStringLen str (wrapMatchAllFixPos regex) >>= unwrap matchCount regex str = unsafePerformIO $ withCStringLen str (wrapCount regex) >>= unwrap @@ -91,7 +96,7 @@ -- string, or: -- 'Just' an array of (offset,length) pairs where index 0 is whole match, and the rest are the captured subexpressions. execute regex str = do - maybeStartEnd <- withCStringLen str (wrapMatch 0 regex) + maybeStartEnd <- withCStringLen str (wrapMatchFixPos 0 regex) case maybeStartEnd of Right Nothing -> return (Right Nothing) -- Right (Just []) -> fail "got [] back!" -- should never happen @@ -115,9 +120,94 @@ ,getSub matchedStartStop ,drop stop str ,map getSub subStartStop) - maybeStartEnd <- withCStringLen str (wrapMatch 0 regex) + maybeStartEnd <- withCStringLen str (wrapMatchFixPos 0 regex) case maybeStartEnd of Right Nothing -> return (Right Nothing) -- Right (Just []) -> fail "got [] back!" -- should never happen Right (Just parts) -> return . Right . Just . matchedParts $ parts Left err -> return (Left err) + + + +-- | wrapMatchFixPos calls wrapMatch and fixes the string offsets +-- in the result so that they are valid in the original Haskell string +-- +-- +wrapMatchFixPos :: StartOffset + -> Regex + -> CStringLen + -> IO (Either WrapError (Maybe [(StartOffset,EndOffset)])) +wrapMatchFixPos startOffset regex cstr_len = do + maybeStartEnd <- wrapMatch startOffset regex cstr_len + case maybeStartEnd of + Right (Just parts) -> do maybeMapPos <- decode_positions cstr_len + case maybeMapPos of + Just mapPos -> fmap (Right . Just) $ forM parts $ \(s,e)-> do + s' <- readArray mapPos s + e' <- readArray mapPos (pred e) + return (s',succ e') + Nothing -> return maybeStartEnd + _ -> return maybeStartEnd + +-- | wrapMatchAllFixPos calls wrapMatchAll and fixes the string offsets +-- in the result so that they are valid in the original Haskell string +-- +-- +wrapMatchAllFixPos :: Regex + -> CStringLen + -> IO (Either WrapError [ MatchArray ]) +wrapMatchAllFixPos regex cstr_len = do + putStrLn "wrapMatchAllFixPos" + maybeStartLen <- wrapMatchAll regex cstr_len + case maybeStartLen of + Right parts -> do maybeMapPos <- decode_positions cstr_len + case maybeMapPos of + Just mapPos -> fmap Right $ forM parts $ \arr -> + fmap (listArray (bounds arr)) $ forM (elems arr) $ \(s,n) -> do + s' <- readArray mapPos s + return (s',n) + Nothing -> return maybeStartLen + _ -> return maybeStartLen + + +-- | utf8_range determines how many bytes are needed to represent a UTF-8 +-- character given its first byte +-- +-- +utf8_range :: Word8 -> Int +utf8_range c | c <= 0x7f = 1 + | c <= 0xdf = 2 + | c <= 0xef = 3 + | otherwise = 4 + + +-- | utf8_decode_pos_array constructs an array that maps positions in an +-- UTF-8 encoded C string to the related positions in the corresponding +-- unencoded Haskell string. +-- +-- +utf8_decode_pos_array :: CStringLen -> IO (IOUArray Int Int) +utf8_decode_pos_array (cstr,len) = + do arr <- newArray (0,pred len) 0 :: IO (IOUArray Int Int) + + let loop x i n + | i == len = return arr + | n == 0 = do c <- peekByteOff cstr i :: IO Word8 + loop (succ x) i (utf8_range c) + | otherwise = do writeArray arr i x + loop x (succ i) (pred n) + + loop (-1) 0 0 + +-- | decode_positions constructs an array that maps positions in an +-- encoded C string to the related positions in the corresponding unencoded +-- Haskell string. Currently it works only for UTF-8 strings. +-- +-- +decode_positions :: CStringLen -> IO (Maybe (IOUArray Int Int)) +decode_positions cstr_len = do + enc <- getForeignEncoding + case textEncodingName enc of + "UTF-8" -> fmap Just $ utf8_decode_pos_array cstr_len + _ -> return Nothing +