I'm working on measuring and improving the performance of the text library at the moment, and the very first test I tried demonstrated a piece of behaviour that I'm not completely able to understand. Actually, I'm not able to understand what's going on at all, beyond a very shallow level. All the comments below pertain to GHC 6.10.4.
import qualified Data.ByteString as B
-- Imported unqualified but aliased, so 'T.decodeUtf8' below resolves
-- through the same alias as the qualified "Data.Text" import.
import Data.Text.Encoding as T
import qualified Data.Text as T
import System.Environment (getArgs)
import Control.Monad (forM_)

-- For every path given on the command line: read the file as a strict
-- 'B.ByteString', decode it with 'T.decodeUtf8', and print 'T.length' of
-- the result.
main = do
  args <- getArgs
  forM_ args $ \a -> do
    s <- B.readFile a
    -- Plain (lazy) let binding: nothing here forces 't' before it is
    -- handed to 'T.length'.
    let t = T.decodeUtf8 s
    print (T.length t)
-- Builds a 'Stream' of 'Char' over a 'ByteString': the stepper is 'next',
-- the initial state is index 0, and the third field is @maxSize l@.
-- The 'onErr' argument is not used in the part of the definition shown.
streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
streamUtf8 onErr bs = Stream next 0 (maxSize l)
    where
      l = B.length bs
      -- The state 'i' is compared against the input length 'l'; reaching it
      -- ends the stream.
      next i
          | i >= l          = Done
          -- 'x1' is bound in the elided part of the definition (presumably
          -- the byte at index 'i' -- not shown here). This guard yields one
          -- 'Char' and advances the state by 1.
          | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1)
          -- Remaining guards were elided in the original listing.
          | {- etc. -}
-- Phase-controlled inlining: not inlined before simplifier phase 0.
{-# INLINE [0] streamUtf8 #-}
And the streaming length is:

-- 'length' on 'Text': convert the 'Text' with 'Stream.stream' and hand the
-- result to 'Stream.length'.
length :: Text -> Int
length t = Stream.length (Stream.stream t)
-- Unconditional INLINE (no phase restriction).
{-# INLINE length #-}
-- 'length' on a 'Stream' of 'Char': 'S.lengthI' used at result type 'Int'.
length :: Stream Char -> Int
length = S.lengthI
-- Phase-controlled inlining: not inlined before simplifier phase 1.
{-# INLINE[1] length #-}
-- Counts the 'Yield' steps of a stream, for any 'Integral' result type.
-- The stream's third field ('_len') is ignored.
lengthI :: Integral a => Stream Char -> a
lengthI (Stream next s0 _len) = loop_length 0 s0
    where
      -- 'z' is the running count; the bang pattern keeps it evaluated on
      -- every iteration. 's' is the current stream state.
      loop_length !z s = case next s of
                           Done       -> z                      -- end of stream: return the count
                           Skip    s' -> loop_length z s'       -- no element: count unchanged
                           Yield _ s' -> loop_length (z + 1) s' -- one element: bump the count
-- Phase-controlled inlining: not inlined before simplifier phase 0.
{-# INLINE[0] lengthI #-}
-- Same driver as the lazy-'let' version of 'main', except that the decoded
-- text is bound with a bang pattern. (Bang patterns in 'let' need GHC's
-- BangPatterns extension.)
main = do
  args <- getArgs
  forM_ args $ \a -> do
    s <- B.readFile a
    -- The bang forces 't' to weak head normal form before 'print' runs.
    let !t = decodeUtf8 s {- <-- notice the strictness annotation -}
    print (T.length t)