Re: [Haskell-cafe] Difference between Lazy ByteStrings and Strings in alex

14 Feb 2007


      Yes, that was a typo :-)

On Tuesday 13 February 2007 22:54, Stefan O'Rear wrote:
...
On Tue, Feb 13, 2007 at 10:43:11PM -0500, Jefferson Heard wrote:
...
I am running GHC 2.6 now, and am using -O3 as my optimization parameter. 
I'm
I think you will get much better performance with GHC 6.6.  The optimizer
has been improved a *lot* in the last 10 years.
(I hope that was a typo!!)
...
Non-lazy version
{
module Main
    where
import qualified FileReader
}
%wrapper "basic"
$letter = [a-zA-Z]
$digit = 0-9
$alphanum = [a-zA-Z0-9]
$punct = [\! \@ \# \$ \% \^ \& \* \( \) \_ \- \+ \= \{ \[ \} \] \\ \| \;
\: \' \" \, \. \? \/ \` \~]
$dec = \.
$posneg = [\- \+]
@date1 = jan($punct|uary)?\ $digit{1,2}(\,\ $digit{2,4})?
| feb($punct|ruary)?\ $digit{1,2}(\,\ $digit{2,4})?
       | mar($punct|ch)?\ $digit{1,2}(\,\ $digit{2,4})?
       | apr($punct|il)?\ $digit{1,2}(\,\ $digit{2,4})?
       | may?\ $digit{1,2}(\,\ $digit{2,4})?
       | jun($punct|e)?\ $digit{1,2}(\,\ $digit{2,4})?
       | jul($punct|y)?\ $digit{1,2}(\,\ $digit{2,4})?
       | aug($punct|ust)?\ $digit{1,2}(\,\ $digit{2,4})?
       | sep($punct|tember)?\ $digit{1,2}(\,\ $digit{2,4})?
       | sept($punct)?\ $digit{1,2}(\,\ $digit{2,4})?
       | oct($punct|ober)?\ $digit{1,2}(\,\ $digit{2,4})?
       | nov($punct|ember)?\ $digit{1,2}(\,\ $digit{2,4})?
       | dec($punct|ember)?\ $digit{1,2}(\,\ $digit{2,4})?
@date2 = $digit{1,2} $punct $digit{1,2} $punct $digit{2,4}
@time = $digit{1,2} \: $digit{2} (am|pm)?
@word = $alphanum+
@number = $posneg? $digit+
| $posneg? $digit+ $dec $digit+
        | $posneg? $digit+ (\,$digit{3})+
        | $posneg? $digit? (\,$digit{3})+ $dec $digit+
$white = [\t\r\n\v\f\ ]
@doc = \< DOC \>
@tag = \< $alphanum+ \>
| \<\/ $alphanum+ \>
tokens :-
  @doc    { \s -> "" }
  @tag    ;
  $white+ ;
  @time   { \s -> s }
  @number { \s -> s }
  @word   { \s -> s }
  $punct  ;
  .       ;
{
printCount c [] = print c
printCount c (l:ls) = if l == "" then printCount (c+1) ls else printCount
c ls
main = do
    file <- readFile "trecfile1"
    printCount 0 (alexScanTokens file)
}
FTR, regular strings are lazy - too lazy, which is where the performance
problems come from.
...
--
-------------------------------------------------------------------------
----------------------------------- Version depending on ByteString.Lazy
-- note that the grammar is the same, so it has been omitted
--
-------------------------------------------------------------------------
-----------------------------------
... grammar ...
{
type AlexInput = (Char,     -- previous char
                  B.ByteString)   -- current input string
takebytes :: Int -> B.ByteString -> String
takebytes (0) _ =  ""
takebytes n s = c : takebytes (n-1) cs
    where c = B.index s 0
          cs = B.drop 1 s
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (_, bytestring)
| bytestring == B.empty = Nothing
    | otherwise             = Just (c , (c,cs))
where c = B.index bytestring 0
          cs = B.drop 1 bytestring
Hm, you might do better with more specialized functions.
...
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (_, bytestring)
| B.null bytestring = Nothing
    | otherwise         = Just (c , (c,cs))
where c = B.head bytestring
          cs = B.tail bytestring
or even:
...
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (_, bytestring)
| B.null bytestring = Nothing
    | otherwise         = Just (c , (c,cs))
where c = B.unsafeHead bytestring
          cs = B.unsafeTail bytestring
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (c,_) = c
If you are certian this isn't the first character, you might do better
using B.unsafeIndex (-1).
...
alexScanTokens :: B.ByteString -> [String]
alexScanTokens str = go ('\n',str)
  where go inp@(_,str) =
          case alexScan inp 0 of
                AlexToken inp' len act -> act (takebytes len str) : go
inp' AlexSkip  inp' len     -> go inp'
                AlexEOF -> []
                AlexError _ -> error "lexical error"
printCount :: Int -> [String] -> IO ()
printCount c [] = print c
printCount c (l:ls) = if l == "" then printCount (c+1) ls else printCount
c ls
main = do
    file <- B.readFile "trecfile1"
    printCount 0 (alexScanTokens file)
}