-- | Conversion between lists of UTF-8 encoded bytes and lists of 'Char'.
--
-- Both directions understand the original 1- to 6-byte form of UTF-8
-- (payloads of up to 31 bits).  Malformed input is reported with 'error'.
module UTF8
  ( fromUTF8
  , toUTF8
  ) where

import Data.Word
import Data.Bits

-- | Bitwise AND under a readable name.
--
-- The explicit signature is required: without it the monomorphism
-- restriction pins this binding to a single type, yet it is used at both
-- 'Word8' and 'Int' below.
mask :: Bits a => a -> a -> a
mask = (.&.)

-- | Bitwise OR under a readable name.
add :: Bits a => a -> a -> a
add = (.|.)

-- | Decode a UTF-8 byte sequence into characters.
--
-- The result is produced lazily, one character per complete sequence.
-- Calls 'error' when
--
-- * a continuation byte (@10xxxxxx@) appears where a lead byte is expected,
-- * the lead byte is @0xfe@ or @0xff@,
-- * the input ends in the middle of a sequence,
-- * a trailing byte is not of the form @10xxxxxx@, or
-- * the sequence is longer than necessary for its value (overlong form).
--
-- Decoded values that do not fit in 'Char' are not checked here; they are
-- passed to 'toEnum' as they are.
fromUTF8 :: [Word8] -> [Char]
fromUTF8 [] = []
fromUTF8 (w:ws)
  | w < 0x80  {- 0xxxxxxx -} = toEnum (fromEnum w) : fromUTF8 ws
  | w < 0xc0  {- 10xxxxxx -} =
      error "incorrect UTF-8 encoding: wrong 7th-bit in first byte"
  | w < 0xe0  {- 110xxxxx -} = multi 1 0x80      (w `mask` 0x1f)
  | w < 0xf0  {- 1110xxxx -} = multi 2 0x800     (w `mask` 0x0f)
  | w < 0xf8  {- 11110xxx -} = multi 3 0x10000   (w `mask` 0x07)
  | w < 0xfc  {- 111110xx -} = multi 4 0x200000  (w `mask` 0x03)
  | w < 0xfe  {- 1111110x -} = multi 5 0x4000000 (w `mask` 0x01)
  | otherwise {- 1111111x -} =
      error "incorrect UTF-8 encoding: invalid first byte"
  where
    -- Start a multi-byte sequence: number of trailing bytes still expected,
    -- the smallest value that legitimately needs a sequence of this length,
    -- and the payload bits taken from the lead byte.
    multi :: Int -> Int -> Word8 -> [Char]
    multi n lowest payload = bytes n lowest (fromEnum payload) ws

    -- Consume the trailing bytes, accumulating six payload bits from each.
    bytes :: Int -> Int -> Int -> [Word8] -> [Char]
    bytes 0 lowest acc rest
      -- A value below the minimum for this length could have been written
      -- with fewer bytes, so the encoding is overlong.
      | acc < lowest = error "incorrect UTF-8 encoding: non-minimal"
      | otherwise    = toEnum acc : fromUTF8 rest
    bytes _ _ _ [] = error "incorrect UTF-8 encoding: missing bytes"
    bytes n lowest acc (t:rest)
      | t < 0x80  = error "incorrect UTF-8 encoding: 8-bit not set in trailer"
      | t >= 0xc0 = error "incorrect UTF-8 encoding: 7th-bit set in trailer"
      | otherwise =
          bytes (n - 1) lowest
                ((acc `shiftL` 6) `add` fromEnum (t `mask` 0x3f)) rest

-- | Encode characters as a UTF-8 byte sequence.
--
-- Each character is converted with 'fromEnum' and written in the shortest
-- form that holds its value.
toUTF8 :: [Char] -> [Word8]
toUTF8 = utf8 . map fromEnum
  where
    utf8 :: [Int] -> [Word8]
    utf8 [] = []
    utf8 (c:cs)
      | c < 0x80       = toEnum c : utf8 cs
      | c < 0x800      = bytes 1 (add 0xc0) c $ utf8 cs
      | c < 0x10000    = bytes 2 (add 0xe0) c $ utf8 cs
      | c < 0x200000   = bytes 3 (add 0xf0) c $ utf8 cs
      | c < 0x4000000  = bytes 4 (add 0xf8) c $ utf8 cs
      | c < 0x7fffffff = bytes 5 (add 0xfc) c $ utf8 cs
      | otherwise      = error "toUTF8: character outside permissible range"

    -- Emit @n@ trailing bytes for @c@, working from the least significant
    -- six bits upwards and prepending each byte to @rest@; once no trailing
    -- bytes remain, the leftover high bits are combined with the
    -- length-marking header to form the lead byte.
    bytes :: Int -> (Word8 -> Word8) -> Int -> [Word8] -> [Word8]
    bytes 0 header c rest = header (toEnum c) : rest
    bytes n header c rest =
      bytes (n - 1) header (c `shiftR` 6) $
        toEnum ((c `mask` 0x3f) + 0x80) : rest