
My optimized (and fixed) version of the code is attached. I benchmarked it with:
module Main(main) where
import KMPSeq import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BC
infile = "endo.dna"
Modified by one character from the original copied from the endo.dna The leading "IIIII" is often overlapping with the suffix of a prefix of substring so KMP should help here
substring1 = BC.pack "IIIIIPIIIPIIIIIPIIIPFFICCPIIIPFFFFFPIIIPFFFFFPIIIPIIFIIPIIIPIIIIIPIIIPIIIIIPIIIPIIIIIPIIIPCCCCCPI"
original "IIIIIPIIIPIIIIIPIIIPFFICCPIIIPFFFFFPIIIPFFFFFPIIIPIIIIIPIIIPIIIIIPIIIPIIIIIPIIIPIIIIIPIIIPCCCCCPI"
getSearch = do B.readFile infile
main :: IO () main = do print . kmpMatch substring1 =<< getSearch
The bug fix is actually a big speed win for this benchmark. The long commentary I wrote has been sent to Justin Bailey with each step I took and each measurement I made. (800 lines with all the pasted profiles). The speedup was from 18.5 seconds to 2.6 seconds on the above benchmark. Cheers, Chris Kuklewicz
From Martin Amis' short story "The Janitor on Mars":
"...The War with the Scythers of the Orion Spur was hotly prosecuted for just over a billion years. Who won? We did. They're still there, the Scythers. Their planet is still there. The nature of war changed, during that trillennium. It was no longer nuclear or quantum-gravitational. It was neurological. Informational. Life goes on for the Scythers, but its quality has been subtly reduced. We fixed it so that they think they're simulations in a deterministic computer universe. It is believed that this is the maximum suffering you can visit on a type-v world." http://www.metafilter.com/mefi/18176 -- Code by Justin Bailey with optimizations by Chris Kuklewicz {-# OPTIONS_GHC -fbang-patterns #-} module KMPSeq (kmpMatch) where import qualified Data.Array.Base as Base (unsafeAt) import qualified Data.Array.Unboxed as Unboxed (UArray) import qualified Data.Array.IArray as IArray ((!), Array, array) import Data.List as List (map, filter, length, null, take, maximum, foldr, all, drop) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString as S import qualified Data.ByteString.Base as B (unsafeHead,unsafeTail,unsafeDrop,unsafeIndex) import GHC.Int (Int64) import Test.QuickCheck import Data.ByteString as S (isSubstringOf, pack, ByteString) -- Only used during testing import Debug.Trace (trace) import System.IO.Unsafe import System.IO {- Returns the first index for which the pattern given is found in the string given, or -1 if the pattern does not appear. -} kmpMatch :: B.ByteString -- ^ The pattern -> B.ByteString -- ^ The string to search -> Int64 -- ^ Result kmpMatch patLazy search | B.null patLazy = 0 -- Empty pattern matches right away. | otherwise = let pat :: S.ByteString pat = S.concat (B.toChunks patLazy) lookupTable = computeLookupS pat patLen = fromIntegral $ S.length pat -- currPatIdx is our position in the pattern - sometimes we don't -- have to match the whole pattern again. -- -- currIdx is our position in the string. We B.drop elements from str -- on each iteration so have to track our position separately. kmpMatch' :: B.ByteString -> Int64 -> Int -> Int64 kmpMatch' !str !currIdx !patShift | B.null str = -1 | otherwise = -- Find length of prefix in our pattern which matches string (possibly zero) let matchLength :: Int matchLength = let matchLength' :: B.ByteString -> Int -> Int matchLength' !str !cnt | cnt == patLen = patLen | B.null str = 0 | B.unsafeIndex pat cnt == B.head str = matchLength' (B.tail str) (succ cnt) | otherwise = cnt in matchLength' str patShift shiftAmt = Base.unsafeAt lookupTable matchLength {-# INLINE matchLen64 #-} matchLen64 = fromIntegral (matchLength-patShift) in case matchLength of _ | matchLength == patLen -> currIdx - fromIntegral patShift -- found complete match | matchLength == patShift -> kmpMatch' (B.tail str) (currIdx + 1) 0 | shiftAmt > 0 -> kmpMatch' (B.drop matchLen64 str) (currIdx + matchLen64) shiftAmt | otherwise -> kmpMatch' (B.drop matchLen64 str) (currIdx + matchLen64) patShift in kmpMatch' search 0 0 -- These tests have failed in the past -- kmpMatch (toLazyBS "ccb") (toLazyBS "abcdcccb") == 5 -- kmpMatch (toLazyBS "bbaa") (toLazyBS "bdcdbbbbaa") == 6 -- kmpMatch (toLazyBS "cccadadc") (toLazyBS "adaccccadadc") == 4 -- This next one has caused an infinite loop previously. -- kmpMatch (toLazyBS "bbbb") (toLazyBS "dddcaaaddaaabdacbcccabbada") {-| Given our pattern, get all the prefixes of the pattern. For each of those prefixes, find the longest prefix from the original pattern that is also a suffix of the prefix segment being considered, and is not equal to it. The argument given to overlap is the length of the prefix matched so far, and the length of the longest prefix, which is a suffix and is not equal to it, is the value overlap returns. If a given prefix has no possible overlap, it is mapped to -1. -} overlap :: S.ByteString -> [(Int, Int)] overlap pat = let patternLength = S.length pat -- Necessary to reverse prefixes since 'isSuffixOf' not implemented for lazy bytestrings. prefixes = List.map S.reverse $ List.take (fromIntegral patternLength) $ S.inits pat longestPreSuffix prefix = -- Find prefixes that are a prefix of this suffix, but -- are not equal to it. let suffixes = List.filter (\p -> p `S.isPrefixOf` prefix && p /= prefix) prefixes in if S.null prefix || List.null suffixes then 0 else List.maximum (List.map S.length suffixes) in List.map (\prefix -> (fromIntegral $ S.length prefix, fromIntegral $longestPreSuffix prefix)) prefixes {- | Given a string representing a search pattern, this function returns a function which represents, for each prefix of that pattern, the maximally long prefix of the pattern which is a suffix of the indicated pattern segment. If there is no such prefix, 0 is returned. -} computeLookupS :: S.ByteString -> Unboxed.UArray Int Int computeLookupS pat = let patLen = fromIntegral $ S.length pat table :: Unboxed.UArray Int Int table = {-# SCC "computeLookup_table" #-} IArray.array (0, patLen - 1) (overlap pat) in table computeLookup :: B.ByteString -> (Int -> Int) computeLookup pat = let patLen = fromIntegral $ B.length pat table :: Unboxed.UArray Int Int table = {-# SCC "computeLookup_table" #-} IArray.array (0, patLen - 1) (overlap . S.concat . B.toChunks $ pat) in Base.unsafeAt table -- Types, instances and utility functions for testing purposes. newtype PatternChar = PatternChar Char deriving Show instance Arbitrary PatternChar where arbitrary = oneof (List.map (return . PatternChar) ['a', 'b', 'c', 'd']) patternsToString :: [PatternChar] -> B.ByteString patternsToString chars = B.pack $ List.foldr (\(PatternChar char) str -> (toEnum $ fromEnum char) : str) [] chars patternsToStrictString :: [PatternChar] -> S.ByteString patternsToStrictString chars = S.pack $ List.foldr (\(PatternChar char) str -> (toEnum $ fromEnum char) : str) [] chars toLazyBS :: String -> B.ByteString toLazyBS = B.pack . List.map (toEnum . fromEnum) toStrictBS :: String -> S.ByteString toStrictBS = S.pack . List.map (toEnum . fromEnum) -- Test that 0 and 1 element always return 0, if present. prop_testZero :: [PatternChar] -> Property prop_testZero pat = let lookup = computeLookup (patternsToString pat) in not (List.null pat) ==> if List.length pat > 1 then lookup 0 == 0 && lookup 1 == 0 else lookup 0 == 0 -- Test that all overlaps found are actually prefixes of the -- pattern string prop_testSubset :: [PatternChar] -> Property prop_testSubset pat = let patStr = patternsToString pat lookup = computeLookup patStr prefix len = B.take (fromIntegral len) testPrefix len = if lookup len == 0 then True else (prefix (lookup len) patStr) `B.isPrefixOf` (prefix len patStr) in not (List.null pat) ==> trivial (B.null patStr) $ List.all testPrefix [0 .. List.length pat - 1] -- Test that the prefix given is the maximal prefix. That is, -- add one more character makes it either equal to the string -- or not a prefix. prop_testCorrectPrefix :: [PatternChar] -> Property prop_testCorrectPrefix pat = let patStr = patternsToString pat lookup = computeLookup patStr isNeverSuffix len = let origPrefix = prefix len patStr -- Drop 1 to remove empty list allPrefixes = List.drop 1 $ B.inits origPrefix in List.all (\p -> B.null p || p == origPrefix || not ((B.reverse p) `B.isPrefixOf` (B.reverse origPrefix))) allPrefixes prefix len = B.take (fromIntegral len) -- True if the prefix returned from lookup for the length given produces -- a string which is a suffix of the original prefix. isRealSuffix len = (B.reverse (prefix (lookup len) patStr)) `B.isPrefixOf` (B.reverse $ prefix len patStr) isLongestSuffix len = let prefixPlus = prefix ((lookup len) + 1) patStr inputPrefix = prefix len patStr in prefixPlus == inputPrefix || not ((B.reverse prefixPlus) `B.isPrefixOf` (B.reverse inputPrefix)) testTable len = if lookup len == 0 then isNeverSuffix len else isRealSuffix len && isLongestSuffix len in not (List.null pat) ==> List.all testTable [0 .. List.length pat - 1] -- Verify that, if a match is found, it is where it's supposed to be in -- the string and it can be independently found by other means. prop_testMatch :: [PatternChar] -> [PatternChar] -> Property prop_testMatch pat search = let patStr = patternsToString pat searchStr = patternsToString search strictStr = patternsToStrictString search strictPat = patternsToStrictString pat patLen = B.length patStr searchLen = B.length searchStr matchIdx = kmpMatch patStr searchStr verify = if matchIdx > -1 then (B.take patLen $ B.drop matchIdx $ searchStr) == patStr && strictPat `S.isSubstringOf` strictStr else (B.null patStr && B.null searchStr) || not (strictPat `S.isSubstringOf` strictStr) in not (List.null pat) ==> trivial (B.null searchStr) $ classify (patLen > searchLen) "Bigger pattern than search" $ classify (patLen < searchLen) "Smaller pattern than search" $ classify (patLen == searchLen) "Equal pattern and search" $ classify (matchIdx > -1) "Match Exists" $ classify (matchIdx == -1) "Match Doesn't Exist" $ verify -- Test a pattern that was known to fail. prop_testBadPat :: [PatternChar] -> Property prop_testBadPat search = let patStr = toLazyBS "bbc" patLen = B.length patStr searchStr = patternsToString search matchIdx = kmpMatch patStr searchStr strictStr = patternsToStrictString search strictPat = toStrictBS "bbc" verify = if matchIdx > -1 then (B.take patLen $ B.drop matchIdx $ searchStr) == patStr && strictPat `S.isSubstringOf` strictStr else (B.null patStr && B.null searchStr) || not (strictPat `S.isSubstringOf` strictStr) in trivial (List.null search) $ verify -- Test that a pattern on the end of the string is found OK. prop_testEndPattern :: [PatternChar] -> [PatternChar] -> Property prop_testEndPattern pat search = let patStr = patternsToString pat searchStr = patternsToString (search ++ pat) matchIdx = kmpMatch patStr searchStr strictStr = patternsToStrictString (search ++ pat) strictPat = patternsToStrictString pat patLen = B.length patStr verify = if matchIdx > -1 then (B.take patLen $ B.drop matchIdx $ searchStr) == patStr && strictPat `S.isSubstringOf` strictStr else (B.null patStr && B.null searchStr) || not (strictPat `S.isSubstringOf` strictStr) in not (List.null pat) ==> trivial (B.null searchStr) $ classify (matchIdx > -1) "Match Exists" $ classify (matchIdx == -1) "Match Doesn't Exists" $ verify props1 = [ prop_testZero , prop_testSubset , prop_testCorrectPrefix , prop_testBadPat ] props2 = [ prop_testMatch , prop_testEndPattern ] allTests = do mapM_ quickCheck props1 mapM_ quickCheck props2 {- *KMPSeq> allTests OK, passed 100 tests. OK, passed 100 tests. OK, passed 100 tests. OK, passed 100 tests (15% trivial). OK, passed 100 tests. 41% Bigger pattern than search, Match Doesn't Exist. 24% Smaller pattern than search, Match Doesn't Exist. 12% trivial, Bigger pattern than search, Match Doesn't Exist. 11% Smaller pattern than search, Match Exists. 9% Equal pattern and search, Match Doesn't Exist. 3% Equal pattern and search, Match Exists. OK, passed 100 tests (100% Match Exists). -}