1 {-# LANGUAGE OverloadedStrings #-}
8 import Control.Monad.State
9 import Data.Bits (shiftL, shiftR, (.&.))
10 import qualified Data.ByteString.Char8 as BS
11 import qualified Data.ByteString.Lazy as BL
12 import Data.Digest.Adler32 (adler32)
13 import qualified Data.Map as M
14 import Data.Word (Word8, Word32)
15 import qualified Crypto.Hash.MD4 as MD4
17 type Md4digest = BS.ByteString
18 type Adler32checksum = Word32
20 type Checksum = (Word32, Int, Int)
22 type Signature = (Md4digest, Adler32checksum, Int)
24 fileSignatures :: BL.ByteString -> Integer -> [Signature]
25 fileSignatures bs blockSize = zip3 strongsigs weaksigs [0..]
26 where strongsigs = map blockSig (splitBS bs blockSize)
27 weaksigs = map adler32 (splitBS bs blockSize)
29 splitBS :: BL.ByteString -> Integer -> [BL.ByteString]
30 splitBS bs blockSize | fromIntegral (BL.length bs) < blockSize = [bs]
31 splitBS bs blockSize = BL.take (fromIntegral blockSize) bs :
32 splitBS (BL.drop (fromIntegral blockSize) bs) blockSize
34 -- compute md4 digest (128 bits)
35 blockSig :: BL.ByteString -> BS.ByteString
36 blockSig = MD4.hash . BL.toStrict
38 weakSig :: BL.ByteString -> Adler32checksum
41 data Instruction = RChar Word8
45 genInstructions :: [Signature] -> Integer -> BL.ByteString -> [Instruction]
46 genInstructions f0sigs blockSize fnew =
47 evalState (go 0 fnew) sig0
49 sig0 = weakSig $ BL.take (fromIntegral blockSize) fnew
50 go :: Integer -> BL.ByteString -> State Adler32checksum [Instruction]
51 go startIdx fnew | fnew == BL.empty = return []
53 let (blk, blks) = BL.splitAt (fromIntegral blockSize) fnew
54 endIdx = startIdx + fromIntegral (BL.length blk) - 1
56 let matches = M.lookup adlerSum f0AdlerTable >>
57 M.lookup (blockSig blk) f0MD4Table
60 -- modify (`adler32Update` blk)
61 put $ rollingChecksum (fromIntegral startIdx) (fromIntegral endIdx) fnew
62 is <- go (endIdx + 1) blks
63 return $ RBlk (head idxs) : is
66 c' = BL.head blk -- FIX (should have been blks)
67 -- modify (`adler32Update` BL.singleton c)
68 put $ rollingChecksumUpdate adlerSum c c' startIdx endIdx
69 is <- go (startIdx + 1) (BL.tail (blk `mappend` blks))
71 f0AdlerTable = toAdlerMap f0sigs
72 f0MD4Table = toMD4Map f0sigs
74 toAdlerMap :: [Signature] -> M.Map Adler32checksum [Int]
75 toAdlerMap = foldr f M.empty
76 where f sig m = let (_, aSig, idx) = sig in M.insertWith (++) aSig [idx] m
78 toMD4Map :: [Signature] -> M.Map Md4digest [Int]
79 toMD4Map = foldr f M.empty
80 where f sig m = let (mSig, _, idx) = sig in M.insertWith (++) mSig [idx] m
82 recreate :: BL.ByteString -> Integer -> [Instruction] -> BL.ByteString
83 recreate f0 blockSize ins =
84 let f0blocks = splitBS f0 blockSize
87 where go f0blocks [] = mempty
88 go f0blocks (inst:insts) =
90 RBlk i -> (f0blocks !! i) `mappend` go f0blocks insts
91 RChar w -> BL.singleton w `mappend` go f0blocks insts
93 rollingChecksum :: Int -> Int -> BL.ByteString -> Word32
94 rollingChecksum strtIdx endIdx bs = a `mod` m + ((fromIntegral b) `mod` m) `shiftL` mb
95 where a = BL.foldl (\acc x -> acc + (fromIntegral x)) 0 bs'
96 b = BL.foldl (\acc x -> acc + x) 0 (BL.pack wbs')
97 bs' = BL.take (fromIntegral (endIdx - strtIdx + 1)) bs
100 wbs' = BL.zipWith (*) (BL.pack (reverse (map fromIntegral [1..(endIdx - strtIdx + 1)]))) bs'
102 checksum :: BL.ByteString -> Int -> Int -> Checksum
103 checksum bs strtIdx endIdx = (csval, strtIdx, endIdx)
104 where csval = a `mod` m + ((fromIntegral b) `mod` m) `shiftL` size
105 buffer = map fromIntegral $ take (endIdx - strtIdx) $ drop strtIdx $ BL.unpack bs
106 indices = map fromIntegral [1..(endIdx - strtIdx + 1)]
108 b = sum $ zipWith (*) (reverse indices) buffer
112 -- given the checksum a(k, l) and b(k, l), find checksum a(k+1, l+1), b(k+1, l+1)
113 checksumUpdate :: Checksum -> BL.ByteString -> Checksum
114 checksumUpdate curCheckSum bs = (csval, oldStrtIdx + 1, oldEndIdx + 1)
115 where (oldChecksum, oldStrtIdx, oldEndIdx) = curCheckSum
116 csval = a `mod` m + ((fromIntegral b) `mod` m) `shiftL` size
119 bold = oldChecksum `shiftR` size
120 aold = oldChecksum .&. (m - 1)
121 xk = head $ drop oldStrtIdx $ BL.unpack bs
122 xlPlus1 = head $ drop (oldEndIdx + 1) $BL.unpack bs
123 a = aold - fromIntegral xk + fromIntegral xlPlus1
124 b = a + bold - (fromIntegral (oldEndIdx - oldStrtIdx + 1))
126 -- given the checksum of bytes from index: startIdx to endIdx, find
127 -- the checksum for the block from (startIdx + 1 .. endIdx + 1)
128 rollingChecksumUpdate :: Word32 -> Word8 -> Word8 -> Integer -> Integer -> Word32
129 rollingChecksumUpdate oldChecksum old new strtIdx endIdx =
130 let b_Old = (oldChecksum `shiftR` 16) .&. 0xff
131 a_Old = (oldChecksum .&. 0xff)
132 a_New = (a_Old - (fromIntegral old) + (fromIntegral new)) `mod` m
133 b_New = (b_Old - ((fromIntegral endIdx) - (fromIntegral strtIdx) + 1) * (fromIntegral old) + a_New) `mod` m
136 a_New `mod` m + (b_New `mod` m) * m