From: Ramakrishnan Muthukrishnan Date: Mon, 28 Dec 2015 03:04:09 +0000 (+0530) Subject: closer to the rsync algorithm technical paper X-Git-Url: https://git.rkrishnan.org/uri/nxhtml.html?a=commitdiff_plain;h=0c12b50c31b904bb3ff6509940ef9f9aef4ea7c8;p=hs-rsync.git closer to the rsync algorithm technical paper --- diff --git a/app/Main.hs b/app/Main.hs index 0393ce4..f87a0fa 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -9,12 +9,13 @@ main :: IO () main = do let fileRemote = "fasljlajljalfjlajfasdjkg;fdk;kqpitpk;k;asdk;kg;adskg" fileLocal = "ljljalsjdgljadslfjlasjdfqporiuqplsadljfaljdf" + blockSize = 5 -- generate signatures at block boundaries for the local file and send it to remote. - fileLocalSigs = fileSignatures (BL.fromStrict (BS.pack fileLocal)) 5 + fileLocalSigs = fileSignatures (BL.fromStrict (BS.pack fileLocal)) blockSize -- at remote, take the signatures from the other size and generate instructions. - insns = genInstructions fileLocalSigs 5 (BL.fromStrict (BS.pack fileRemote)) + insns = genInstructions fileLocalSigs blockSize (BL.fromStrict (BS.pack fileRemote)) -- at the local side, take those instructions and apply to fileLocal - fileLocalNew = recreate (BL.fromStrict (BS.pack fileLocal)) 5 insns + fileLocalNew = recreate (BL.fromStrict (BS.pack fileLocal)) blockSize insns putStrLn $ "remote: " ++ fileRemote putStrLn $ "local: " ++ fileLocal BS.putStrLn $ (BS.pack "recreated: ") `BS.append` (BL.toStrict fileLocalNew) diff --git a/hs-rsync.cabal b/hs-rsync.cabal index c7394d9..14ac841 100644 --- a/hs-rsync.cabal +++ b/hs-rsync.cabal @@ -18,7 +18,10 @@ library exposed-modules: Lib build-depends: base >= 4.7 && < 5 , bytestring + , containers , cryptohash + , digest + , mtl default-language: Haskell2010 executable hs-rsync-exe diff --git a/src/Lib.hs b/src/Lib.hs index 8cfb325..693cc19 100644 --- a/src/Lib.hs +++ b/src/Lib.hs @@ -5,15 +5,25 @@ module Lib , recreate ) where +import Control.Monad.State +import Data.Bits (shiftL, (.&.), (.|.)) import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy as BL -import Data.Word (Word8) -import qualified Crypto.Hash.SHA1 as SHA1 +import Data.Char (ord) +import Data.Digest.Adler32 (adler32) +import qualified Data.Map as M +import Data.Word (Word8, Word32) +import qualified Crypto.Hash.MD4 as MD4 -type Signature = (BS.ByteString, Int) +type Md4digest = BS.ByteString +type Adler32checksum = Word32 + +type Signature = (Md4digest, Adler32checksum, Int) fileSignatures :: BL.ByteString -> Integer -> [Signature] -fileSignatures bs blockSize = zip (map blockSig (splitBS bs blockSize)) [0..] +fileSignatures bs blockSize = zip3 strongsigs weaksigs [0..] + where strongsigs = map blockSig (splitBS bs blockSize) + weaksigs = map adler32 (splitBS bs blockSize) splitBS :: BL.ByteString -> Integer -> [BL.ByteString] splitBS bs blockSize | fromIntegral (BL.length bs) < blockSize = [bs] @@ -21,8 +31,12 @@ splitBS bs blockSize | otherwise = (BL.take (fromIntegral blockSize) bs) : splitBS (BL.drop (fromIntegral blockSize) bs) blockSize +-- compute md4 digest (128 bits) blockSig :: BL.ByteString -> BS.ByteString -blockSig = SHA1.hash . BL.toStrict +blockSig = MD4.hash . BL.toStrict + +weakSig :: BL.ByteString -> Adler32checksum +weakSig = adler32 data Instruction = RChar Word8 | RBlk Int @@ -30,16 +44,40 @@ data Instruction = RChar Word8 genInstructions :: [Signature] -> Integer -> BL.ByteString -> [Instruction] genInstructions f0sigs blockSize fnew = - if (fnew == BL.empty) - then [] - else - let (blk, blks) = BL.splitAt (fromIntegral blockSize) fnew - sig = blockSig blk - in - case (lookup sig f0sigs) of - Just (idx) -> RBlk (fromIntegral idx) : genInstructions f0sigs blockSize blks - Nothing -> RChar (BL.head blk) : - genInstructions f0sigs blockSize (BL.tail (blk `mappend` blks)) + -- create two hash tables one with adler32 as key and list of block numbers as values + -- another with md4sum as key and block numbers as values. + evalState (go fnew) 0 + where + go :: BL.ByteString -> State Word32 [Instruction] + go fnew | (fnew == BL.empty) = return [] + | otherwise = + let (blk, blks) = BL.splitAt (fromIntegral blockSize) fnew + adlerSum = weakSig blk + in + case M.lookup adlerSum f0AdlerTable of + Nothing -> do + put adlerSum + is <- go (BL.tail (blk `mappend` blks)) + return $ RChar (BL.head blk) : is + Just _ -> + let md4sum = blockSig blk + in + case M.lookup md4sum f0MD4Table of + Just i -> do + is <- go blks + return $ RBlk (head i) : is + Nothing -> + return [RChar (BL.head blk)] + f0AdlerTable = toAdlerMap f0sigs + f0MD4Table = toMD4Map f0sigs + +toAdlerMap :: [Signature] -> M.Map Adler32checksum [Int] +toAdlerMap = foldr f M.empty + where f sig m = let (_, aSig, idx) = sig in M.insertWith (++) aSig [idx] m + +toMD4Map :: [Signature] -> M.Map Md4digest [Int] +toMD4Map = foldr f M.empty + where f sig m = let (mSig, _, idx) = sig in M.insertWith (++) mSig [idx] m recreate :: BL.ByteString -> Integer -> [Instruction] -> BL.ByteString recreate f0 blockSize ins =