From 0c12b50c31b904bb3ff6509940ef9f9aef4ea7c8 Mon Sep 17 00:00:00 2001
From: Ramakrishnan Muthukrishnan <ram@rkrishnan.org>
Date: Mon, 28 Dec 2015 08:34:09 +0530
Subject: [PATCH] closer to the rsync algorithm technical paper

---
 app/Main.hs    |  7 +++---
 hs-rsync.cabal |  3 +++
 src/Lib.hs     | 68 +++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/app/Main.hs b/app/Main.hs
index 0393ce4..f87a0fa 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -9,12 +9,13 @@ main :: IO ()
 main = do
   let fileRemote    = "fasljlajljalfjlajfasdjkg;fdk;kqpitpk;k;asdk;kg;adskg"
       fileLocal     = "ljljalsjdgljadslfjlasjdfqporiuqplsadljfaljdf"
+      blockSize     = 5
       -- generate signatures at block boundaries for the local file and send it to remote.
-      fileLocalSigs = fileSignatures (BL.fromStrict (BS.pack fileLocal)) 5
+      fileLocalSigs = fileSignatures (BL.fromStrict (BS.pack fileLocal)) blockSize
       -- at remote, take the signatures from the other size and generate instructions.
-      insns         = genInstructions fileLocalSigs 5 (BL.fromStrict (BS.pack fileRemote))
+      insns         = genInstructions fileLocalSigs blockSize (BL.fromStrict (BS.pack fileRemote))
       -- at the local side, take those instructions and apply to fileLocal
-      fileLocalNew  = recreate (BL.fromStrict (BS.pack fileLocal)) 5 insns
+      fileLocalNew  = recreate (BL.fromStrict (BS.pack fileLocal)) blockSize insns
   putStrLn $ "remote: " ++ fileRemote
   putStrLn $ "local:  " ++ fileLocal
   BS.putStrLn $ (BS.pack "recreated: ") `BS.append` (BL.toStrict fileLocalNew)
diff --git a/hs-rsync.cabal b/hs-rsync.cabal
index c7394d9..14ac841 100644
--- a/hs-rsync.cabal
+++ b/hs-rsync.cabal
@@ -18,7 +18,10 @@ library
   exposed-modules:     Lib
   build-depends:       base >= 4.7 && < 5
                      , bytestring
+                     , containers
                      , cryptohash
+                     , digest
+                     , mtl
   default-language:    Haskell2010
 
 executable hs-rsync-exe
diff --git a/src/Lib.hs b/src/Lib.hs
index 8cfb325..693cc19 100644
--- a/src/Lib.hs
+++ b/src/Lib.hs
@@ -5,15 +5,25 @@ module Lib
     , recreate
     ) where
 
+import Control.Monad.State
+import Data.Bits (shiftL, (.&.), (.|.))
 import qualified Data.ByteString.Char8 as BS
 import qualified Data.ByteString.Lazy as BL
-import Data.Word (Word8)
-import qualified Crypto.Hash.SHA1 as SHA1
+import Data.Char (ord)
+import Data.Digest.Adler32 (adler32)
+import qualified Data.Map as M
+import Data.Word (Word8, Word32)
+import qualified Crypto.Hash.MD4 as MD4
 
-type Signature = (BS.ByteString, Int)
+type Md4digest       = BS.ByteString
+type Adler32checksum = Word32
+
+type Signature     = (Md4digest, Adler32checksum, Int)
 
 fileSignatures :: BL.ByteString -> Integer -> [Signature]
-fileSignatures bs blockSize = zip (map blockSig (splitBS bs blockSize)) [0..]
+fileSignatures bs blockSize = zip3 strongsigs weaksigs [0..]
+  where strongsigs = map blockSig (splitBS bs blockSize)
+        weaksigs   = map adler32 (splitBS bs blockSize)
 
 splitBS :: BL.ByteString -> Integer -> [BL.ByteString]
 splitBS bs blockSize | fromIntegral (BL.length bs) < blockSize = [bs]
@@ -21,8 +31,12 @@ splitBS bs blockSize | otherwise =
                          (BL.take (fromIntegral blockSize) bs) :
                          splitBS (BL.drop (fromIntegral blockSize) bs) blockSize
 
+-- compute md4 digest (128 bits)
 blockSig :: BL.ByteString -> BS.ByteString
-blockSig = SHA1.hash . BL.toStrict
+blockSig = MD4.hash . BL.toStrict
+
+weakSig :: BL.ByteString -> Adler32checksum
+weakSig = adler32
 
 data Instruction = RChar Word8
                  | RBlk  Int
@@ -30,16 +44,40 @@ data Instruction = RChar Word8
 
 genInstructions :: [Signature] -> Integer -> BL.ByteString -> [Instruction]
 genInstructions f0sigs blockSize fnew =
-  if (fnew == BL.empty)
-  then []
-  else
-    let (blk, blks) = BL.splitAt (fromIntegral blockSize) fnew
-        sig         = blockSig blk
-    in
-      case (lookup sig f0sigs) of
-        Just (idx) -> RBlk (fromIntegral idx) : genInstructions f0sigs blockSize blks
-        Nothing    -> RChar (BL.head blk) :
-          genInstructions f0sigs blockSize (BL.tail (blk `mappend` blks))
+    -- create two hash tables one with adler32 as key and list of block numbers as values
+    -- another with md4sum as key and block numbers as values.
+  evalState (go fnew) 0
+  where
+    go :: BL.ByteString -> State Word32 [Instruction]
+    go fnew | (fnew == BL.empty) = return []
+            | otherwise =
+                let (blk, blks) = BL.splitAt (fromIntegral blockSize) fnew
+                    adlerSum    = weakSig blk
+                in
+                  case M.lookup adlerSum f0AdlerTable of
+                    Nothing -> do
+                      put adlerSum
+                      is <- go (BL.tail (blk `mappend` blks))
+                      return $ RChar (BL.head blk) : is
+                    Just _ ->
+                      let md4sum = blockSig blk
+                      in
+                        case M.lookup md4sum f0MD4Table of
+                          Just i -> do
+                            is <- go blks
+                            return $ RBlk (head i) : is
+                          Nothing ->
+                            return [RChar (BL.head blk)]
+    f0AdlerTable = toAdlerMap f0sigs
+    f0MD4Table   = toMD4Map f0sigs
+
+toAdlerMap :: [Signature] -> M.Map Adler32checksum [Int]
+toAdlerMap = foldr f M.empty
+  where f sig m = let (_, aSig, idx) = sig in M.insertWith (++) aSig [idx] m
+
+toMD4Map :: [Signature] -> M.Map Md4digest [Int]
+toMD4Map = foldr f M.empty
+  where f sig m = let (mSig, _, idx) = sig in M.insertWith (++) mSig [idx] m
 
 recreate :: BL.ByteString -> Integer -> [Instruction] -> BL.ByteString
 recreate f0 blockSize ins =
-- 
2.45.2