From c049941529f6a657d8b56c6164b6407639804be2 Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@allmydata.com>
Date: Thu, 7 Jun 2007 21:47:21 -0700
Subject: [PATCH] move almost all hashing to SHA256, consolidate into
 hashutil.py

The only SHA-1 hash that remains is the one used to permute nodeids,
where we still need to decide whether we care more about performance or
about long-term security. I suspect we could use a much weaker (and
faster) hash for this purpose: in the long run we'll be computing
thousands of these hashes for each file uploaded or downloaded (one per
known peer).
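
For reference, here is a minimal sketch of the tagged-hash pattern that
hashutil.py now centralizes. It is illustrative only: hashlib.sha256
stands in for allmydata.Crypto.Hash.SHA256 (which the tree actually
uses), and the helper names mirror the new API without claiming to be
the exact implementation.

    # Illustrative sketch, not the shipped code.
    import hashlib

    def netstring(s):
        # length-prefixed encoding, e.g. "3:abc,"
        return "%d:%s," % (len(s), s)

    def tagged_hasher(tag):
        # incremental hasher, pre-seeded with the netstring-encoded tag
        h = hashlib.sha256()
        h.update(netstring(tag))
        return h

    def tagged_hash(tag, data):
        # one-shot convenience wrapper around tagged_hasher()
        h = tagged_hasher(tag)
        h.update(data)
        return h.digest()  # 32-byte digest

    # per-purpose helpers follow the same pattern, e.g.:
    def fileid_hasher():
        return tagged_hasher("allmydata_fileid_v1")

The one-shot and incremental forms yield the same 32-byte digest for the
same tag and input, and the move from SHA-1 to SHA-256 is why the
verifierid/fileid/storage_index length asserts below change from 20 to
32.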
---
 src/allmydata/download.py         | 19 +++++++--------
 src/allmydata/encode.py           | 17 +++++++------
 src/allmydata/hashtree.py         |  6 -----
 src/allmydata/interfaces.py       |  5 ++--
 src/allmydata/test/test_encode.py |  7 +++---
 src/allmydata/test/test_system.py |  2 +-
 src/allmydata/test/test_upload.py |  2 +-
 src/allmydata/upload.py           | 14 +++++------
 src/allmydata/uri.py              |  2 +-
 src/allmydata/util/hashutil.py    | 40 +++++++++++++++++++++++++++++++
 src/allmydata/workqueue.py        |  7 +++---
 11 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/src/allmydata/download.py b/src/allmydata/download.py
index 79bd2e72..1d8729cf 100644
--- a/src/allmydata/download.py
+++ b/src/allmydata/download.py
@@ -1,15 +1,14 @@
 
-import os, random, sha
+import os, random
 from zope.interface import implements
 from twisted.python import log
 from twisted.internet import defer
 from twisted.application import service
 
-from allmydata.util import idlib, mathutil, bencode
+from allmydata.util import idlib, mathutil, bencode, hashutil
 from allmydata.util.assertutil import _assert
 from allmydata import codec, hashtree
 from allmydata.Crypto.Cipher import AES
-from allmydata.Crypto.Hash import SHA256
 from allmydata.uri import unpack_uri
 from allmydata.interfaces import IDownloadTarget, IDownloader
 
@@ -32,8 +31,8 @@ class Output:
         self.downloadable = downloadable
         self._decryptor = AES.new(key=key, mode=AES.MODE_CTR,
                                   counterstart="\x00"*16)
-        self._verifierid_hasher = sha.new(netstring("allmydata_verifierid_v1"))
-        self._fileid_hasher = sha.new(netstring("allmydata_fileid_v1"))
+        self._verifierid_hasher = hashutil.verifierid_hasher()
+        self._fileid_hasher = hashutil.fileid_hasher()
         self.length = 0
         self._segment_number = 0
         self._plaintext_hash_tree = None
@@ -54,7 +53,7 @@ class Output:
         # 2*segment_size.
         self._verifierid_hasher.update(crypttext)
         if self._crypttext_hash_tree:
-            ch = SHA256.new(netstring("allmydata_crypttext_segment_v1"))
+            ch = hashutil.crypttext_segment_hasher()
             ch.update(crypttext)
             crypttext_leaves = {self._segment_number: ch.digest()}
             self._crypttext_hash_tree.set_hashes(leaves=crypttext_leaves)
@@ -66,7 +65,7 @@ class Output:
 
         self._fileid_hasher.update(plaintext)
         if self._plaintext_hash_tree:
-            ph = SHA256.new(netstring("allmydata_plaintext_segment_v1"))
+            ph = hashutil.plaintext_segment_hasher()
             ph.update(plaintext)
             plaintext_leaves = {self._segment_number: ph.digest()}
             self._plaintext_hash_tree.set_hashes(leaves=plaintext_leaves)
@@ -140,7 +139,7 @@ class ValidatedBucket:
 
             #log.msg("checking block_hash(shareid=%d, blocknum=%d) len=%d" %
             #        (self.sharenum, blocknum, len(blockdata)))
-            blockhash = hashtree.block_hash(blockdata)
+            blockhash = hashutil.block_hash(blockdata)
             # we always validate the blockhash
             bh = dict(enumerate(blockhashes))
             # replace blockhash root with validated value
@@ -350,7 +349,7 @@ class FileDownloader:
         # comes back, and compare it against the version in our URI. If they
         # don't match, ignore their data and try someone else.
         def _validate(proposal, bucket):
-            h = hashtree.thingA_hash(proposal)
+            h = hashutil.thingA_hash(proposal)
             if h != self._thingA_hash:
                 self._fetch_failures["thingA"] += 1
                 msg = ("The copy of thingA we received from %s was bad" %
@@ -392,7 +391,7 @@ class FileDownloader:
 
         verifierid = d['verifierid']
         assert isinstance(verifierid, str)
-        assert len(verifierid) == 20
+        assert len(verifierid) == 32
         self._verifierid = verifierid
         self._fileid = d['fileid']
         self._roothash = d['share_root_hash']
diff --git a/src/allmydata/encode.py b/src/allmydata/encode.py
index 7879955e..b905c466 100644
--- a/src/allmydata/encode.py
+++ b/src/allmydata/encode.py
@@ -3,10 +3,9 @@
 from zope.interface import implements
 from twisted.internet import defer
 from twisted.python import log
-from allmydata.hashtree import HashTree, block_hash, thingA_hash
+from allmydata.hashtree import HashTree
 from allmydata.Crypto.Cipher import AES
-from allmydata.Crypto.Hash import SHA256
-from allmydata.util import mathutil, bencode
+from allmydata.util import mathutil, bencode, hashutil
 from allmydata.util.assertutil import _assert
 from allmydata.codec import CRSEncoder
 from allmydata.interfaces import IEncoder
@@ -224,8 +223,8 @@ class Encoder(object):
         # of additional shares which can be substituted if the primary ones
         # are unavailable
 
-        plaintext_hasher = SHA256.new(netstring("allmydata_plaintext_segment_v1"))
-        crypttext_hasher = SHA256.new(netstring("allmydata_crypttext_segment_v1"))
+        plaintext_hasher = hashutil.plaintext_segment_hasher()
+        crypttext_hasher = hashutil.crypttext_segment_hasher()
 
         # memory footprint: we only hold a tiny piece of the plaintext at any
         # given time. We build up a segment's worth of cryptttext, then hand
@@ -258,8 +257,8 @@ class Encoder(object):
         codec = self._tail_codec
         input_piece_size = codec.get_block_size()
 
-        plaintext_hasher = SHA256.new(netstring("allmydata_plaintext_segment_v1"))
-        crypttext_hasher = SHA256.new(netstring("allmydata_crypttext_segment_v1"))
+        plaintext_hasher = hashutil.plaintext_segment_hasher()
+        crypttext_hasher = hashutil.crypttext_segment_hasher()
 
         for i in range(self.required_shares):
             input_piece = self.infile.read(input_piece_size)
@@ -297,7 +296,7 @@ class Encoder(object):
             shareid = shareids[i]
             d = self.send_subshare(shareid, segnum, subshare)
             dl.append(d)
-            subshare_hash = block_hash(subshare)
+            subshare_hash = hashutil.block_hash(subshare)
             self.subshare_hashes[shareid].append(subshare_hash)
         dl = self._gather_responses(dl)
         def _logit(res):
@@ -437,7 +436,7 @@ class Encoder(object):
     def send_thingA_to_all_shareholders(self):
         log.msg("%s: sending thingA" % self)
         thingA = bencode.bencode(self.thingA_data)
-        self.thingA_hash = thingA_hash(thingA)
+        self.thingA_hash = hashutil.thingA_hash(thingA)
         dl = []
         for shareid in self.landlords.keys():
             dl.append(self.send_thingA(shareid, thingA))
diff --git a/src/allmydata/hashtree.py b/src/allmydata/hashtree.py
index a3b1c5b4..f1a839f6 100644
--- a/src/allmydata/hashtree.py
+++ b/src/allmydata/hashtree.py
@@ -446,9 +446,3 @@ class IncompleteHashTree(CompleteBinaryTreeMixin, list):
             for i in added:
                 self[i] = None
             raise
-
-def block_hash(data):
-    return tagged_hash("encoded subshare", data)
-
-def thingA_hash(data):
-    return tagged_hash("thingA", data)
diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py
index ce2cbe84..86c95bdb 100644
--- a/src/allmydata/interfaces.py
+++ b/src/allmydata/interfaces.py
@@ -11,7 +11,6 @@ Hash = StringConstraint(maxLength=HASH_SIZE,
 Nodeid = StringConstraint(maxLength=20,
                           minLength=20) # binary format 20-byte SHA1 hash
 FURL = StringConstraint(1000)
-Verifierid = StringConstraint(20)
 StorageIndex = StringConstraint(32)
 URI = StringConstraint(300) # kind of arbitrary
 MAX_BUCKETS = 200  # per peer
@@ -121,12 +120,12 @@ RIMutableDirectoryNode_ = Any() # TODO: how can we avoid this?
 class RIMutableDirectoryNode(RemoteInterface):
     def list():
         return ListOf( TupleOf(str, # name, relative to directory
-                               ChoiceOf(RIMutableDirectoryNode_, Verifierid)),
+                               ChoiceOf(RIMutableDirectoryNode_, URI)),
                        maxLength=100,
                        )
 
     def get(name=str):
-        return ChoiceOf(RIMutableDirectoryNode_, Verifierid)
+        return ChoiceOf(RIMutableDirectoryNode_, URI)
 
     def add_directory(name=str):
         return RIMutableDirectoryNode_
diff --git a/src/allmydata/test/test_encode.py b/src/allmydata/test/test_encode.py
index fd5bfa4d..07f0f058 100644
--- a/src/allmydata/test/test_encode.py
+++ b/src/allmydata/test/test_encode.py
@@ -7,7 +7,6 @@ from allmydata import encode, download, hashtree
 from allmydata.util import hashutil
 from allmydata.uri import pack_uri
 from allmydata.Crypto.Cipher import AES
-import sha
 from cStringIO import StringIO
 
 def netstring(s):
@@ -300,11 +299,11 @@ class Roundtrip(unittest.TestCase):
             peer = FakeBucketWriter(mode)
             shareholders[shnum] = peer
         e.set_shareholders(shareholders)
-        fileid_hasher = sha.new(netstring("allmydata_fileid_v1"))
+        fileid_hasher = hashutil.fileid_hasher()
         fileid_hasher.update(data)
         cryptor = AES.new(key=nonkey, mode=AES.MODE_CTR,
                           counterstart="\x00"*16)
-        verifierid_hasher = sha.new(netstring("allmydata_verifierid_v1"))
+        verifierid_hasher = hashutil.verifierid_hasher()
         verifierid_hasher.update(cryptor.encrypt(data))
 
         e.set_thingA_data({'verifierid': verifierid_hasher.digest(),
@@ -322,7 +321,7 @@ class Roundtrip(unittest.TestCase):
         if "corrupt_key" in recover_mode:
             key = flip_bit(key)
 
-        URI = pack_uri(storage_index="S" * 20,
+        URI = pack_uri(storage_index="S" * 32,
                        key=key,
                        thingA_hash=thingA_hash,
                        needed_shares=e.required_shares,
diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py
index 868b3ad1..7a97b00c 100644
--- a/src/allmydata/test/test_system.py
+++ b/src/allmydata/test/test_system.py
@@ -216,7 +216,7 @@ class SystemTest(testutil.SignalMixin, unittest.TestCase):
         # change the storage index, which means we'll be asking about the
         # wrong file, so nobody will have any shares
         d = uri.unpack_uri(gooduri)
-        assert len(d['storage_index']) == 20
+        assert len(d['storage_index']) == 32
         d['storage_index'] = self.flip_bit(d['storage_index'])
         return uri.pack_uri(**d)
 
diff --git a/src/allmydata/test/test_upload.py b/src/allmydata/test/test_upload.py
index 37386ceb..d5b20c1c 100644
--- a/src/allmydata/test/test_upload.py
+++ b/src/allmydata/test/test_upload.py
@@ -26,7 +26,7 @@ class GoodServer(unittest.TestCase):
         self.failUnless(uri.startswith("URI:"))
         d = unpack_uri(uri)
         self.failUnless(isinstance(d['storage_index'], str))
-        self.failUnlessEqual(len(d['storage_index']), 20)
+        self.failUnlessEqual(len(d['storage_index']), 32)
         self.failUnless(isinstance(d['key'], str))
         self.failUnlessEqual(len(d['key']), 16)
 
diff --git a/src/allmydata/upload.py b/src/allmydata/upload.py
index 51d40896..b63b1a39 100644
--- a/src/allmydata/upload.py
+++ b/src/allmydata/upload.py
@@ -4,14 +4,14 @@ from twisted.internet import defer
 from twisted.application import service
 from foolscap import Referenceable
 
-from allmydata.util import idlib
+from allmydata.util import idlib, hashutil
 from allmydata import encode
 from allmydata.uri import pack_uri
 from allmydata.interfaces import IUploadable, IUploader
 from allmydata.Crypto.Cipher import AES
 
 from cStringIO import StringIO
-import collections, random, sha
+import collections, random
 
 class NotEnoughPeersError(Exception):
     pass
@@ -75,10 +75,10 @@ class FileUploader:
 
     def set_id_strings(self, verifierid, fileid):
         assert isinstance(verifierid, str)
-        assert len(verifierid) == 20
+        assert len(verifierid) == 32
         self._verifierid = verifierid
         assert isinstance(fileid, str)
-        assert len(fileid) == 20
+        assert len(fileid) == 32
         self._fileid = fileid
 
     def set_encryption_key(self, key):
@@ -298,8 +298,8 @@ class Uploader(service.MultiService):
 
     def compute_id_strings(self, f):
         # return a list of (fileid, encryptionkey, verifierid)
-        fileid_hasher = sha.new(netstring("allmydata_fileid_v1"))
-        enckey_hasher = sha.new(netstring("allmydata_encryption_key_v1"))
+        fileid_hasher = hashutil.fileid_hasher()
+        enckey_hasher = hashutil.key_hasher()
         f.seek(0)
         BLOCKSIZE = 64*1024
         while True:
@@ -313,7 +313,7 @@ class Uploader(service.MultiService):
 
         # now make a second pass to determine the verifierid. It would be
         # nice to make this involve fewer passes.
-        verifierid_hasher = sha.new(netstring("allmydata_verifierid_v1"))
+        verifierid_hasher = hashutil.verifierid_hasher()
         key = enckey[:16]
         cryptor = AES.new(key=key, mode=AES.MODE_CTR,
                           counterstart="\x00"*16)
diff --git a/src/allmydata/uri.py b/src/allmydata/uri.py
index 9cfce767..9a602fae 100644
--- a/src/allmydata/uri.py
+++ b/src/allmydata/uri.py
@@ -9,7 +9,7 @@ def pack_uri(storage_index, key, thingA_hash,
              needed_shares, total_shares, size):
     # applications should pass keyword parameters into this
     assert isinstance(storage_index, str)
-    assert len(storage_index) == 20 # sha1 hash. TODO: sha256
+    assert len(storage_index) == 32 # sha256 hash
 
     assert isinstance(thingA_hash, str)
     assert len(thingA_hash) == 32 # sha56 hash
diff --git a/src/allmydata/util/hashutil.py b/src/allmydata/util/hashutil.py
index ccadccf0..c85ec238 100644
--- a/src/allmydata/util/hashutil.py
+++ b/src/allmydata/util/hashutil.py
@@ -16,3 +16,43 @@ def tagged_pair_hash(tag, val1, val2):
     s.update(netstring(val2))
     return s.digest()
 
+# specific hash tags that we use
+
+def tagged_hasher(tag):
+    return SHA256.new(netstring(tag))
+
+def block_hash(data):
+    return tagged_hash("allmydata_encoded_subshare_v1", data)
+def block_hasher():
+    return tagged_hasher("allmydata_encoded_subshare_v1")
+
+def thingA_hash(data):
+    return tagged_hash("thingA", data)
+def thingA_hasher():
+    return tagged_hasher("thingA")
+
+def fileid_hash(data):
+    return tagged_hash("allmydata_fileid_v1", data)
+def fileid_hasher():
+    return tagged_hasher("allmydata_fileid_v1")
+
+def verifierid_hash(data):
+    return tagged_hash("allmydata_verifierid_v1", data)
+def verifierid_hasher():
+    return tagged_hasher("allmydata_verifierid_v1")
+
+def crypttext_segment_hash(data):
+    return tagged_hash("allmydata_crypttext_segment_v1", data)
+def crypttext_segment_hasher():
+    return tagged_hasher("allmydata_crypttext_segment_v1")
+
+def plaintext_segment_hash(data):
+    return tagged_hash("allmydata_plaintext_segment_v1", data)
+def plaintext_segment_hasher():
+    return tagged_hasher("allmydata_plaintext_segment_v1")
+
+def key_hash(data):
+    return tagged_hash("allmydata_encryption_key_v1", data)
+def key_hasher():
+    return tagged_hasher("allmydata_encryption_key_v1")
+
diff --git a/src/allmydata/workqueue.py b/src/allmydata/workqueue.py
index 1ad47a9e..4e28e9db 100644
--- a/src/allmydata/workqueue.py
+++ b/src/allmydata/workqueue.py
@@ -1,10 +1,11 @@
 
-import os, shutil, sha
+import os, shutil
 from zope.interface import implements
 from twisted.internet import defer
 from allmydata.util import bencode
 from allmydata.util.idlib import b2a
 from allmydata.Crypto.Cipher import AES
+from allmydata.Crypto.Hash import SHA256
 from allmydata.filetree.nodemaker import NodeMaker
 from allmydata.filetree.interfaces import INode
 from allmydata.filetree.file import CHKFileNode
@@ -382,9 +383,9 @@ def make_aes_key():
 def make_rsa_key():
     raise NotImplementedError
 def hash_sha(data):
-    return sha.new(data).digest()
+    return SHA256.new(data).digest()
 def hash_sha_to_key(data):
-    return sha.new(data).digest()[:AES_KEY_LENGTH]
+    return SHA256.new(data).digest()[:AES_KEY_LENGTH]
 def aes_encrypt(key, plaintext):
     assert isinstance(key, str)
     assert len(key) == AES_KEY_LENGTH
-- 
2.45.2