From: Brian Warner Date: Fri, 8 Jun 2007 04:47:21 +0000 (-0700) Subject: move almost all hashing to SHA256, consolidate into hashutil.py X-Git-Tag: allmydata-tahoe-0.3.0~12 X-Git-Url: https://git.rkrishnan.org/components/%22news.html/reliability?a=commitdiff_plain;h=c049941529f6a657d8b56c6164b6407639804be2;p=tahoe-lafs%2Ftahoe-lafs.git move almost all hashing to SHA256, consolidate into hashutil.py The only SHA-1 hash that remains is used in the permutation of nodeids, where we need to decide if we care about performance or long-term security. I suspect that we could use a much weaker (and faster) hash for this purpose. In the long run, we'll be doing thousands of such hashes for each file uploaded or downloaded (one per known peer). --- diff --git a/src/allmydata/download.py b/src/allmydata/download.py index 79bd2e72..1d8729cf 100644 --- a/src/allmydata/download.py +++ b/src/allmydata/download.py @@ -1,15 +1,14 @@ -import os, random, sha +import os, random from zope.interface import implements from twisted.python import log from twisted.internet import defer from twisted.application import service -from allmydata.util import idlib, mathutil, bencode +from allmydata.util import idlib, mathutil, bencode, hashutil from allmydata.util.assertutil import _assert from allmydata import codec, hashtree from allmydata.Crypto.Cipher import AES -from allmydata.Crypto.Hash import SHA256 from allmydata.uri import unpack_uri from allmydata.interfaces import IDownloadTarget, IDownloader @@ -32,8 +31,8 @@ class Output: self.downloadable = downloadable self._decryptor = AES.new(key=key, mode=AES.MODE_CTR, counterstart="\x00"*16) - self._verifierid_hasher = sha.new(netstring("allmydata_verifierid_v1")) - self._fileid_hasher = sha.new(netstring("allmydata_fileid_v1")) + self._verifierid_hasher = hashutil.verifierid_hasher() + self._fileid_hasher = hashutil.fileid_hasher() self.length = 0 self._segment_number = 0 self._plaintext_hash_tree = None @@ -54,7 +53,7 @@ class Output: # 
2*segment_size. self._verifierid_hasher.update(crypttext) if self._crypttext_hash_tree: - ch = SHA256.new(netstring("allmydata_crypttext_segment_v1")) + ch = hashutil.crypttext_segment_hasher() ch.update(crypttext) crypttext_leaves = {self._segment_number: ch.digest()} self._crypttext_hash_tree.set_hashes(leaves=crypttext_leaves) @@ -66,7 +65,7 @@ class Output: self._fileid_hasher.update(plaintext) if self._plaintext_hash_tree: - ph = SHA256.new(netstring("allmydata_plaintext_segment_v1")) + ph = hashutil.plaintext_segment_hasher() ph.update(plaintext) plaintext_leaves = {self._segment_number: ph.digest()} self._plaintext_hash_tree.set_hashes(leaves=plaintext_leaves) @@ -140,7 +139,7 @@ class ValidatedBucket: #log.msg("checking block_hash(shareid=%d, blocknum=%d) len=%d" % # (self.sharenum, blocknum, len(blockdata))) - blockhash = hashtree.block_hash(blockdata) + blockhash = hashutil.block_hash(blockdata) # we always validate the blockhash bh = dict(enumerate(blockhashes)) # replace blockhash root with validated value @@ -350,7 +349,7 @@ class FileDownloader: # comes back, and compare it against the version in our URI. If they # don't match, ignore their data and try someone else. 
def _validate(proposal, bucket): - h = hashtree.thingA_hash(proposal) + h = hashutil.thingA_hash(proposal) if h != self._thingA_hash: self._fetch_failures["thingA"] += 1 msg = ("The copy of thingA we received from %s was bad" % @@ -392,7 +391,7 @@ class FileDownloader: verifierid = d['verifierid'] assert isinstance(verifierid, str) - assert len(verifierid) == 20 + assert len(verifierid) == 32 self._verifierid = verifierid self._fileid = d['fileid'] self._roothash = d['share_root_hash'] diff --git a/src/allmydata/encode.py b/src/allmydata/encode.py index 7879955e..b905c466 100644 --- a/src/allmydata/encode.py +++ b/src/allmydata/encode.py @@ -3,10 +3,9 @@ from zope.interface import implements from twisted.internet import defer from twisted.python import log -from allmydata.hashtree import HashTree, block_hash, thingA_hash +from allmydata.hashtree import HashTree from allmydata.Crypto.Cipher import AES -from allmydata.Crypto.Hash import SHA256 -from allmydata.util import mathutil, bencode +from allmydata.util import mathutil, bencode, hashutil from allmydata.util.assertutil import _assert from allmydata.codec import CRSEncoder from allmydata.interfaces import IEncoder @@ -224,8 +223,8 @@ class Encoder(object): # of additional shares which can be substituted if the primary ones # are unavailable - plaintext_hasher = SHA256.new(netstring("allmydata_plaintext_segment_v1")) - crypttext_hasher = SHA256.new(netstring("allmydata_crypttext_segment_v1")) + plaintext_hasher = hashutil.plaintext_segment_hasher() + crypttext_hasher = hashutil.crypttext_segment_hasher() # memory footprint: we only hold a tiny piece of the plaintext at any # given time. 
We build up a segment's worth of cryptttext, then hand @@ -258,8 +257,8 @@ class Encoder(object): codec = self._tail_codec input_piece_size = codec.get_block_size() - plaintext_hasher = SHA256.new(netstring("allmydata_plaintext_segment_v1")) - crypttext_hasher = SHA256.new(netstring("allmydata_crypttext_segment_v1")) + plaintext_hasher = hashutil.plaintext_segment_hasher() + crypttext_hasher = hashutil.crypttext_segment_hasher() for i in range(self.required_shares): input_piece = self.infile.read(input_piece_size) @@ -297,7 +296,7 @@ class Encoder(object): shareid = shareids[i] d = self.send_subshare(shareid, segnum, subshare) dl.append(d) - subshare_hash = block_hash(subshare) + subshare_hash = hashutil.block_hash(subshare) self.subshare_hashes[shareid].append(subshare_hash) dl = self._gather_responses(dl) def _logit(res): @@ -437,7 +436,7 @@ class Encoder(object): def send_thingA_to_all_shareholders(self): log.msg("%s: sending thingA" % self) thingA = bencode.bencode(self.thingA_data) - self.thingA_hash = thingA_hash(thingA) + self.thingA_hash = hashutil.thingA_hash(thingA) dl = [] for shareid in self.landlords.keys(): dl.append(self.send_thingA(shareid, thingA)) diff --git a/src/allmydata/hashtree.py b/src/allmydata/hashtree.py index a3b1c5b4..f1a839f6 100644 --- a/src/allmydata/hashtree.py +++ b/src/allmydata/hashtree.py @@ -446,9 +446,3 @@ class IncompleteHashTree(CompleteBinaryTreeMixin, list): for i in added: self[i] = None raise - -def block_hash(data): - return tagged_hash("encoded subshare", data) - -def thingA_hash(data): - return tagged_hash("thingA", data) diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py index ce2cbe84..86c95bdb 100644 --- a/src/allmydata/interfaces.py +++ b/src/allmydata/interfaces.py @@ -11,7 +11,6 @@ Hash = StringConstraint(maxLength=HASH_SIZE, Nodeid = StringConstraint(maxLength=20, minLength=20) # binary format 20-byte SHA1 hash FURL = StringConstraint(1000) -Verifierid = StringConstraint(20) StorageIndex = 
StringConstraint(32) URI = StringConstraint(300) # kind of arbitrary MAX_BUCKETS = 200 # per peer @@ -121,12 +120,12 @@ RIMutableDirectoryNode_ = Any() # TODO: how can we avoid this? class RIMutableDirectoryNode(RemoteInterface): def list(): return ListOf( TupleOf(str, # name, relative to directory - ChoiceOf(RIMutableDirectoryNode_, Verifierid)), + ChoiceOf(RIMutableDirectoryNode_, URI)), maxLength=100, ) def get(name=str): - return ChoiceOf(RIMutableDirectoryNode_, Verifierid) + return ChoiceOf(RIMutableDirectoryNode_, URI) def add_directory(name=str): return RIMutableDirectoryNode_ diff --git a/src/allmydata/test/test_encode.py b/src/allmydata/test/test_encode.py index fd5bfa4d..07f0f058 100644 --- a/src/allmydata/test/test_encode.py +++ b/src/allmydata/test/test_encode.py @@ -7,7 +7,6 @@ from allmydata import encode, download, hashtree from allmydata.util import hashutil from allmydata.uri import pack_uri from allmydata.Crypto.Cipher import AES -import sha from cStringIO import StringIO def netstring(s): @@ -300,11 +299,11 @@ class Roundtrip(unittest.TestCase): peer = FakeBucketWriter(mode) shareholders[shnum] = peer e.set_shareholders(shareholders) - fileid_hasher = sha.new(netstring("allmydata_fileid_v1")) + fileid_hasher = hashutil.fileid_hasher() fileid_hasher.update(data) cryptor = AES.new(key=nonkey, mode=AES.MODE_CTR, counterstart="\x00"*16) - verifierid_hasher = sha.new(netstring("allmydata_verifierid_v1")) + verifierid_hasher = hashutil.verifierid_hasher() verifierid_hasher.update(cryptor.encrypt(data)) e.set_thingA_data({'verifierid': verifierid_hasher.digest(), @@ -322,7 +321,7 @@ class Roundtrip(unittest.TestCase): if "corrupt_key" in recover_mode: key = flip_bit(key) - URI = pack_uri(storage_index="S" * 20, + URI = pack_uri(storage_index="S" * 32, key=key, thingA_hash=thingA_hash, needed_shares=e.required_shares, diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py index 868b3ad1..7a97b00c 100644 --- 
a/src/allmydata/test/test_system.py +++ b/src/allmydata/test/test_system.py @@ -216,7 +216,7 @@ class SystemTest(testutil.SignalMixin, unittest.TestCase): # change the storage index, which means we'll be asking about the # wrong file, so nobody will have any shares d = uri.unpack_uri(gooduri) - assert len(d['storage_index']) == 20 + assert len(d['storage_index']) == 32 d['storage_index'] = self.flip_bit(d['storage_index']) return uri.pack_uri(**d) diff --git a/src/allmydata/test/test_upload.py b/src/allmydata/test/test_upload.py index 37386ceb..d5b20c1c 100644 --- a/src/allmydata/test/test_upload.py +++ b/src/allmydata/test/test_upload.py @@ -26,7 +26,7 @@ class GoodServer(unittest.TestCase): self.failUnless(uri.startswith("URI:")) d = unpack_uri(uri) self.failUnless(isinstance(d['storage_index'], str)) - self.failUnlessEqual(len(d['storage_index']), 20) + self.failUnlessEqual(len(d['storage_index']), 32) self.failUnless(isinstance(d['key'], str)) self.failUnlessEqual(len(d['key']), 16) diff --git a/src/allmydata/upload.py b/src/allmydata/upload.py index 51d40896..b63b1a39 100644 --- a/src/allmydata/upload.py +++ b/src/allmydata/upload.py @@ -4,14 +4,14 @@ from twisted.internet import defer from twisted.application import service from foolscap import Referenceable -from allmydata.util import idlib +from allmydata.util import idlib, hashutil from allmydata import encode from allmydata.uri import pack_uri from allmydata.interfaces import IUploadable, IUploader from allmydata.Crypto.Cipher import AES from cStringIO import StringIO -import collections, random, sha +import collections, random class NotEnoughPeersError(Exception): pass @@ -75,10 +75,10 @@ class FileUploader: def set_id_strings(self, verifierid, fileid): assert isinstance(verifierid, str) - assert len(verifierid) == 20 + assert len(verifierid) == 32 self._verifierid = verifierid assert isinstance(fileid, str) - assert len(fileid) == 20 + assert len(fileid) == 32 self._fileid = fileid def 
set_encryption_key(self, key): @@ -298,8 +298,8 @@ class Uploader(service.MultiService): def compute_id_strings(self, f): # return a list of (fileid, encryptionkey, verifierid) - fileid_hasher = sha.new(netstring("allmydata_fileid_v1")) - enckey_hasher = sha.new(netstring("allmydata_encryption_key_v1")) + fileid_hasher = hashutil.fileid_hasher() + enckey_hasher = hashutil.key_hasher() f.seek(0) BLOCKSIZE = 64*1024 while True: @@ -313,7 +313,7 @@ class Uploader(service.MultiService): # now make a second pass to determine the verifierid. It would be # nice to make this involve fewer passes. - verifierid_hasher = sha.new(netstring("allmydata_verifierid_v1")) + verifierid_hasher = hashutil.verifierid_hasher() key = enckey[:16] cryptor = AES.new(key=key, mode=AES.MODE_CTR, counterstart="\x00"*16) diff --git a/src/allmydata/uri.py b/src/allmydata/uri.py index 9cfce767..9a602fae 100644 --- a/src/allmydata/uri.py +++ b/src/allmydata/uri.py @@ -9,7 +9,7 @@ def pack_uri(storage_index, key, thingA_hash, needed_shares, total_shares, size): # applications should pass keyword parameters into this assert isinstance(storage_index, str) - assert len(storage_index) == 20 # sha1 hash. 
TODO: sha256 + assert len(storage_index) == 32 # sha256 hash assert isinstance(thingA_hash, str) assert len(thingA_hash) == 32 # sha56 hash diff --git a/src/allmydata/util/hashutil.py b/src/allmydata/util/hashutil.py index ccadccf0..c85ec238 100644 --- a/src/allmydata/util/hashutil.py +++ b/src/allmydata/util/hashutil.py @@ -16,3 +16,43 @@ def tagged_pair_hash(tag, val1, val2): s.update(netstring(val2)) return s.digest() +# specific hash tags that we use + +def tagged_hasher(tag): + return SHA256.new(netstring(tag)) + +def block_hash(data): + return tagged_hash("allmydata_encoded_subshare_v1", data) +def block_hasher(): + return tagged_hasher("allmydata_encoded_subshare_v1") + +def thingA_hash(data): + return tagged_hash("thingA", data) +def thingA_hasher(): + return tagged_hasher("thingA") + +def fileid_hash(data): + return tagged_hash("allmydata_fileid_v1", data) +def fileid_hasher(): + return tagged_hasher("allmydata_fileid_v1") + +def verifierid_hash(data): + return tagged_hash("allmydata_verifierid_v1", data) +def verifierid_hasher(): + return tagged_hasher("allmydata_verifierid_v1") + +def crypttext_segment_hash(data): + return tagged_hash("allmydata_crypttext_segment_v1", data) +def crypttext_segment_hasher(): + return tagged_hasher("allmydata_crypttext_segment_v1") + +def plaintext_segment_hash(data): + return tagged_hash("allmydata_plaintext_segment_v1", data) +def plaintext_segment_hasher(): + return tagged_hasher("allmydata_plaintext_segment_v1") + +def key_hash(data): + return tagged_hash("allmydata_encryption_key_v1", data) +def key_hasher(): + return tagged_hasher("allmydata_encryption_key_v1") + diff --git a/src/allmydata/workqueue.py b/src/allmydata/workqueue.py index 1ad47a9e..4e28e9db 100644 --- a/src/allmydata/workqueue.py +++ b/src/allmydata/workqueue.py @@ -1,10 +1,11 @@ -import os, shutil, sha +import os, shutil from zope.interface import implements from twisted.internet import defer from allmydata.util import bencode from allmydata.util.idlib 
import b2a from allmydata.Crypto.Cipher import AES +from allmydata.Crypto.Hash import SHA256 from allmydata.filetree.nodemaker import NodeMaker from allmydata.filetree.interfaces import INode from allmydata.filetree.file import CHKFileNode @@ -382,9 +383,9 @@ def make_aes_key(): def make_rsa_key(): raise NotImplementedError def hash_sha(data): - return sha.new(data).digest() + return SHA256.new(data).digest() def hash_sha_to_key(data): - return sha.new(data).digest()[:AES_KEY_LENGTH] + return SHA256.new(data).digest()[:AES_KEY_LENGTH] def aes_encrypt(key, plaintext): assert isinstance(key, str) assert len(key) == AES_KEY_LENGTH