From: Brian Warner Date: Thu, 12 Apr 2007 20:13:25 +0000 (-0700) Subject: rename chunk.py to hashtree.py X-Git-Tag: tahoe_v0.1.0-0-UNSTABLE~110 X-Git-Url: https://git.rkrishnan.org/vdrive/%22news.html/$rel_link?a=commitdiff_plain;h=d8215e0c6f0ad7468db1003ece7e086669eaf841;p=tahoe-lafs%2Ftahoe-lafs.git rename chunk.py to hashtree.py --- diff --git a/src/allmydata/chunk.py b/src/allmydata/chunk.py deleted file mode 100644 index 3e4e09ed..00000000 --- a/src/allmydata/chunk.py +++ /dev/null @@ -1,380 +0,0 @@ -# -*- test-case-name: allmydata.test.test_hashtree -*- - -""" -Read and write chunks from files. - -Version 1.0.0. - -A file is divided into blocks, each of which has size L{BLOCK_SIZE} -(except for the last block, which may be smaller). Blocks are encoded -into chunks. One publishes the hash of the entire file. Clients -who want to download the file first obtain the hash, then the clients -can receive chunks in any order. Cryptographic hashing is used to -verify each received chunk before writing to disk. Thus it is -impossible to download corrupt data if one has the correct file hash. - -One obtains the hash of a complete file via -L{CompleteChunkFile.file_hash}. One can read chunks from a complete -file by the sequence operations of C{len()} and subscripting on a -L{CompleteChunkFile} object. One can open an empty or partially -downloaded file with L{PartialChunkFile}, and read and write chunks -to this file. A chunk will fail to write if its contents and index -are not consistent with the overall file hash passed to -L{PartialChunkFile} when the partial chunk file was first created. - -The chunks have an overhead of less than 4% for files of size -less than C{10**20} bytes. - -Benchmarks: - - - On a 3 GHz Pentium 3, it took 3.4 minutes to first make a - L{CompleteChunkFile} object for a 4 GB file. Up to 10 MB of - memory was used as the constructor ran. A metafile filename - was passed to the constructor, and so the hash information was - written to the metafile. The object used a negligible amount - of memory after the constructor was finished. - - Creation of L{CompleteChunkFile} objects in future runs of the - program took negligible time, since the hash information was - already stored in the metafile. - -@var BLOCK_SIZE: Size of a block. See L{BlockFile}. -@var MAX_CHUNK_SIZE: Upper bound on the size of a chunk. - See L{CompleteChunkFile}. - -free (adj.): unencumbered; not under the control of others -Written by Connelly Barnes in 2005 and released into the -public domain with no warranty of any kind, either expressed -or implied. It probably won't make your computer catch on fire, -or eat your children, but it might. Use at your own risk. -""" - -from allmydata.util import idlib -from allmydata.util.hashutil import tagged_hash, tagged_pair_hash - -__version__ = '1.0.0-allmydata' - -BLOCK_SIZE = 65536 -MAX_CHUNK_SIZE = BLOCK_SIZE + 4096 - -def roundup_pow2(x): - """ - Round integer C{x} up to the nearest power of 2. - """ - ans = 1 - while ans < x: - ans *= 2 - return ans - - -class CompleteBinaryTreeMixin: - """ - Adds convenience methods to a complete binary tree. - - Assumes the total number of elements in the binary tree may be - accessed via C{__len__}, and that each element can be retrieved - using list subscripting. - - Tree is indexed like so:: - - - 0 - / \ - 1 2 - / \ / \ - 3 4 5 6 - / \ / \ / \ / \ - 7 8 9 10 11 12 13 14 - - """ - def parent(self, i): - """ - Index of the parent of C{i}. - """ - if i < 1 or (hasattr(self, '__len__') and i >= len(self)): - raise IndexError('index out of range: ' + repr(i)) - return (i - 1) // 2 - - def lchild(self, i): - """ - Index of the left child of C{i}. - """ - ans = 2 * i + 1 - if i < 0 or (hasattr(self, '__len__') and ans >= len(self)): - raise IndexError('index out of range: ' + repr(i)) - return ans - - def rchild(self, i): - """ - Index of right child of C{i}. - """ - ans = 2 * i + 2 - if i < 0 or (hasattr(self, '__len__') and ans >= len(self)): - raise IndexError('index out of range: ' + repr(i)) - return ans - - def sibling(self, i): - """ - Index of sibling of C{i}. - """ - parent = self.parent(i) - if self.lchild(parent) == i: - return self.rchild(parent) - else: - return self.lchild(parent) - - def needed_for(self, i): - """ - Return a list of node indices that are necessary for the hash chain. - """ - if i < 0 or i >= len(self): - raise IndexError('index out of range: ' + repr(i)) - needed = [] - here = i - while here != 0: - needed.append(self.sibling(here)) - here = self.parent(here) - return needed - - def depth_first(self, i=0): - yield i, 0 - try: - for child,childdepth in self.depth_first(self.lchild(i)): - yield child, childdepth+1 - except IndexError: - pass - try: - for child,childdepth in self.depth_first(self.rchild(i)): - yield child, childdepth+1 - except IndexError: - pass - - def dump(self): - lines = [] - for i,depth in self.depth_first(): - lines.append("%s%3d: %s" % (" "*depth, i, idlib.b2a_or_none(self[i]))) - return "\n".join(lines) + "\n" - -def empty_leaf_hash(i): - return tagged_hash('Merkle tree empty leaf', "%d" % i) -def pair_hash(a, b): - return tagged_pair_hash('Merkle tree internal node', a, b) - -class HashTree(CompleteBinaryTreeMixin, list): - """ - Compute Merkle hashes at any node in a complete binary tree. - - Tree is indexed like so:: - - - 0 - / \ - 1 2 - / \ / \ - 3 4 5 6 - / \ / \ / \ / \ - 7 8 9 10 11 12 13 14 <- List passed to constructor. - - """ - def __init__(self, L): - """ - Create complete binary tree from list of hash strings. - - The list is augmented by hashes so its length is a power of 2, and - then this is used as the bottom row of the hash tree. - - The augmenting is done so that if the augmented element is at - index C{i}, then its value is C{hash(tagged_hash('Merkle tree empty leaf', '%d'%i))}. - """ - # Augment the list. - start = len(L) - end = roundup_pow2(len(L)) - L = L + [None] * (end - start) - for i in range(start, end): - L[i] = empty_leaf_hash(i) - # Form each row of the tree. - rows = [L] - while len(rows[-1]) != 1: - last = rows[-1] - rows += [[pair_hash(last[2*i], last[2*i+1]) - for i in xrange(len(last)//2)]] - # Flatten the list of rows into a single list. - rows.reverse() - self[:] = sum(rows, []) - - -class NotEnoughHashesError(Exception): - pass - -class BadHashError(Exception): - pass - -class IncompleteHashTree(CompleteBinaryTreeMixin, list): - """I am a hash tree which may or may not be complete. I can be used to - validate inbound data from some untrustworthy provider who has a subset of - leaves and a sufficient subset of internal nodes. - - Initially I am completely unpopulated. Over time, I will become filled with - hashes, just enough to validate particular leaf nodes. - - If you desire to validate leaf number N, first find out which hashes I need - by calling needed_hashes(N). This will return a list of node numbers (which - will nominally be the sibling chain between the given leaf and the root, - but if I already have some of those nodes, needed_hashes(N) will only - return a subset). Obtain these hashes from the data provider, then tell me - about them with set_hash(i, HASH). Once I have enough hashes, you can tell - me the hash of the leaf with set_leaf_hash(N, HASH), and I will either - return None or raise BadHashError. - - The first hash to be set will probably be 0 (the root hash), since this is - the one that will come from someone more trustworthy than the data - provider. - - """ - - def __init__(self, num_leaves): - L = [None] * num_leaves - start = len(L) - end = roundup_pow2(len(L)) - self.first_leaf_num = end - 1 - L = L + [None] * (end - start) - rows = [L] - while len(rows[-1]) != 1: - last = rows[-1] - rows += [[None for i in xrange(len(last)//2)]] - # Flatten the list of rows into a single list. - rows.reverse() - self[:] = sum(rows, []) - - def needed_hashes(self, hashes=[], leaves=[]): - hashnums = set(list(hashes)) - for leafnum in leaves: - hashnums.add(self.first_leaf_num + leafnum) - maybe_needed = set() - for hashnum in hashnums: - maybe_needed.update(self.needed_for(hashnum)) - maybe_needed.add(0) # need the root too - return set([i for i in maybe_needed if self[i] is None]) - - - def set_hashes(self, hashes={}, leaves={}, must_validate=False): - """Add a bunch of hashes to the tree. - - I will validate these to the best of my ability. If I already have a copy - of any of the new hashes, the new values must equal the existing ones, or - I will raise BadHashError. If adding a hash allows me to compute a parent - hash, those parent hashes must match or I will raise BadHashError. If I - raise BadHashError, I will forget about all the hashes that you tried to - add, leaving my state exactly the same as before I was called. If I - return successfully, I will remember all those hashes. - - If every hash that was added was validated, I will return True. If some - could not be validated because I did not have enough parent hashes, I - will return False. As a result, if I am called with both a leaf hash and - the root hash was already set, I will return True if and only if the leaf - hash could be validated against the root. - - If must_validate is True, I will raise NotEnoughHashesError instead of - returning False. If I raise NotEnoughHashesError, I will forget about all - the hashes that you tried to add. TODO: really? - - 'leaves' is a dictionary uses 'leaf index' values, which range from 0 - (the left-most leaf) to num_leaves-1 (the right-most leaf), and form the - base of the tree. 'hashes' uses 'hash_index' values, which range from 0 - (the root of the tree) to 2*num_leaves-2 (the right-most leaf). leaf[i] - is the same as hash[num_leaves-1+i]. - - The best way to use me is to obtain the root hash from some 'good' - channel, then call set_hash(0, root). Then use the 'bad' channel to - obtain data block 0 and the corresponding hash chain (a dict with the - same hashes that needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4, - 8:h8} when len(L)=8). Hash the data block to create leaf0. Then call:: - - good = iht.set_hashes(hashes=hashchain, leaves={0: leaf0}) - - If 'good' is True, the data block was valid. If 'good' is False, the - hashchain did not have the right blocks and we don't know whether the - data block was good or bad. If set_hashes() raises an exception, either - the data was corrupted or one of the received hashes was corrupted. - """ - - assert isinstance(hashes, dict) - assert isinstance(leaves, dict) - new_hashes = hashes.copy() - for leafnum,leafhash in leaves.iteritems(): - hashnum = self.first_leaf_num + leafnum - if hashnum in new_hashes: - assert new_hashes[hashnum] == leafhash - new_hashes[hashnum] = leafhash - - added = set() # we'll remove these if the check fails - - try: - # first we provisionally add all hashes to the tree, comparing any - # duplicates - for i in new_hashes: - if self[i]: - if self[i] != new_hashes[i]: - raise BadHashError("new hash does not match existing hash at [%d]" - % i) - else: - self[i] = new_hashes[i] - added.add(i) - - # then we start from the bottom and compute new parent hashes upwards, - # comparing any that already exist. When this phase ends, all nodes - # that have a sibling will also have a parent. - - hashes_to_check = list(new_hashes.keys()) - # leaf-most first means reverse sorted order - while hashes_to_check: - hashes_to_check.sort() - i = hashes_to_check.pop(-1) - if i == 0: - # The root has no sibling. How lonely. - continue - if self[self.sibling(i)] is None: - # without a sibling, we can't compute a parent - continue - parentnum = self.parent(i) - # make sure we know right from left - leftnum, rightnum = sorted([i, self.sibling(i)]) - new_parent_hash = pair_hash(self[leftnum], self[rightnum]) - if self[parentnum]: - if self[parentnum] != new_parent_hash: - raise BadHashError("h([%d]+[%d]) != h[%d]" % (leftnum, rightnum, - parentnum)) - else: - self[parentnum] = new_parent_hash - added.add(parentnum) - hashes_to_check.insert(0, parentnum) - - # then we walk downwards from the top (root), and anything that is - # reachable is validated. If any of the hashes that we've added are - # unreachable, then they are unvalidated. - - reachable = set() - if self[0]: - reachable.add(0) - # TODO: this could be done more efficiently, by starting from each - # element of new_hashes and walking upwards instead, remembering a set - # of validated nodes so that the searches for later new_hashes goes - # faster. This approach is O(n), whereas O(ln(n)) should be feasible. - for i in range(1, len(self)): - if self[i] and self.parent(i) in reachable: - reachable.add(i) - - # were we unable to validate any of the new hashes? - unvalidated = set(new_hashes.keys()) - reachable - if unvalidated: - if must_validate: - those = ",".join([str(i) for i in sorted(unvalidated)]) - raise NotEnoughHashesError("unable to validate hashes %s" % those) - - except (BadHashError, NotEnoughHashesError): - for i in added: - self[i] = None - raise - - # if there were hashes that could not be validated, we return False - return not unvalidated - diff --git a/src/allmydata/download.py b/src/allmydata/download.py index 3265184d..57ba5645 100644 --- a/src/allmydata/download.py +++ b/src/allmydata/download.py @@ -7,7 +7,7 @@ from twisted.application import service from allmydata.util import idlib, mathutil, hashutil from allmydata.util.assertutil import _assert -from allmydata import codec, chunk +from allmydata import codec, hashtree from allmydata.Crypto.Cipher import AES from allmydata.uri import unpack_uri from allmydata.interfaces import IDownloadTarget, IDownloader @@ -52,7 +52,7 @@ class ValidatedBucket: self.sharenum = sharenum self.bucket = bucket self.share_hash_tree = share_hash_tree - self.block_hash_tree = chunk.IncompleteHashTree(num_blocks) + self.block_hash_tree = hashtree.IncompleteHashTree(num_blocks) def get_block(self, blocknum): d1 = self.bucket.callRemote('get_block', blocknum) @@ -193,7 +193,7 @@ class FileDownloader: key = "\x00" * 16 self._output = Output(downloadable, key) - self._share_hashtree = chunk.IncompleteHashTree(total_shares) + self._share_hashtree = hashtree.IncompleteHashTree(total_shares) self._share_hashtree.set_hashes({0: roothash}) self.active_buckets = {} # k: shnum, v: bucket diff --git a/src/allmydata/encode.py b/src/allmydata/encode.py index 968adf9f..743e63b8 100644 --- a/src/allmydata/encode.py +++ b/src/allmydata/encode.py @@ -3,7 +3,7 @@ from zope.interface import implements from twisted.internet import defer from twisted.python import log -from allmydata.chunk import HashTree, roundup_pow2 +from allmydata.hashtree import HashTree, roundup_pow2 from allmydata.Crypto.Cipher import AES from allmydata.util import mathutil, hashutil from allmydata.util.assertutil import _assert diff --git a/src/allmydata/hashtree.py b/src/allmydata/hashtree.py new file mode 100644 index 00000000..3e4e09ed --- /dev/null +++ b/src/allmydata/hashtree.py @@ -0,0 +1,380 @@ +# -*- test-case-name: allmydata.test.test_hashtree -*- + +""" +Read and write chunks from files. + +Version 1.0.0. + +A file is divided into blocks, each of which has size L{BLOCK_SIZE} +(except for the last block, which may be smaller). Blocks are encoded +into chunks. One publishes the hash of the entire file. Clients +who want to download the file first obtain the hash, then the clients +can receive chunks in any order. Cryptographic hashing is used to +verify each received chunk before writing to disk. Thus it is +impossible to download corrupt data if one has the correct file hash. + +One obtains the hash of a complete file via +L{CompleteChunkFile.file_hash}. One can read chunks from a complete +file by the sequence operations of C{len()} and subscripting on a +L{CompleteChunkFile} object. One can open an empty or partially +downloaded file with L{PartialChunkFile}, and read and write chunks +to this file. A chunk will fail to write if its contents and index +are not consistent with the overall file hash passed to +L{PartialChunkFile} when the partial chunk file was first created. + +The chunks have an overhead of less than 4% for files of size +less than C{10**20} bytes. + +Benchmarks: + + - On a 3 GHz Pentium 3, it took 3.4 minutes to first make a + L{CompleteChunkFile} object for a 4 GB file. Up to 10 MB of + memory was used as the constructor ran. A metafile filename + was passed to the constructor, and so the hash information was + written to the metafile. The object used a negligible amount + of memory after the constructor was finished. + - Creation of L{CompleteChunkFile} objects in future runs of the + program took negligible time, since the hash information was + already stored in the metafile. + +@var BLOCK_SIZE: Size of a block. See L{BlockFile}. +@var MAX_CHUNK_SIZE: Upper bound on the size of a chunk. + See L{CompleteChunkFile}. + +free (adj.): unencumbered; not under the control of others +Written by Connelly Barnes in 2005 and released into the +public domain with no warranty of any kind, either expressed +or implied. It probably won't make your computer catch on fire, +or eat your children, but it might. Use at your own risk. +""" + +from allmydata.util import idlib +from allmydata.util.hashutil import tagged_hash, tagged_pair_hash + +__version__ = '1.0.0-allmydata' + +BLOCK_SIZE = 65536 +MAX_CHUNK_SIZE = BLOCK_SIZE + 4096 + +def roundup_pow2(x): + """ + Round integer C{x} up to the nearest power of 2. + """ + ans = 1 + while ans < x: + ans *= 2 + return ans + + +class CompleteBinaryTreeMixin: + """ + Adds convenience methods to a complete binary tree. + + Assumes the total number of elements in the binary tree may be + accessed via C{__len__}, and that each element can be retrieved + using list subscripting. + + Tree is indexed like so:: + + + 0 + / \ + 1 2 + / \ / \ + 3 4 5 6 + / \ / \ / \ / \ + 7 8 9 10 11 12 13 14 + + """ + def parent(self, i): + """ + Index of the parent of C{i}. + """ + if i < 1 or (hasattr(self, '__len__') and i >= len(self)): + raise IndexError('index out of range: ' + repr(i)) + return (i - 1) // 2 + + def lchild(self, i): + """ + Index of the left child of C{i}. + """ + ans = 2 * i + 1 + if i < 0 or (hasattr(self, '__len__') and ans >= len(self)): + raise IndexError('index out of range: ' + repr(i)) + return ans + + def rchild(self, i): + """ + Index of right child of C{i}. + """ + ans = 2 * i + 2 + if i < 0 or (hasattr(self, '__len__') and ans >= len(self)): + raise IndexError('index out of range: ' + repr(i)) + return ans + + def sibling(self, i): + """ + Index of sibling of C{i}. + """ + parent = self.parent(i) + if self.lchild(parent) == i: + return self.rchild(parent) + else: + return self.lchild(parent) + + def needed_for(self, i): + """ + Return a list of node indices that are necessary for the hash chain. + """ + if i < 0 or i >= len(self): + raise IndexError('index out of range: ' + repr(i)) + needed = [] + here = i + while here != 0: + needed.append(self.sibling(here)) + here = self.parent(here) + return needed + + def depth_first(self, i=0): + yield i, 0 + try: + for child,childdepth in self.depth_first(self.lchild(i)): + yield child, childdepth+1 + except IndexError: + pass + try: + for child,childdepth in self.depth_first(self.rchild(i)): + yield child, childdepth+1 + except IndexError: + pass + + def dump(self): + lines = [] + for i,depth in self.depth_first(): + lines.append("%s%3d: %s" % (" "*depth, i, idlib.b2a_or_none(self[i]))) + return "\n".join(lines) + "\n" + +def empty_leaf_hash(i): + return tagged_hash('Merkle tree empty leaf', "%d" % i) +def pair_hash(a, b): + return tagged_pair_hash('Merkle tree internal node', a, b) + +class HashTree(CompleteBinaryTreeMixin, list): + """ + Compute Merkle hashes at any node in a complete binary tree. + + Tree is indexed like so:: + + + 0 + / \ + 1 2 + / \ / \ + 3 4 5 6 + / \ / \ / \ / \ + 7 8 9 10 11 12 13 14 <- List passed to constructor. + + """ + def __init__(self, L): + """ + Create complete binary tree from list of hash strings. + + The list is augmented by hashes so its length is a power of 2, and + then this is used as the bottom row of the hash tree. + + The augmenting is done so that if the augmented element is at + index C{i}, then its value is C{hash(tagged_hash('Merkle tree empty leaf', '%d'%i))}. + """ + # Augment the list. + start = len(L) + end = roundup_pow2(len(L)) + L = L + [None] * (end - start) + for i in range(start, end): + L[i] = empty_leaf_hash(i) + # Form each row of the tree. + rows = [L] + while len(rows[-1]) != 1: + last = rows[-1] + rows += [[pair_hash(last[2*i], last[2*i+1]) + for i in xrange(len(last)//2)]] + # Flatten the list of rows into a single list. + rows.reverse() + self[:] = sum(rows, []) + + +class NotEnoughHashesError(Exception): + pass + +class BadHashError(Exception): + pass + +class IncompleteHashTree(CompleteBinaryTreeMixin, list): + """I am a hash tree which may or may not be complete. I can be used to + validate inbound data from some untrustworthy provider who has a subset of + leaves and a sufficient subset of internal nodes. + + Initially I am completely unpopulated. Over time, I will become filled with + hashes, just enough to validate particular leaf nodes. + + If you desire to validate leaf number N, first find out which hashes I need + by calling needed_hashes(N). This will return a list of node numbers (which + will nominally be the sibling chain between the given leaf and the root, + but if I already have some of those nodes, needed_hashes(N) will only + return a subset). Obtain these hashes from the data provider, then tell me + about them with set_hash(i, HASH). Once I have enough hashes, you can tell + me the hash of the leaf with set_leaf_hash(N, HASH), and I will either + return None or raise BadHashError. + + The first hash to be set will probably be 0 (the root hash), since this is + the one that will come from someone more trustworthy than the data + provider. + + """ + + def __init__(self, num_leaves): + L = [None] * num_leaves + start = len(L) + end = roundup_pow2(len(L)) + self.first_leaf_num = end - 1 + L = L + [None] * (end - start) + rows = [L] + while len(rows[-1]) != 1: + last = rows[-1] + rows += [[None for i in xrange(len(last)//2)]] + # Flatten the list of rows into a single list. + rows.reverse() + self[:] = sum(rows, []) + + def needed_hashes(self, hashes=[], leaves=[]): + hashnums = set(list(hashes)) + for leafnum in leaves: + hashnums.add(self.first_leaf_num + leafnum) + maybe_needed = set() + for hashnum in hashnums: + maybe_needed.update(self.needed_for(hashnum)) + maybe_needed.add(0) # need the root too + return set([i for i in maybe_needed if self[i] is None]) + + + def set_hashes(self, hashes={}, leaves={}, must_validate=False): + """Add a bunch of hashes to the tree. + + I will validate these to the best of my ability. If I already have a copy + of any of the new hashes, the new values must equal the existing ones, or + I will raise BadHashError. If adding a hash allows me to compute a parent + hash, those parent hashes must match or I will raise BadHashError. If I + raise BadHashError, I will forget about all the hashes that you tried to + add, leaving my state exactly the same as before I was called. If I + return successfully, I will remember all those hashes. + + If every hash that was added was validated, I will return True. If some + could not be validated because I did not have enough parent hashes, I + will return False. As a result, if I am called with both a leaf hash and + the root hash was already set, I will return True if and only if the leaf + hash could be validated against the root. + + If must_validate is True, I will raise NotEnoughHashesError instead of + returning False. If I raise NotEnoughHashesError, I will forget about all + the hashes that you tried to add. TODO: really? + + 'leaves' is a dictionary uses 'leaf index' values, which range from 0 + (the left-most leaf) to num_leaves-1 (the right-most leaf), and form the + base of the tree. 'hashes' uses 'hash_index' values, which range from 0 + (the root of the tree) to 2*num_leaves-2 (the right-most leaf). leaf[i] + is the same as hash[num_leaves-1+i]. + + The best way to use me is to obtain the root hash from some 'good' + channel, then call set_hash(0, root). Then use the 'bad' channel to + obtain data block 0 and the corresponding hash chain (a dict with the + same hashes that needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4, + 8:h8} when len(L)=8). Hash the data block to create leaf0. Then call:: + + good = iht.set_hashes(hashes=hashchain, leaves={0: leaf0}) + + If 'good' is True, the data block was valid. If 'good' is False, the + hashchain did not have the right blocks and we don't know whether the + data block was good or bad. If set_hashes() raises an exception, either + the data was corrupted or one of the received hashes was corrupted. + """ + + assert isinstance(hashes, dict) + assert isinstance(leaves, dict) + new_hashes = hashes.copy() + for leafnum,leafhash in leaves.iteritems(): + hashnum = self.first_leaf_num + leafnum + if hashnum in new_hashes: + assert new_hashes[hashnum] == leafhash + new_hashes[hashnum] = leafhash + + added = set() # we'll remove these if the check fails + + try: + # first we provisionally add all hashes to the tree, comparing any + # duplicates + for i in new_hashes: + if self[i]: + if self[i] != new_hashes[i]: + raise BadHashError("new hash does not match existing hash at [%d]" + % i) + else: + self[i] = new_hashes[i] + added.add(i) + + # then we start from the bottom and compute new parent hashes upwards, + # comparing any that already exist. When this phase ends, all nodes + # that have a sibling will also have a parent. + + hashes_to_check = list(new_hashes.keys()) + # leaf-most first means reverse sorted order + while hashes_to_check: + hashes_to_check.sort() + i = hashes_to_check.pop(-1) + if i == 0: + # The root has no sibling. How lonely. + continue + if self[self.sibling(i)] is None: + # without a sibling, we can't compute a parent + continue + parentnum = self.parent(i) + # make sure we know right from left + leftnum, rightnum = sorted([i, self.sibling(i)]) + new_parent_hash = pair_hash(self[leftnum], self[rightnum]) + if self[parentnum]: + if self[parentnum] != new_parent_hash: + raise BadHashError("h([%d]+[%d]) != h[%d]" % (leftnum, rightnum, + parentnum)) + else: + self[parentnum] = new_parent_hash + added.add(parentnum) + hashes_to_check.insert(0, parentnum) + + # then we walk downwards from the top (root), and anything that is + # reachable is validated. If any of the hashes that we've added are + # unreachable, then they are unvalidated. + + reachable = set() + if self[0]: + reachable.add(0) + # TODO: this could be done more efficiently, by starting from each + # element of new_hashes and walking upwards instead, remembering a set + # of validated nodes so that the searches for later new_hashes goes + # faster. This approach is O(n), whereas O(ln(n)) should be feasible. + for i in range(1, len(self)): + if self[i] and self.parent(i) in reachable: + reachable.add(i) + + # were we unable to validate any of the new hashes? + unvalidated = set(new_hashes.keys()) - reachable + if unvalidated: + if must_validate: + those = ",".join([str(i) for i in sorted(unvalidated)]) + raise NotEnoughHashesError("unable to validate hashes %s" % those) + + except (BadHashError, NotEnoughHashesError): + for i in added: + self[i] = None + raise + + # if there were hashes that could not be validated, we return False + return not unvalidated + diff --git a/src/allmydata/test/test_hashtree.py b/src/allmydata/test/test_hashtree.py index 1a51c6c6..2349ca2c 100644 --- a/src/allmydata/test/test_hashtree.py +++ b/src/allmydata/test/test_hashtree.py @@ -3,13 +3,13 @@ from twisted.trial import unittest from allmydata.util.hashutil import tagged_hash -from allmydata import chunk +from allmydata import hashtree def make_tree(numleaves): leaves = ["%d" % i for i in range(numleaves)] leaf_hashes = [tagged_hash("tag", leaf) for leaf in leaves] - ht = chunk.HashTree(leaf_hashes) + ht = hashtree.HashTree(leaf_hashes) return ht class Complete(unittest.TestCase): @@ -43,7 +43,7 @@ class Incomplete(unittest.TestCase): # first create a complete hash tree ht = make_tree(6) # then create a corresponding incomplete tree - iht = chunk.IncompleteHashTree(6) + iht = hashtree.IncompleteHashTree(6) # suppose we wanted to validate leaf[0] # leaf[0] is the same as node[7] @@ -61,7 +61,7 @@ class Incomplete(unittest.TestCase): # this should fail because there aren't enough hashes known iht.set_hashes(leaves={0: tagged_hash("tag", "0")}, must_validate=True) - except chunk.NotEnoughHashesError: + except hashtree.NotEnoughHashesError: pass else: self.fail("didn't catch not enough hashes") @@ -76,7 +76,7 @@ class Incomplete(unittest.TestCase): try: # this should fail because the hash is just plain wrong iht.set_hashes(leaves={0: tagged_hash("bad tag", "0")}) - except chunk.BadHashError: + except hashtree.BadHashError: pass else: self.fail("didn't catch bad hash") @@ -84,20 +84,20 @@ class Incomplete(unittest.TestCase): try: # this should succeed iht.set_hashes(leaves={0: tagged_hash("tag", "0")}) - except chunk.BadHashError, e: + except hashtree.BadHashError, e: self.fail("bad hash: %s" % e) try: # this should succeed too iht.set_hashes(leaves={1: tagged_hash("tag", "1")}) - except chunk.BadHashError: + except hashtree.BadHashError: self.fail("bad hash") # giving it a bad internal hash should also cause problems iht.set_hashes({13: tagged_hash("bad tag", "x")}) try: iht.set_hashes({14: tagged_hash("tag", "14")}) - except chunk.BadHashError: + except hashtree.BadHashError: pass else: self.fail("didn't catch bad hash") @@ -110,6 +110,6 @@ class Incomplete(unittest.TestCase): try: # this should succeed iht.set_hashes(leaves={4: tagged_hash("tag", "4")}) - except chunk.BadHashError, e: + except hashtree.BadHashError, e: self.fail("bad hash: %s" % e)