From: Brian Warner Date: Fri, 6 Apr 2007 16:09:57 +0000 (-0700) Subject: chunk: add IncompleteHashTree for download purposes, plus tests X-Git-Url: https://git.rkrishnan.org/index.html?a=commitdiff_plain;h=654854635a7be73f88d65ac26fcf89e7e91de9ca;p=tahoe-lafs%2Ftahoe-lafs.git chunk: add IncompleteHashTree for download purposes, plus tests --- diff --git a/src/allmydata/chunk.py b/src/allmydata/chunk.py index c3342bc1..a797c464 100644 --- a/src/allmydata/chunk.py +++ b/src/allmydata/chunk.py @@ -1,3 +1,4 @@ +# -*- test-case-name: allmydata.test.test_hashtree -*- """ Read and write chunks from files. @@ -133,6 +134,10 @@ class CompleteBinaryTreeMixin: here = self.parent(here) return needed +def empty_leaf_hash(i): + return tagged_hash('Merkle tree empty leaf', "%d" % i) +def pair_hash(a, b): + return tagged_pair_hash('Merkle tree internal node', a, b) class HashTree(CompleteBinaryTreeMixin, list): """ @@ -165,13 +170,104 @@ class HashTree(CompleteBinaryTreeMixin, list): end = roundup_pow2(len(L)) L = L + [None] * (end - start) for i in range(start, end): - L[i] = tagged_hash('Merkle tree empty leaf', "%d"%i) + L[i] = empty_leaf_hash(i) # Form each row of the tree. rows = [L] while len(rows[-1]) != 1: last = rows[-1] - rows += [[tagged_pair_hash('Merkle tree internal node', last[2*i], last[2*i+1]) for i in xrange(len(last)//2)]] + rows += [[pair_hash(last[2*i], last[2*i+1]) + for i in xrange(len(last)//2)]] # Flatten the list of rows into a single list. rows.reverse() self[:] = sum(rows, []) + +class NotEnoughHashesError(Exception): + pass + +class BadHashError(Exception): + pass + +class IncompleteHashTree(CompleteBinaryTreeMixin, list): + """I am a hash tree which may or may not be complete. I can be used to + validate inbound data from some untrustworthy provider who has a subset of + leaves and a sufficient subset of internal nodes. + + Initially I am completely unpopulated. Over time, I will become filled with + hashes, just enough to validate particular leaf nodes. + + If you desire to validate leaf number N, first find out which hashes I need + by calling needed_hashes(N). This will return a list of node numbers (which + will nominally be the sibling chain between the given leaf and the root, + but if I already have some of those nodes, needed_hashes(N) will only + return a subset). Obtain these hashes from the data provider, then tell me + about them with set_hash(i, HASH). Once I have enough hashes, you can tell + me the hash of the leaf with set_leaf_hash(N, HASH), and I will either + return None or raise BadHashError. + + The first hash to be set will probably be 0 (the root hash), since this is + the one that will come from someone more trustworthy than the data + provider. + + """ + + def __init__(self, num_leaves): + L = [None] * num_leaves + start = len(L) + end = roundup_pow2(len(L)) + self.first_leaf_num = end - 1 + L = L + [None] * (end - start) + rows = [L] + while len(rows[-1]) != 1: + last = rows[-1] + rows += [[None for i in xrange(len(last)//2)]] + # Flatten the list of rows into a single list. + rows.reverse() + self[:] = sum(rows, []) + + def needed_hashes(self, leafnum): + hashnum = self.first_leaf_num + leafnum + maybe_needed = self.needed_for(hashnum) + maybe_needed += [0] # need the root too + return set([i for i in maybe_needed if self[i] is None]) + + def set_hash(self, i, newhash): + # note that we don't attempt to validate these + self[i] = newhash + + def set_leaf(self, leafnum, leafhash): + hashnum = self.first_leaf_num + leafnum + needed = self.needed_hashes(leafnum) + if needed: + msg = "we need hashes " + ",".join([str(i) for i in needed]) + raise NotEnoughHashesError(msg) + assert self[0] is not None # we can't validate without a root + added = set() # we'll remove these if the check fails + self[hashnum] = leafhash + added.add(hashnum) + + # now propagate hash checks upwards until we reach the root + here = hashnum + while here != 0: + us = [here, self.sibling(here)] + us.sort() + leftnum, rightnum = us + lefthash = self[leftnum] + righthash = self[rightnum] + parent = self.parent(here) + parenthash = self[parent] + + ourhash = pair_hash(lefthash, righthash) + if parenthash is not None: + if ourhash != parenthash: + for i in added: + self[i] = None + raise BadHashError("h([%d]+[%d]) != h[%d]" % (leftnum, rightnum, + parent)) + else: + self[parent] = ourhash + added.add(parent) + here = self.parent(here) + + return None + diff --git a/src/allmydata/test/test_hashtree.py b/src/allmydata/test/test_hashtree.py new file mode 100644 index 00000000..5cac2cc1 --- /dev/null +++ b/src/allmydata/test/test_hashtree.py @@ -0,0 +1,96 @@ +# -*- test-case-name: allmydata.test.test_hashtree -*- + +from twisted.trial import unittest + +from allmydata.util.hashutil import tagged_hash +from allmydata import chunk + + +def make_tree(numleaves): + leaves = ["%d" % i for i in range(numleaves)] + leaf_hashes = [tagged_hash("tag", leaf) for leaf in leaves] + ht = chunk.HashTree(leaf_hashes) + return ht + +class Complete(unittest.TestCase): + def testCreate(self): + # try out various sizes + ht = make_tree(6) + ht = make_tree(8) + ht = make_tree(9) + root = ht[0] + self.failUnlessEqual(len(root), 32) + +class Incomplete(unittest.TestCase): + def testCheck(self): + # first create a complete hash tree + ht = make_tree(6) + # then create a corresponding incomplete tree + iht = chunk.IncompleteHashTree(6) + + # suppose we wanted to validate leaf[0] + # leaf[0] is the same as node[7] + self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2, 0])) + self.failUnlessEqual(iht.needed_hashes(1), set([7, 4, 2, 0])) + iht.set_hash(0, ht[0]) + self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2])) + self.failUnlessEqual(iht.needed_hashes(1), set([7, 4, 2])) + iht.set_hash(5, ht[5]) + self.failUnlessEqual(iht.needed_hashes(0), set([8, 4, 2])) + self.failUnlessEqual(iht.needed_hashes(1), set([7, 4, 2])) + + try: + # this should fail because there aren't enough hashes known + iht.set_leaf(0, tagged_hash("tag", "0")) + except chunk.NotEnoughHashesError: + pass + else: + self.fail("didn't catch not enough hashes") + + # provide the missing hashes + iht.set_hash(2, ht[2]) + iht.set_hash(4, ht[4]) + iht.set_hash(8, ht[8]) + self.failUnlessEqual(iht.needed_hashes(0), set([])) + + try: + # this should fail because the hash is just plain wrong + iht.set_leaf(0, tagged_hash("bad tag", "0")) + except chunk.BadHashError: + pass + else: + self.fail("didn't catch bad hash") + + try: + # this should succeed + iht.set_leaf(0, tagged_hash("tag", "0")) + except chunk.BadHashError, e: + self.fail("bad hash: %s" % e) + + try: + # this should succeed too + iht.set_leaf(1, tagged_hash("tag", "1")) + except chunk.BadHashError: + self.fail("bad hash") + + # giving it a bad internal hash should also cause problems + iht.set_hash(2, tagged_hash("bad tag", "x")) + try: + iht.set_leaf(0, tagged_hash("tag", "0")) + except chunk.BadHashError: + pass + else: + self.fail("didn't catch bad hash") + # undo our damage + iht.set_hash(2, ht[2]) + + self.failUnlessEqual(iht.needed_hashes(4), set([12, 6])) + + iht.set_hash(6, ht[6]) + iht.set_hash(12, ht[12]) + try: + # this should succeed + iht.set_leaf(4, tagged_hash("tag", "4")) + except chunk.BadHashError, e: + self.fail("bad hash: %s" % e) + diff --git a/src/allmydata/util/idlib.py b/src/allmydata/util/idlib.py index 24ca37da..185b3a3b 100644 --- a/src/allmydata/util/idlib.py +++ b/src/allmydata/util/idlib.py @@ -4,6 +4,11 @@ def b2a(i): assert isinstance(i, str), "tried to idlib.b2a non-string '%s'" % (i,) return b32encode(i).lower() +def b2a_or_none(i): + if i is None: + return None + return b2a(i) + def a2b(i): assert isinstance(i, str), "tried to idlib.a2b non-string '%s'" % (i,) try: