rename chunk.py to hashtree.py

author Brian Warner <warner@allmydata.com>

Thu, 12 Apr 2007 20:13:25 +0000 (13:13 -0700)

committer Brian Warner <warner@allmydata.com>

Thu, 12 Apr 2007 20:13:25 +0000 (13:13 -0700)
author Brian Warner <warner@allmydata.com>
Thu, 12 Apr 2007 20:13:25 +0000 (13:13 -0700)
committer Brian Warner <warner@allmydata.com>
Thu, 12 Apr 2007 20:13:25 +0000 (13:13 -0700)
diff --git a/src/allmydata/chunk.py b/src/allmydata/chunk.py

deleted file mode 100644 (file)

index 3e4e09e..0000000
--- a/src/allmydata/chunk.py
+++ /dev/null
@@ -1,380 +0,0 @@
-# -*- test-case-name: allmydata.test.test_hashtree -*-
-
-"""
-Read and write chunks from files.
-
-Version 1.0.0.
-
-A file is divided into blocks, each of which has size L{BLOCK_SIZE}
-(except for the last block, which may be smaller).  Blocks are encoded
-into chunks.  One publishes the hash of the entire file.  Clients
-who want to download the file first obtain the hash, then the clients
-can receive chunks in any order.  Cryptographic hashing is used to
-verify each received chunk before writing to disk.  Thus it is
-impossible to download corrupt data if one has the correct file hash.
-
-One obtains the hash of a complete file via
-L{CompleteChunkFile.file_hash}.  One can read chunks from a complete
-file by the sequence operations of C{len()} and subscripting on a
-L{CompleteChunkFile} object.  One can open an empty or partially
-downloaded file with L{PartialChunkFile}, and read and write chunks
-to this file.  A chunk will fail to write if its contents and index
-are not consistent with the overall file hash passed to
-L{PartialChunkFile} when the partial chunk file was first created.
-
-The chunks have an overhead of less than 4% for files of size
-less than C{10**20} bytes.
-
-Benchmarks:
-
- - On a 3 GHz Pentium 3, it took 3.4 minutes to first make a
-   L{CompleteChunkFile} object for a 4 GB file.  Up to 10 MB of
-   memory was used as the constructor ran.  A metafile filename
-   was passed to the constructor, and so the hash information was
-   written to the metafile.  The object used a negligible amount
-   of memory after the constructor was finished.
- - Creation of L{CompleteChunkFile} objects in future runs of the
-   program took negligible time, since the hash information was
-   already stored in the metafile.
-
-@var BLOCK_SIZE:     Size of a block.  See L{BlockFile}.
-@var MAX_CHUNK_SIZE: Upper bound on the size of a chunk.
-                     See L{CompleteChunkFile}.
-
-free (adj.): unencumbered; not under the control of others
-Written by Connelly Barnes in 2005 and released into the
-public domain  with no warranty of any kind, either expressed
-or implied.  It probably won't make your computer catch on fire,
-or eat  your children, but it might.  Use at your own risk.
-"""
-
-from allmydata.util import idlib
-from allmydata.util.hashutil import tagged_hash, tagged_pair_hash
-
-__version__ = '1.0.0-allmydata'
-
-BLOCK_SIZE     = 65536
-MAX_CHUNK_SIZE = BLOCK_SIZE + 4096
-
-def roundup_pow2(x):
-  """
-  Round integer C{x} up to the nearest power of 2.
-  """
-  ans = 1
-  while ans < x:
-    ans *= 2
-  return ans
-
-
-class CompleteBinaryTreeMixin:
-  """
-  Adds convenience methods to a complete binary tree.
-
-  Assumes the total number of elements in the binary tree may be
-  accessed via C{__len__}, and that each element can be retrieved
-  using list subscripting.
-
-  Tree is indexed like so::
-
-
-                      0
-                 /        \
-              1               2
-           /    \          /    \
-         3       4       5       6
-        / \     / \     / \     / \
-       7   8   9   10  11  12  13  14
-
-  """
-  def parent(self, i):
-    """
-    Index of the parent of C{i}.
-    """
-    if i < 1 or (hasattr(self, '__len__') and i >= len(self)):
-      raise IndexError('index out of range: ' + repr(i))
-    return (i - 1) // 2
-
-  def lchild(self, i):
-    """
-    Index of the left child of C{i}.
-    """
-    ans = 2 * i + 1
-    if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
-      raise IndexError('index out of range: ' + repr(i))
-    return ans
-
-  def rchild(self, i):
-    """
-    Index of right child of C{i}.
-    """
-    ans = 2 * i + 2
-    if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
-      raise IndexError('index out of range: ' + repr(i))
-    return ans
-
-  def sibling(self, i):
-    """
-    Index of sibling of C{i}.
-    """
-    parent = self.parent(i)
-    if self.lchild(parent) == i:
-      return self.rchild(parent)
-    else:
-      return self.lchild(parent)
-
-  def needed_for(self, i):
-    """
-    Return a list of node indices that are necessary for the hash chain.
-    """
-    if i < 0 or i >= len(self):
-      raise IndexError('index out of range: ' + repr(i))
-    needed = []
-    here = i
-    while here != 0:
-      needed.append(self.sibling(here))
-      here = self.parent(here)
-    return needed
-
-  def depth_first(self, i=0):
-    yield i, 0
-    try:
-      for child,childdepth in self.depth_first(self.lchild(i)):
-        yield child, childdepth+1
-    except IndexError:
-      pass
-    try:
-      for child,childdepth in self.depth_first(self.rchild(i)):
-        yield child, childdepth+1
-    except IndexError:
-      pass
-
-  def dump(self):
-    lines = []
-    for i,depth in self.depth_first():
-      lines.append("%s%3d: %s" % ("  "*depth, i, idlib.b2a_or_none(self[i])))
-    return "\n".join(lines) + "\n"
-
-def empty_leaf_hash(i):
-  return tagged_hash('Merkle tree empty leaf', "%d" % i)
-def pair_hash(a, b):
-  return tagged_pair_hash('Merkle tree internal node', a, b)
-
-class HashTree(CompleteBinaryTreeMixin, list):
-  """
-  Compute Merkle hashes at any node in a complete binary tree.
-
-  Tree is indexed like so::
-
-
-                      0
-                 /        \
-              1               2
-           /    \          /    \
-         3       4       5       6
-        / \     / \     / \     / \
-       7   8   9   10  11  12  13  14  <- List passed to constructor.
-
-  """
-  def __init__(self, L):
-    """
-    Create complete binary tree from list of hash strings.
-
-    The list is augmented by hashes so its length is a power of 2, and
-    then this is used as the bottom row of the hash tree.
-
-    The augmenting is done so that if the augmented element is at
-    index C{i}, then its value is C{hash(tagged_hash('Merkle tree empty leaf', '%d'%i))}.
-    """
-    # Augment the list.
-    start = len(L)
-    end   = roundup_pow2(len(L))
-    L     = L + [None] * (end - start)
-    for i in range(start, end):
-      L[i] = empty_leaf_hash(i)
-    # Form each row of the tree.
-    rows = [L]
-    while len(rows[-1]) != 1:
-      last = rows[-1]
-      rows += [[pair_hash(last[2*i], last[2*i+1])
-                for i in xrange(len(last)//2)]]
-    # Flatten the list of rows into a single list.
-    rows.reverse()
-    self[:] = sum(rows, [])
-
-
-class NotEnoughHashesError(Exception):
-  pass
-
-class BadHashError(Exception):
-  pass
-
-class IncompleteHashTree(CompleteBinaryTreeMixin, list):
-  """I am a hash tree which may or may not be complete. I can be used to
-  validate inbound data from some untrustworthy provider who has a subset of
-  leaves and a sufficient subset of internal nodes.
-
-  Initially I am completely unpopulated. Over time, I will become filled with
-  hashes, just enough to validate particular leaf nodes.
-
-  If you desire to validate leaf number N, first find out which hashes I need
-  by calling needed_hashes(N). This will return a list of node numbers (which
-  will nominally be the sibling chain between the given leaf and the root,
-  but if I already have some of those nodes, needed_hashes(N) will only
-  return a subset). Obtain these hashes from the data provider, then tell me
-  about them with set_hash(i, HASH). Once I have enough hashes, you can tell
-  me the hash of the leaf with set_leaf_hash(N, HASH), and I will either
-  return None or raise BadHashError.
-
-  The first hash to be set will probably be 0 (the root hash), since this is
-  the one that will come from someone more trustworthy than the data
-  provider.
-
-  """
-
-  def __init__(self, num_leaves):
-    L = [None] * num_leaves
-    start = len(L)
-    end   = roundup_pow2(len(L))
-    self.first_leaf_num = end - 1
-    L     = L + [None] * (end - start)
-    rows = [L]
-    while len(rows[-1]) != 1:
-      last = rows[-1]
-      rows += [[None for i in xrange(len(last)//2)]]
-    # Flatten the list of rows into a single list.
-    rows.reverse()
-    self[:] = sum(rows, [])
-
-  def needed_hashes(self, hashes=[], leaves=[]):
-    hashnums = set(list(hashes))
-    for leafnum in leaves:
-      hashnums.add(self.first_leaf_num + leafnum)
-    maybe_needed = set()
-    for hashnum in hashnums:
-      maybe_needed.update(self.needed_for(hashnum))
-    maybe_needed.add(0) # need the root too
-    return set([i for i in maybe_needed if self[i] is None])
-
-
-  def set_hashes(self, hashes={}, leaves={}, must_validate=False):
-    """Add a bunch of hashes to the tree.
-
-    I will validate these to the best of my ability. If I already have a copy
-    of any of the new hashes, the new values must equal the existing ones, or
-    I will raise BadHashError. If adding a hash allows me to compute a parent
-    hash, those parent hashes must match or I will raise BadHashError. If I
-    raise BadHashError, I will forget about all the hashes that you tried to
-    add, leaving my state exactly the same as before I was called. If I
-    return successfully, I will remember all those hashes.
-
-    If every hash that was added was validated, I will return True. If some
-    could not be validated because I did not have enough parent hashes, I
-    will return False. As a result, if I am called with both a leaf hash and
-    the root hash was already set, I will return True if and only if the leaf
-    hash could be validated against the root.
-
-    If must_validate is True, I will raise NotEnoughHashesError instead of
-    returning False. If I raise NotEnoughHashesError, I will forget about all
-    the hashes that you tried to add. TODO: really?
-
-    'leaves' is a dictionary uses 'leaf index' values, which range from 0
-    (the left-most leaf) to num_leaves-1 (the right-most leaf), and form the
-    base of the tree. 'hashes' uses 'hash_index' values, which range from 0
-    (the root of the tree) to 2*num_leaves-2 (the right-most leaf). leaf[i]
-    is the same as hash[num_leaves-1+i].
-
-    The best way to use me is to obtain the root hash from some 'good'
-    channel, then call set_hash(0, root). Then use the 'bad' channel to
-    obtain data block 0 and the corresponding hash chain (a dict with the
-    same hashes that needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4,
-    8:h8} when len(L)=8). Hash the data block to create leaf0. Then call::
-
-     good = iht.set_hashes(hashes=hashchain, leaves={0: leaf0})
-
-    If 'good' is True, the data block was valid. If 'good' is False, the
-    hashchain did not have the right blocks and we don't know whether the
-    data block was good or bad. If set_hashes() raises an exception, either
-    the data was corrupted or one of the received hashes was corrupted.
-    """
-
-    assert isinstance(hashes, dict)
-    assert isinstance(leaves, dict)
-    new_hashes = hashes.copy()
-    for leafnum,leafhash in leaves.iteritems():
-      hashnum = self.first_leaf_num + leafnum
-      if hashnum in new_hashes:
-        assert new_hashes[hashnum] == leafhash
-      new_hashes[hashnum] = leafhash
-
-    added = set() # we'll remove these if the check fails
-
-    try:
-      # first we provisionally add all hashes to the tree, comparing any
-      # duplicates
-      for i in new_hashes:
-        if self[i]:
-          if self[i] != new_hashes[i]:
-            raise BadHashError("new hash does not match existing hash at [%d]"
-                               % i)
-        else:
-          self[i] = new_hashes[i]
-          added.add(i)
-
-      # then we start from the bottom and compute new parent hashes upwards,
-      # comparing any that already exist. When this phase ends, all nodes
-      # that have a sibling will also have a parent.
-
-      hashes_to_check = list(new_hashes.keys())
-      # leaf-most first means reverse sorted order
-      while hashes_to_check:
-        hashes_to_check.sort()
-        i = hashes_to_check.pop(-1)
-        if i == 0:
-          # The root has no sibling. How lonely.
-          continue
-        if self[self.sibling(i)] is None:
-          # without a sibling, we can't compute a parent
-          continue
-        parentnum = self.parent(i)
-        # make sure we know right from left
-        leftnum, rightnum = sorted([i, self.sibling(i)])
-        new_parent_hash = pair_hash(self[leftnum], self[rightnum])
-        if self[parentnum]:
-          if self[parentnum] != new_parent_hash:
-            raise BadHashError("h([%d]+[%d]) != h[%d]" % (leftnum, rightnum,
-                                                          parentnum))
-        else:
-          self[parentnum] = new_parent_hash
-          added.add(parentnum)
-          hashes_to_check.insert(0, parentnum)
-
-      # then we walk downwards from the top (root), and anything that is
-      # reachable is validated. If any of the hashes that we've added are
-      # unreachable, then they are unvalidated.
-
-      reachable = set()
-      if self[0]:
-        reachable.add(0)
-      # TODO: this could be done more efficiently, by starting from each
-      # element of new_hashes and walking upwards instead, remembering a set
-      # of validated nodes so that the searches for later new_hashes goes
-      # faster. This approach is O(n), whereas O(ln(n)) should be feasible.
-      for i in range(1, len(self)):
-        if self[i] and self.parent(i) in reachable:
-          reachable.add(i)
-
-      # were we unable to validate any of the new hashes?
-      unvalidated = set(new_hashes.keys()) - reachable
-      if unvalidated:
-        if must_validate:
-          those = ",".join([str(i) for i in sorted(unvalidated)])
-          raise NotEnoughHashesError("unable to validate hashes %s" % those)
-
-    except (BadHashError, NotEnoughHashesError):
-      for i in added:
-        self[i] = None
-      raise
-
-    # if there were hashes that could not be validated, we return False
-    return not unvalidated
-
diff --git a/src/allmydata/download.py b/src/allmydata/download.py

index 3265184da639269676f59b8318f95c735a3cce1e..57ba5645861087f4ad3eac2574930e48a1d0c846 100644 (file)
--- a/src/allmydata/download.py
+++ b/src/allmydata/download.py
@@ -7,7 +7,7 @@ from twisted.application import service
  
  from allmydata.util import idlib, mathutil, hashutil
  from allmydata.util.assertutil import _assert
-from allmydata import codec, chunk
+from allmydata import codec, hashtree
  from allmydata.Crypto.Cipher import AES
  from allmydata.uri import unpack_uri
  from allmydata.interfaces import IDownloadTarget, IDownloader
@@ -52,7 +52,7 @@ class ValidatedBucket:
          self.sharenum = sharenum
          self.bucket = bucket
          self.share_hash_tree = share_hash_tree
-        self.block_hash_tree = chunk.IncompleteHashTree(num_blocks)
+        self.block_hash_tree = hashtree.IncompleteHashTree(num_blocks)
  
      def get_block(self, blocknum):
          d1 = self.bucket.callRemote('get_block', blocknum)
@@ -193,7 +193,7 @@ class FileDownloader:
          key = "\x00" * 16
          self._output = Output(downloadable, key)
  
-        self._share_hashtree = chunk.IncompleteHashTree(total_shares)
+        self._share_hashtree = hashtree.IncompleteHashTree(total_shares)
          self._share_hashtree.set_hashes({0: roothash})
  
          self.active_buckets = {} # k: shnum, v: bucket
diff --git a/src/allmydata/encode.py b/src/allmydata/encode.py

index 968adf9f5abd9380cafddf9f5fe6db2f5f304b57..743e63b88f4f0d12b76069f1847d72cd025ac8ff 100644 (file)
--- a/src/allmydata/encode.py
+++ b/src/allmydata/encode.py
@@ -3,7 +3,7 @@
  from zope.interface import implements
  from twisted.internet import defer
  from twisted.python import log
-from allmydata.chunk import HashTree, roundup_pow2
+from allmydata.hashtree import HashTree, roundup_pow2
  from allmydata.Crypto.Cipher import AES
  from allmydata.util import mathutil, hashutil
  from allmydata.util.assertutil import _assert
diff --git a/src/allmydata/hashtree.py b/src/allmydata/hashtree.py

new file mode 100644 (file)

index 0000000..3e4e09e
--- /dev/null
+++ b/src/allmydata/hashtree.py
@@ -0,0 +1,380 @@
+# -*- test-case-name: allmydata.test.test_hashtree -*-
+
+"""
+Read and write chunks from files.
+
+Version 1.0.0.
+
+A file is divided into blocks, each of which has size L{BLOCK_SIZE}
+(except for the last block, which may be smaller).  Blocks are encoded
+into chunks.  One publishes the hash of the entire file.  Clients
+who want to download the file first obtain the hash, then the clients
+can receive chunks in any order.  Cryptographic hashing is used to
+verify each received chunk before writing to disk.  Thus it is
+impossible to download corrupt data if one has the correct file hash.
+
+One obtains the hash of a complete file via
+L{CompleteChunkFile.file_hash}.  One can read chunks from a complete
+file by the sequence operations of C{len()} and subscripting on a
+L{CompleteChunkFile} object.  One can open an empty or partially
+downloaded file with L{PartialChunkFile}, and read and write chunks
+to this file.  A chunk will fail to write if its contents and index
+are not consistent with the overall file hash passed to
+L{PartialChunkFile} when the partial chunk file was first created.
+
+The chunks have an overhead of less than 4% for files of size
+less than C{10**20} bytes.
+
+Benchmarks:
+
+ - On a 3 GHz Pentium 3, it took 3.4 minutes to first make a
+   L{CompleteChunkFile} object for a 4 GB file.  Up to 10 MB of
+   memory was used as the constructor ran.  A metafile filename
+   was passed to the constructor, and so the hash information was
+   written to the metafile.  The object used a negligible amount
+   of memory after the constructor was finished.
+ - Creation of L{CompleteChunkFile} objects in future runs of the
+   program took negligible time, since the hash information was
+   already stored in the metafile.
+
+@var BLOCK_SIZE:     Size of a block.  See L{BlockFile}.
+@var MAX_CHUNK_SIZE: Upper bound on the size of a chunk.
+                     See L{CompleteChunkFile}.
+
+free (adj.): unencumbered; not under the control of others
+Written by Connelly Barnes in 2005 and released into the
+public domain  with no warranty of any kind, either expressed
+or implied.  It probably won't make your computer catch on fire,
+or eat  your children, but it might.  Use at your own risk.
+"""
+
+from allmydata.util import idlib
+from allmydata.util.hashutil import tagged_hash, tagged_pair_hash
+
+__version__ = '1.0.0-allmydata'
+
+BLOCK_SIZE     = 65536
+MAX_CHUNK_SIZE = BLOCK_SIZE + 4096
+
+def roundup_pow2(x):
+  """
+  Round integer C{x} up to the nearest power of 2.
+  """
+  ans = 1
+  while ans < x:
+    ans *= 2
+  return ans
+
+
+class CompleteBinaryTreeMixin:
+  """
+  Adds convenience methods to a complete binary tree.
+
+  Assumes the total number of elements in the binary tree may be
+  accessed via C{__len__}, and that each element can be retrieved
+  using list subscripting.
+
+  Tree is indexed like so::
+
+
+                      0
+                 /        \
+              1               2
+           /    \          /    \
+         3       4       5       6
+        / \     / \     / \     / \
+       7   8   9   10  11  12  13  14
+
+  """
+  def parent(self, i):
+    """
+    Index of the parent of C{i}.
+    """
+    if i < 1 or (hasattr(self, '__len__') and i >= len(self)):
+      raise IndexError('index out of range: ' + repr(i))
+    return (i - 1) // 2
+
+  def lchild(self, i):
+    """
+    Index of the left child of C{i}.
+    """
+    ans = 2 * i + 1
+    if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
+      raise IndexError('index out of range: ' + repr(i))
+    return ans
+
+  def rchild(self, i):
+    """
+    Index of right child of C{i}.
+    """
+    ans = 2 * i + 2
+    if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
+      raise IndexError('index out of range: ' + repr(i))
+    return ans
+
+  def sibling(self, i):
+    """
+    Index of sibling of C{i}.
+    """
+    parent = self.parent(i)
+    if self.lchild(parent) == i:
+      return self.rchild(parent)
+    else:
+      return self.lchild(parent)
+
+  def needed_for(self, i):
+    """
+    Return a list of node indices that are necessary for the hash chain.
+    """
+    if i < 0 or i >= len(self):
+      raise IndexError('index out of range: ' + repr(i))
+    needed = []
+    here = i
+    while here != 0:
+      needed.append(self.sibling(here))
+      here = self.parent(here)
+    return needed
+
+  def depth_first(self, i=0):
+    yield i, 0
+    try:
+      for child,childdepth in self.depth_first(self.lchild(i)):
+        yield child, childdepth+1
+    except IndexError:
+      pass
+    try:
+      for child,childdepth in self.depth_first(self.rchild(i)):
+        yield child, childdepth+1
+    except IndexError:
+      pass
+
+  def dump(self):
+    lines = []
+    for i,depth in self.depth_first():
+      lines.append("%s%3d: %s" % ("  "*depth, i, idlib.b2a_or_none(self[i])))
+    return "\n".join(lines) + "\n"
+
+def empty_leaf_hash(i):
+  return tagged_hash('Merkle tree empty leaf', "%d" % i)
+def pair_hash(a, b):
+  return tagged_pair_hash('Merkle tree internal node', a, b)
+
+class HashTree(CompleteBinaryTreeMixin, list):
+  """
+  Compute Merkle hashes at any node in a complete binary tree.
+
+  Tree is indexed like so::
+
+
+                      0
+                 /        \
+              1               2
+           /    \          /    \
+         3       4       5       6
+        / \     / \     / \     / \
+       7   8   9   10  11  12  13  14  <- List passed to constructor.
+
+  """
+  def __init__(self, L):
+    """
+    Create complete binary tree from list of hash strings.
+
+    The list is augmented by hashes so its length is a power of 2, and
+    then this is used as the bottom row of the hash tree.
+
+    The augmenting is done so that if the augmented element is at
+    index C{i}, then its value is C{hash(tagged_hash('Merkle tree empty leaf', '%d'%i))}.
+    """
+    # Augment the list.
+    start = len(L)
+    end   = roundup_pow2(len(L))
+    L     = L + [None] * (end - start)
+    for i in range(start, end):
+      L[i] = empty_leaf_hash(i)
+    # Form each row of the tree.
+    rows = [L]
+    while len(rows[-1]) != 1:
+      last = rows[-1]
+      rows += [[pair_hash(last[2*i], last[2*i+1])
+                for i in xrange(len(last)//2)]]
+    # Flatten the list of rows into a single list.
+    rows.reverse()
+    self[:] = sum(rows, [])
+
+
+class NotEnoughHashesError(Exception):
+  pass
+
+class BadHashError(Exception):
+  pass
+
+class IncompleteHashTree(CompleteBinaryTreeMixin, list):
+  """I am a hash tree which may or may not be complete. I can be used to
+  validate inbound data from some untrustworthy provider who has a subset of
+  leaves and a sufficient subset of internal nodes.
+
+  Initially I am completely unpopulated. Over time, I will become filled with
+  hashes, just enough to validate particular leaf nodes.
+
+  If you desire to validate leaf number N, first find out which hashes I need
+  by calling needed_hashes(N). This will return a list of node numbers (which
+  will nominally be the sibling chain between the given leaf and the root,
+  but if I already have some of those nodes, needed_hashes(N) will only
+  return a subset). Obtain these hashes from the data provider, then tell me
+  about them with set_hash(i, HASH). Once I have enough hashes, you can tell
+  me the hash of the leaf with set_leaf_hash(N, HASH), and I will either
+  return None or raise BadHashError.
+
+  The first hash to be set will probably be 0 (the root hash), since this is
+  the one that will come from someone more trustworthy than the data
+  provider.
+
+  """
+
+  def __init__(self, num_leaves):
+    L = [None] * num_leaves
+    start = len(L)
+    end   = roundup_pow2(len(L))
+    self.first_leaf_num = end - 1
+    L     = L + [None] * (end - start)
+    rows = [L]
+    while len(rows[-1]) != 1:
+      last = rows[-1]
+      rows += [[None for i in xrange(len(last)//2)]]
+    # Flatten the list of rows into a single list.
+    rows.reverse()
+    self[:] = sum(rows, [])
+
+  def needed_hashes(self, hashes=[], leaves=[]):
+    hashnums = set(list(hashes))
+    for leafnum in leaves:
+      hashnums.add(self.first_leaf_num + leafnum)
+    maybe_needed = set()
+    for hashnum in hashnums:
+      maybe_needed.update(self.needed_for(hashnum))
+    maybe_needed.add(0) # need the root too
+    return set([i for i in maybe_needed if self[i] is None])
+
+
+  def set_hashes(self, hashes={}, leaves={}, must_validate=False):
+    """Add a bunch of hashes to the tree.
+
+    I will validate these to the best of my ability. If I already have a copy
+    of any of the new hashes, the new values must equal the existing ones, or
+    I will raise BadHashError. If adding a hash allows me to compute a parent
+    hash, those parent hashes must match or I will raise BadHashError. If I
+    raise BadHashError, I will forget about all the hashes that you tried to
+    add, leaving my state exactly the same as before I was called. If I
+    return successfully, I will remember all those hashes.
+
+    If every hash that was added was validated, I will return True. If some
+    could not be validated because I did not have enough parent hashes, I
+    will return False. As a result, if I am called with both a leaf hash and
+    the root hash was already set, I will return True if and only if the leaf
+    hash could be validated against the root.
+
+    If must_validate is True, I will raise NotEnoughHashesError instead of
+    returning False. If I raise NotEnoughHashesError, I will forget about all
+    the hashes that you tried to add. TODO: really?
+
+    'leaves' is a dictionary uses 'leaf index' values, which range from 0
+    (the left-most leaf) to num_leaves-1 (the right-most leaf), and form the
+    base of the tree. 'hashes' uses 'hash_index' values, which range from 0
+    (the root of the tree) to 2*num_leaves-2 (the right-most leaf). leaf[i]
+    is the same as hash[num_leaves-1+i].
+
+    The best way to use me is to obtain the root hash from some 'good'
+    channel, then call set_hash(0, root). Then use the 'bad' channel to
+    obtain data block 0 and the corresponding hash chain (a dict with the
+    same hashes that needed_hashes(0) tells you, e.g. {0:h0, 2:h2, 4:h4,
+    8:h8} when len(L)=8). Hash the data block to create leaf0. Then call::
+
+     good = iht.set_hashes(hashes=hashchain, leaves={0: leaf0})
+
+    If 'good' is True, the data block was valid. If 'good' is False, the
+    hashchain did not have the right blocks and we don't know whether the
+    data block was good or bad. If set_hashes() raises an exception, either
+    the data was corrupted or one of the received hashes was corrupted.
+    """
+
+    assert isinstance(hashes, dict)
+    assert isinstance(leaves, dict)
+    new_hashes = hashes.copy()
+    for leafnum,leafhash in leaves.iteritems():
+      hashnum = self.first_leaf_num + leafnum
+      if hashnum in new_hashes:
+        assert new_hashes[hashnum] == leafhash
+      new_hashes[hashnum] = leafhash
+
+    added = set() # we'll remove these if the check fails
+
+    try:
+      # first we provisionally add all hashes to the tree, comparing any
+      # duplicates
+      for i in new_hashes:
+        if self[i]:
+          if self[i] != new_hashes[i]:
+            raise BadHashError("new hash does not match existing hash at [%d]"
+                               % i)
+        else:
+          self[i] = new_hashes[i]
+          added.add(i)
+
+      # then we start from the bottom and compute new parent hashes upwards,
+      # comparing any that already exist. When this phase ends, all nodes
+      # that have a sibling will also have a parent.
+
+      hashes_to_check = list(new_hashes.keys())
+      # leaf-most first means reverse sorted order
+      while hashes_to_check:
+        hashes_to_check.sort()
+        i = hashes_to_check.pop(-1)
+        if i == 0:
+          # The root has no sibling. How lonely.
+          continue
+        if self[self.sibling(i)] is None:
+          # without a sibling, we can't compute a parent
+          continue
+        parentnum = self.parent(i)
+        # make sure we know right from left
+        leftnum, rightnum = sorted([i, self.sibling(i)])
+        new_parent_hash = pair_hash(self[leftnum], self[rightnum])
+        if self[parentnum]:
+          if self[parentnum] != new_parent_hash:
+            raise BadHashError("h([%d]+[%d]) != h[%d]" % (leftnum, rightnum,
+                                                          parentnum))
+        else:
+          self[parentnum] = new_parent_hash
+          added.add(parentnum)
+          hashes_to_check.insert(0, parentnum)
+
+      # then we walk downwards from the top (root), and anything that is
+      # reachable is validated. If any of the hashes that we've added are
+      # unreachable, then they are unvalidated.
+
+      reachable = set()
+      if self[0]:
+        reachable.add(0)
+      # TODO: this could be done more efficiently, by starting from each
+      # element of new_hashes and walking upwards instead, remembering a set
+      # of validated nodes so that the searches for later new_hashes goes
+      # faster. This approach is O(n), whereas O(ln(n)) should be feasible.
+      for i in range(1, len(self)):
+        if self[i] and self.parent(i) in reachable:
+          reachable.add(i)
+
+      # were we unable to validate any of the new hashes?
+      unvalidated = set(new_hashes.keys()) - reachable
+      if unvalidated:
+        if must_validate:
+          those = ",".join([str(i) for i in sorted(unvalidated)])
+          raise NotEnoughHashesError("unable to validate hashes %s" % those)
+
+    except (BadHashError, NotEnoughHashesError):
+      for i in added:
+        self[i] = None
+      raise
+
+    # if there were hashes that could not be validated, we return False
+    return not unvalidated
+
diff --git a/src/allmydata/test/test_hashtree.py b/src/allmydata/test/test_hashtree.py

index 1a51c6c6a536781cbb0df2c4ea0383d141d60f80..2349ca2c0dd57208f7cc342549a56bc9921b307d 100644 (file)
--- a/src/allmydata/test/test_hashtree.py
+++ b/src/allmydata/test/test_hashtree.py
@@ -3,13 +3,13 @@
  from twisted.trial import unittest
  
  from allmydata.util.hashutil import tagged_hash
-from allmydata import chunk
+from allmydata import hashtree
  
  
  def make_tree(numleaves):
      leaves = ["%d" % i for i in range(numleaves)]
      leaf_hashes = [tagged_hash("tag", leaf) for leaf in leaves]
-    ht = chunk.HashTree(leaf_hashes)
+    ht = hashtree.HashTree(leaf_hashes)
      return ht
  
  class Complete(unittest.TestCase):
@@ -43,7 +43,7 @@ class Incomplete(unittest.TestCase):
          # first create a complete hash tree
          ht = make_tree(6)
          # then create a corresponding incomplete tree
-        iht = chunk.IncompleteHashTree(6)
+        iht = hashtree.IncompleteHashTree(6)
  
          # suppose we wanted to validate leaf[0]
          #  leaf[0] is the same as node[7]
@@ -61,7 +61,7 @@ class Incomplete(unittest.TestCase):
              # this should fail because there aren't enough hashes known
              iht.set_hashes(leaves={0: tagged_hash("tag", "0")},
                             must_validate=True)
-        except chunk.NotEnoughHashesError:
+        except hashtree.NotEnoughHashesError:
              pass
          else:
              self.fail("didn't catch not enough hashes")
@@ -76,7 +76,7 @@ class Incomplete(unittest.TestCase):
          try:
              # this should fail because the hash is just plain wrong
              iht.set_hashes(leaves={0: tagged_hash("bad tag", "0")})
-        except chunk.BadHashError:
+        except hashtree.BadHashError:
              pass
          else:
              self.fail("didn't catch bad hash")
@@ -84,20 +84,20 @@ class Incomplete(unittest.TestCase):
          try:
              # this should succeed
              iht.set_hashes(leaves={0: tagged_hash("tag", "0")})
-        except chunk.BadHashError, e:
+        except hashtree.BadHashError, e:
              self.fail("bad hash: %s" % e)
  
          try:
              # this should succeed too
              iht.set_hashes(leaves={1: tagged_hash("tag", "1")})
-        except chunk.BadHashError:
+        except hashtree.BadHashError:
              self.fail("bad hash")
  
          # giving it a bad internal hash should also cause problems
          iht.set_hashes({13: tagged_hash("bad tag", "x")})
          try:
              iht.set_hashes({14: tagged_hash("tag", "14")})
-        except chunk.BadHashError:
+        except hashtree.BadHashError:
              pass
          else:
              self.fail("didn't catch bad hash")
@@ -110,6 +110,6 @@ class Incomplete(unittest.TestCase):
          try:
              # this should succeed
              iht.set_hashes(leaves={4: tagged_hash("tag", "4")})
-        except chunk.BadHashError, e:
+        except hashtree.BadHashError, e:
              self.fail("bad hash: %s" % e)
author	Brian Warner <warner@allmydata.com>
	Thu, 12 Apr 2007 20:13:25 +0000 (13:13 -0700)
committer	Brian Warner <warner@allmydata.com>
	Thu, 12 Apr 2007 20:13:25 +0000 (13:13 -0700)
src/allmydata/chunk.py	[deleted file]	patch \| blob \| history
src/allmydata/download.py		patch \| blob \| history
src/allmydata/encode.py		patch \| blob \| history
src/allmydata/hashtree.py	[new file with mode: 0644]	patch \| blob
src/allmydata/test/test_hashtree.py		patch \| blob \| history