--- /dev/null
+
+"""
+Read and write chunks from files.
+
+Version 1.0.0.
+
+A file is divided into blocks, each of which has size L{BLOCK_SIZE}
+(except for the last block, which may be smaller). Blocks are encoded
+into chunks. One publishes the hash of the entire file. Clients
+who want to download the file first obtain this hash, and can then
+receive chunks in any order. Cryptographic hashing is used to
+verify each received chunk before it is written to disk, so it is
+impossible to download corrupt data if one has the correct file hash.
+
+One obtains the hash of a complete file via
+L{CompleteChunkFile.file_hash}. One can read chunks from a complete
+file by the sequence operations of C{len()} and subscripting on a
+L{CompleteChunkFile} object. One can open an empty or partially
+downloaded file with L{PartialChunkFile}, and read and write chunks
+to this file. A chunk will fail to write if its contents and index
+are not consistent with the overall file hash passed to
+L{PartialChunkFile} when the partial chunk file was first created.
+
+The chunks have an overhead of less than 4% for files of size
+less than C{10**20} bytes.
+
+Benchmarks:
+
+ - On a 3 GHz Pentium 3, creating a L{CompleteChunkFile} object
+   for a 4 GB file for the first time took 3.4 minutes. Up to 10 MB of
+ memory was used as the constructor ran. A metafile filename
+ was passed to the constructor, and so the hash information was
+ written to the metafile. The object used a negligible amount
+ of memory after the constructor was finished.
+ - Creation of L{CompleteChunkFile} objects in future runs of the
+ program took negligible time, since the hash information was
+ already stored in the metafile.
+
+@var BLOCK_SIZE: Size of a block. See L{BlockFile}.
+@var MAX_CHUNK_SIZE: Upper bound on the size of a chunk.
+ See L{CompleteChunkFile}.
+
+free (adj.): unencumbered; not under the control of others
+Written by Connelly Barnes in 2005 and released into the
+public domain with no warranty of any kind, either expressed
+or implied. It probably won't make your computer catch on fire,
+or eat your children, but it might. Use at your own risk.
+"""
+
+import sha
+import os
+
+from allmydata.util import bencode
+
+__all__ = ['CompleteChunkFile', 'PartialChunkFile']
+
+__version__ = '1.0.0'
+
+BLOCK_SIZE = 65536
+MAX_CHUNK_SIZE = BLOCK_SIZE + 4096
+
+def hash(s):
+ """
+    Cryptographic hash function used by this module: SHA-1, whose 20-byte
+    digest length other parts of the module rely on (via C{len(hash(''))}).
+ """
+ return sha.new(s).digest()
+
+
+def roundup_pow2(x):
+ """
+ Round integer C{x} up to the nearest power of 2.
+ """
+ ans = 1
+ while ans < x:
+ ans *= 2
+ return ans
+
+
+class CompleteBinaryTreeMixin:
+ """
+ Adds convenience methods to a complete binary tree.
+
+ Assumes the total number of elements in the binary tree may be
+ accessed via C{__len__}, and that each element can be retrieved
+ using list subscripting.
+
+ Tree is indexed like so::
+
+
+ 0
+ / \
+ 1 2
+ / \ / \
+ 3 4 5 6
+ / \ / \ / \ / \
+ 7 8 9 10 11 12 13 14
+
+ """
+ def parent(self, i):
+ """
+ Index of the parent of C{i}.
+ """
+ if i < 1 or (hasattr(self, '__len__') and i >= len(self)):
+ raise IndexError('index out of range: ' + repr(i))
+ return (i - 1) // 2
+
+ def lchild(self, i):
+ """
+ Index of the left child of C{i}.
+ """
+ ans = 2 * i + 1
+ if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
+ raise IndexError('index out of range: ' + repr(i))
+ return ans
+
+ def rchild(self, i):
+ """
+ Index of right child of C{i}.
+ """
+ ans = 2 * i + 2
+ if i < 0 or (hasattr(self, '__len__') and ans >= len(self)):
+ raise IndexError('index out of range: ' + repr(i))
+ return ans
+
+ def sibling(self, i):
+ """
+ Index of sibling of C{i}.
+ """
+ parent = self.parent(i)
+ if self.lchild(parent) == i:
+ return self.rchild(parent)
+ else:
+ return self.lchild(parent)
+
+ def needed(self, i):
+ """
+ Return a list of nodes that are necessary for the hash chain.
+ """
+ if i < 0 or i >= len(self):
+ raise IndexError('index out of range: ' + repr(i))
+ needed = []
+ here = i
+ while here != 0:
+ needed.append(self.sibling(here))
+ here = self.parent(here)
+ return needed
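+
+    # For example, in the 15-node tree drawn above, needed(7) walks up from
+    # leaf 7 collecting siblings: [8, 4, 2]. Hashing node 7 together with
+    # those three values, pair by pair, reproduces the root hash at index 0.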
+
+
+class HashTree(CompleteBinaryTreeMixin, list):
+ """
+ Compute Merkle hashes at any node in a complete binary tree.
+
+ Tree is indexed like so::
+
+
+ 0
+ / \
+ 1 2
+ / \ / \
+ 3 4 5 6
+ / \ / \ / \ / \
+ 7 8 9 10 11 12 13 14 <- List passed to constructor.
+
+ """
+ def __init__(self, L):
+ """
+ Create complete binary tree from list of hash strings.
+
+ The list is augmented by hashes so its length is a power of 2, and
+ then this is used as the bottom row of the hash tree.
+
+ The augmenting is done so that if the augmented element is at
+ index C{i}, then its value is C{hash(bencode.bencode((i, '')))}.
+ """
+ # Augment the list.
+ start = len(L)
+ end = roundup_pow2(len(L))
+ L = L + [None] * (end - start)
+ for i in range(start, end):
+ L[i] = hash(bencode.bencode((i, '')))
+ # Form each row of the tree.
+ rows = [L]
+ while len(rows[-1]) != 1:
+ last = rows[-1]
+ rows += [[hash(last[2*i] + last[2*i+1]) for i in xrange(len(last)//2)]]
+ # Flatten the list of rows into a single list.
+ rows.reverse()
+ self[:] = sum(rows, [])
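+
+    # Illustrative sketch: three leaf hashes are padded up to four, giving a
+    # 7-node tree whose root commits to every leaf.
+    #
+    #   t = HashTree([hash('a'), hash('b'), hash('c')])
+    #   assert len(t) == 7                # 4 padded leaves -> 7 nodes
+    #   assert t[0] == hash(t[1] + t[2])  # root is the hash of its children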
+
+
+class BlockFile:
+ """
+ Reads and writes blocks of data to a binary file.
+
+ It is assumed that the binary file does not change in size.
+
+ @ivar file_name: Full path to file.
+ @ivar file_size: Size of file in bytes.
+ @ivar block_size: Size of each block.
+ """
+ def __init__(self, file_name, mode, block_size, file_size=None):
+ """
+ Initialize block reader or writer on given file name.
+
+ If mode is 'r', the file must already exist and it is opened for
+ reading only. If mode is 'w', the file will be created with size
+ C{file_size} if it does not exist, and it is opened for reading
+ and writing.
+
+ Note that C{file_size} is ignored if the file already exists.
+ """
+ self.mode = mode
+ self.file_name = os.path.abspath(file_name)
+ assert self.mode in ['r', 'w']
+
+ if mode == 'r':
+ f = open(self.file_name, 'rb')
+ f.close()
+
+ # Create file if it doesn't exist.
+ created = False
+ if mode == 'w' and not os.path.exists(self.file_name):
+ created = True
+ buf = ' ' * 1024
+ f = open(self.file_name, 'wb')
+ for i in xrange(file_size // len(buf)):
+ f.write(buf)
+ f.write(' ' * (file_size % len(buf)))
+ f.close()
+
+ self.file_size = os.stat(self.file_name).st_size
+ if created:
+ assert self.file_size == file_size
+ self.block_size = block_size
+ self.__block_count = self.file_size // self.block_size
+ if self.file_size % self.block_size == 0:
+ self.last_block_size = self.block_size
+ else:
+ self.last_block_size = self.file_size % self.block_size
+ self.__block_count += 1
+
+ def __getitem__(self, i):
+ """
+ Get block i.
+ """
+ if i < 0 or i >= len(self):
+ raise IndexError('block index out of range: ' + repr(i))
+ f = open(self.file_name, 'rb')
+ try:
+ f.seek(i * self.block_size)
+ ans = f.read(self.block_size)
+ finally:
+ f.close()
+ return ans
+
+ def __setitem__(self, i, s):
+ """
+ Set block i.
+ """
+ if self.mode != 'w':
+ raise ValueError('file opened for reading only')
+ if i < 0 or i >= len(self):
+ raise IndexError('block index out of range: ' + repr(i))
+ if i < len(self) - 1:
+ if len(s) != self.block_size:
+ raise ValueError('length of value must equal block_size')
+ else:
+ if len(s) != self.last_block_size:
+ raise ValueError('length of value must equal last_block_size')
+ f = open(self.file_name, 'rb+')
+ try:
+ f.seek(i * self.block_size)
+ f.write(s)
+ finally:
+ f.close()
+
+ def __len__(self):
+ """
+ Get number of blocks.
+ """
+ return int(self.__block_count)
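+
+    # For example, a 100000-byte file with block_size 65536 has two blocks:
+    # block 0 is 65536 bytes, and the last block holds the remaining 34464
+    # bytes (last_block_size).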
+
+
+class MetaFile(CompleteBinaryTreeMixin):
+ """
+ A L{HashTree} stored on disk, with a timestamp.
+
+ The list of hashes can be accessed using subscripting and
+ C{__len__}, in the same manner as for L{HashTree}.
+
+ Note that the constructor takes the entire list associated with
+ the L{HashTree}, not just the bottom row of the tree.
+
+ @ivar meta_name: Full path to metafile.
+ """
+ def __init__(self, meta_name, mode, L=None):
+ """
+ Open an existing meta-file for reading or writing.
+
+ If C{mode} is 'r', the meta-file must already exist and it is
+ opened for reading only, and the list C{L} is ignored. If C{mode}
+ is 'w', the file will be created if it does not exist (from the
+ list of hashes given in C{L}), and it is opened for reading and
+ writing.
+ """
+ self.meta_name = os.path.abspath(meta_name)
+ self.mode = mode
+ assert self.mode in ['r', 'w']
+
+ # A timestamp is stored at index 0. The MetaFile instance
+ # offsets all indices passed to __getitem__, __setitem__ by
+ # this offset, and pretends it has length equal to
+ # self.sublength.
+ self.offset = 1
+
+ if self.mode == 'w':
+ suggested_length = len(hash('')) * (len(L)+self.offset)
+ else:
+ suggested_length = None
+
+ created = False
+ if self.mode == 'w' and not os.path.exists(self.meta_name):
+ created = True
+
+ self.block_file = BlockFile(self.meta_name, self.mode,
+ len(hash('')),
+ suggested_length)
+ self.sublength = len(self.block_file) - self.offset
+
+ if created:
+ for i in xrange(len(L)):
+ self.block_file[i + self.offset] = L[i]
+
+ def __getitem__(self, i):
+ if i < 0 or i >= self.sublength:
+ raise IndexError('bad meta-file block index')
+ return self.block_file[i + self.offset]
+
+ def __setitem__(self, i, value):
+ if i < 0 or i >= self.sublength:
+ raise IndexError('bad meta-file block index')
+ self.block_file[i + self.offset] = value
+
+ def __len__(self):
+ return self.sublength
+
+ def set_timestamp(self, file_name):
+ """
+ Set meta file's timestamp equal to the timestamp for C{file_name}.
+ """
+ st = os.stat(file_name)
+ timestamp = bencode.bencode((st.st_size, st.st_mtime))
+ self.block_file[0] = sha.new(timestamp).digest()
+
+ def check_timestamp(self, file_name):
+ """
+ True if meta file's timestamp equals timestamp for C{file_name}.
+ """
+ st = os.stat(file_name)
+ timestamp = bencode.bencode((st.st_size, st.st_mtime))
+ return self.block_file[0] == sha.new(timestamp).digest()
+
+
+class CompleteChunkFile(BlockFile):
+ """
+ Reads chunks from a fully-downloaded file.
+
+ A chunk C{i} is created from block C{i}. Block C{i} is unencoded
+ data read from the file by the L{BlockFile}. Chunk C{i} is
+ an encoded string created from block C{i}.
+
+    Chunks can be read using list subscripting. The total number of
+    chunks (equal to the total number of blocks) is given by L{__len__}.
+
+ @ivar file_name: Full path to file.
+ @ivar file_size: Size of file in bytes.
+ @ivar file_hash: Hash of file.
+ @ivar meta_name: Full path to metafile, or C{None}.
+ @ivar tree: L{HashTree} or L{MetaFile} instance for the file.
+ One can extract a hash from any node in the hash
+ tree.
+ """
+
+ def __init__(self, file_name, meta_name=None, callback=None):
+ """
+ Initialize reader on the given file name.
+
+ The entire file will be read and the hash will be computed from
+ the file. This may take a long time, so C{callback()} is called
+ frequently during this process. This allows you to reduce CPU
+ usage if you wish.
+
+ The C{meta_name} argument is optional. If it is specified, then the
+ hashes for C{file_name} will be stored under the file
+ C{meta_name}. If a C{CompleteChunkFile} is created on the same
+ file and metafile in the future, then the hashes will not need to
+ be recomputed and the constructor will return instantly. The
+    metafile records the file's size and modification time, so that if the
+    file stored in C{file_name} is modified, the hashes will be recomputed.
+ """
+ BlockFile.__init__(self, file_name, 'r', block_size=65536)
+
+ # Whether we need to compute the hash tree
+ compute_tree = False
+
+ self.meta_name = meta_name
+        if self.meta_name is not None:
+            self.meta_name = os.path.abspath(self.meta_name)
+        self.meta = None
+        if self.meta_name is None:
+ compute_tree = True
+ else:
+ try:
+ meta = MetaFile(self.meta_name, 'r')
+ assert meta.check_timestamp(self.file_name)
+ except (IOError, AssertionError):
+ compute_tree = True
+
+ # Compute the hash tree if needed.
+ if compute_tree:
+ chunk_hashes = [None] * len(self)
+ for i in xrange(len(self)):
+ triple = (self.file_size, i, BlockFile.__getitem__(self, i))
+ chunk_hashes[i] = hash(bencode.bencode(triple))
+ if callback:
+ callback()
+ self.tree = HashTree(chunk_hashes)
+ del chunk_hashes
+
+ # If a meta-file was given, make self.tree be a MetaFile instance.
+        if self.meta_name is not None:
+ if compute_tree:
+ # Did we compute the hash tree? Then store it to disk.
+ self.tree = MetaFile(self.meta_name, 'w', self.tree)
+ # Update its timestamp to be consistent with the file we
+ # just hashed.
+ self.tree.set_timestamp(self.file_name)
+ else:
+ # Read existing file from disk.
+ self.tree = MetaFile(self.meta_name, 'r')
+
+ self.file_hash = self.tree[0]
+
+ def __getitem__(self, i):
+ """
+ Get chunk C{i}.
+
+        Raises C{ValueError} if the file's contents changed since the
+        L{CompleteChunkFile} was instantiated.
+ """
+ return encode_chunk(BlockFile.__getitem__(self, i), i,
+ self.file_size, self.tree)
+
+
+def encode_chunk(block, index, file_size, tree):
+ """
+ Encode a chunk.
+
+ Given a block at index C{index} in a file with size C{file_size},
+ and a L{HashTree} or L{MetaFile} instance C{tree}, computes and
+ returns a chunk string for the given block.
+
+    The C{tree} argument needs to have correct hashes only at the nodes on
+    the path from the block's leaf to the root, and at their siblings. In
+    any case, if a hash is wrong an exception will be raised.
+ """
+ block_count = (len(tree) + 1) // 2
+ if index < 0 or index >= block_count:
+ raise IndexError('block index out of range: ' + repr(index))
+
+ suffix = bencode.bencode((file_size, index, block))
+ current = len(tree) - block_count + index
+ prefix = []
+ while current > 0:
+ sibling = tree.sibling(current)
+ prefix += [tree[current], tree[sibling]]
+ current = tree.parent(current)
+ prefix = ''.join(prefix)
+
+ # Encode the chunk
+ chunk = bencode.bencode((prefix, suffix))
+
+ # Check to make sure it decodes properly.
+ decode_chunk(chunk, file_size, tree)
+ return chunk
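+
+# A note on chunk overhead (illustrative): each iteration of the loop above
+# appends two 20-byte SHA-1 digests to the prefix, so a chunk carries about
+# 40 bytes per tree level on top of the 65536-byte block, which is where the
+# "less than 4%" figure in the module docstring comes from.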
+
+
+def decode_chunk(chunk, file_size, tree):
+ """
+ Decode a chunk.
+
+ Given file with size C{file_size} and a L{HashTree} or L{MetaFile}
+ instance C{tree}, return C{(index, block, tree_items)}. Here
+ C{index} is the block index where string C{block} should be placed
+ in the file. Also C{tree_items} is a dict mapping indices within
+ the L{HashTree} or L{MetaFile} tree object associated with the
+ given file to the corresponding hashes at those indices. These
+ have been verified against the file's hash, so it is known that
+ they are correct.
+
+ Raises C{ValueError} if chunk verification fails.
+ """
+ file_hash = tree[0]
+ block_count = (len(tree) + 1) // 2
+ try:
+ # Decode the chunk
+        try:
+            (prefix, suffix) = bencode.bdecode(chunk)
+        except (ValueError, TypeError):
+            # bdecode failure or wrong tuple shape; treated as corruption
+            raise AssertionError()
+
+ assert isinstance(prefix, str)
+ assert isinstance(suffix, str)
+
+ # Verify the suffix against the hashes in the prefix.
+ hash_len = len(hash(''))
+ L = [prefix[hash_len*i:hash_len*(i+1)] for i in range(len(prefix)//hash_len)]
+ L += [file_hash]
+ assert L[0] == hash(suffix)
+ branches = []
+ for i in range(0, len(L)-1, 2):
+ if hash(L[i] + L[i+1]) == L[i+2]:
+ branches += [0]
+ elif hash(L[i+1] + L[i]) == L[i+2]:
+ branches += [1]
+ else:
+ raise AssertionError()
+
+ # Decode the suffix
+        try:
+            (claim_file_size, claim_index, block) = bencode.bdecode(suffix)
+        except (ValueError, TypeError):
+            # bdecode failure or wrong tuple shape; treated as corruption
+            raise AssertionError()
+
+        assert isinstance(claim_file_size, (int, long))
+        assert isinstance(claim_index, (int, long))
+ assert isinstance(block, str)
+
+ assert file_size == claim_file_size
+
+ # Compute the index of the block, and check it.
+ found_index = sum([branches[i]*2**i for i in range(len(branches))])
+ assert found_index == claim_index
+
+ # Now fill in the tree_items dict.
+ tree_items = {}
+ current = (len(tree) - block_count) + found_index
+ i = 0
+ while current > 0 and i + 1 < len(L):
+ tree_items[current] = L[i]
+ # Next item is our sibling.
+ tree_items[tree.sibling(current)] = L[i+1]
+ i += 2
+ current = tree.parent(current)
+
+ return (found_index, block, tree_items)
+ except AssertionError:
+ raise ValueError('corrupt chunk')
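+
+# Note on the branch bits above: at each level the verifier determines
+# whether our node was hashed on the left or the right of its sibling. A
+# left child contributes bit 0 and a right child bit 1, so the bits read
+# from leaf to root spell the block index in binary (least-significant bit
+# first), which is exactly what found_index reconstructs.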
+
+
+class PartialChunkFile(BlockFile):
+ """
+ Reads and writes chunks to a partially downloaded file.
+
+ @ivar file_name: Full path to file.
+ @ivar file_size: Size of file in bytes.
+ @ivar file_hash: Hash of file.
+ @ivar meta_name: Full path to metafile.
+ @ivar tree: L{MetaFile} instance for the file.
+ The hashes in this hash tree are valid only for
+ nodes that we have been sent hashes for.
+ """
+ def __init__(self, file_name, meta_name, file_hash=None, file_size=None):
+ """
+ Initialize reader/writer for the given file name and metafile name.
+
+        If neither C{file_name} nor C{meta_name} exist, then both are
+ created. The C{file_hash} and C{file_size} arguments are used to
+ initialize the two files.
+
+        If both C{file_name} and C{meta_name} exist, then the hash and
+ file size arguments are ignored, and those values are instead read
+ from the files.
+
+ If one file exists and the other does not, an C{IOError} is raised.
+ """
+ self.meta_name = os.path.abspath(meta_name)
+ meta_exists = os.path.exists(self.meta_name)
+ file_exists = os.path.exists(os.path.abspath(file_name))
+
+ BlockFile.__init__(self, os.path.abspath(file_name), 'w',
+ BLOCK_SIZE, file_size)
+
+ if file_exists and not meta_exists:
+ raise IOError('metafile ' + repr(self.meta_name) +
+ ' missing for file ' + repr(self.file_name))
+ if meta_exists and not file_exists:
+ raise IOError('file ' + repr(self.file_name) +
+ ' missing for metafile ' + repr(self.meta_name))
+ tree_count = 2 * roundup_pow2(len(self)) - 1
+ self.tree = MetaFile(self.meta_name, 'w', [hash('')] * tree_count)
+
+ if not meta_exists and not file_exists:
+ self.tree[0] = file_hash
+
+ self.file_hash = self.tree[0]
+
+ def __getitem__(self, i):
+ """
+ Get chunk C{i}.
+
+ Raises C{ValueError} if chunk has not yet been downloaded or is
+ corrupted.
+ """
+ return encode_chunk(BlockFile.__getitem__(self, i), i,
+ self.file_size, self.tree)
+
+ def __setitem__(self, i, chunk):
+ """
+ Set chunk C{i}.
+
+ Raises C{ValueError} if the chunk is invalid.
+ """
+ (index, block, tree_items) = decode_chunk(chunk,
+ self.file_size, self.tree)
+ if index != i:
+ raise ValueError('incorrect index for chunk')
+ BlockFile.__setitem__(self, index, block)
+ for (tree_index, tree_value) in tree_items.items():
+ self.tree[tree_index] = tree_value
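+
+    # Illustrative sketch (using the src/dst objects from the module-level
+    # example above): a chunk that verifies against the file hash is still
+    # rejected if written at the wrong index.
+    #
+    #   dst[1] = src[1]    # verified, written, hash tree updated
+    #   dst[0] = src[1]    # raises ValueError ('incorrect index for chunk')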
+
+
+def test(filename1='temp-out', metaname1='temp-out.meta',
+ filename2='temp-out2', metaname2='temp-out2.meta'):
+ """
+ Unit tests.
+ """
+ print 'Testing:'
+
+ import random
+ ntests = 100
+ max_file_size = 200000
+
+ # Test CompleteChunkFile.
+
+ if os.path.exists(metaname1):
+ os.remove(metaname1)
+
+ for i in range(ntests):
+ fsize = random.randrange(max_file_size)
+ # Make some random string of size 'fsize' to go in the file.
+ s = ''.join([sha.new(str(j)).digest() for j in range(fsize//20+1)])
+ assert len(s) >= fsize
+ s = s[:fsize]
+ f = open(filename1, 'wb')
+ f.write(s)
+ f.close()
+ C = CompleteChunkFile(filename1)
+ for j in range(len(C)):
+ C[j]
+ C = CompleteChunkFile(filename1, metaname1)
+ for j in range(len(C)):
+ C[j]
+ C = CompleteChunkFile(filename1, metaname1)
+ for j in range(len(C)):
+ C[j]
+ os.remove(metaname1)
+
+ os.remove(filename1)
+
+ print ' CompleteChunkFile: OK'
+
+ # Test PartialChunkFile
+
+ for i in range(ntests):
+ fsize = random.randrange(max_file_size)
+ # Make some random string of size 'fsize' to go in the file.
+ s = ''.join([sha.new(str(j)).digest() for j in range(fsize//20+1)])
+ assert len(s) >= fsize
+ s = s[:fsize]
+ f = open(filename1, 'wb')
+ f.write(s)
+ f.close()
+ C1 = CompleteChunkFile(filename1)
+ if os.path.exists(filename2):
+ os.remove(filename2)
+
+ if os.path.exists(metaname2):
+ os.remove(metaname2)
+ C2 = PartialChunkFile(filename2, metaname2, C1.file_hash, C1.file_size)
+ assert len(C1) == len(C2)
+ assert C2.tree[0] == C1.tree[0]
+ for j in range(len(C2)):
+ try:
+ C2[j]
+ ok = False
+ except ValueError:
+ ok = True
+ if not ok:
+ raise AssertionError()
+ for j in range(len(C2)//2):
+ k = random.randrange(len(C2))
+ if len(C1) > 1:
+ assert C1[k] != C1[(k+1)%len(C1)]
+ try:
+ C2[k] = C1[(k+1)%len(C1)]
+ ok = False
+ except ValueError:
+ ok = True
+ if not ok:
+ raise AssertionError()
+ C2[k] = C1[k]
+ assert C2[k] == C1[k]
+ for j in range(len(C2)):
+ C2[j] = C1[j]
+ assert C2[j] == C1[j]
+
+ os.remove(filename1)
+ os.remove(filename2)
+ os.remove(metaname2)
+
+ print ' PartialChunkFile: OK'
+
+
+if __name__ == '__main__':
+ test()
--- /dev/null
+#! /usr/bin/python
+
+import math
+from twisted.internet import defer
+from allmydata.chunk import HashTree
+from Crypto.Cipher import AES
+from Crypto.Util import Counter
+import sha
+
+def hash(data):
+ return sha.new(data).digest()
+
+"""
+
+The goal of the encoder is to turn the original file into a series of
+'shares'. Each share is going to a 'shareholder' (nominally each shareholder
+is a different host, but for small meshes there may be overlap). The number
+of shares is chosen to hit our reliability goals (more shares on more
+machines means more reliability), and is limited by overhead (proportional to
+numshares or log(numshares)) and the encoding technology in use (Reed-Solomon
+only permits 256 shares total). It is also constrained by the amount of data
+we want to send to each host. For estimating purposes, think of 100 shares
+out of which we need 25 to reconstruct the file.
+
+The encoder starts by cutting the original file into segments. All segments
+except the last are of equal size. The segment size is chosen to constrain
+the memory footprint (which will probably vary between 1x and 4x segment
+size) and to constrain the overhead (which will be proportional to either the
+number of segments or log(number of segments)).
+
+
+Each segment (A,B,C) is read into memory, encrypted, and encoded into
+subshares. The 'share' (say, share #1) that makes it out to a host is a
+collection of these subshares (subshare A1, B1, C1), plus some hash-tree
+information necessary to validate the data upon retrieval. Only one segment
+is handled at a time: all subshares for segment A are delivered before any
+work is begun on segment B.
+
+As subshares are created, we retain the hash of each one. The list of
+subshare hashes for a single share (say, hash(A1), hash(B1), hash(C1)) is
+used to form the base of a Merkle hash tree for that share (hashtrees[1]).
+This hash tree has one terminal leaf per subshare. The complete subshare hash
+tree is sent to the shareholder after all the data has been sent. At
+retrieval time, the decoder will ask for specific pieces of this tree before
+asking for subshares, whichever it needs to validate those subshares.
+
+[TODO: we don't really need to generate this whole subshare hash tree
+ourselves. It would be sufficient to have the shareholder generate it and
+just tell us the root. This gives us an extra level of validation on the
+transfer, though, and it is relatively cheap to compute.]
+
+Each of these subshare hash trees has a root hash. The collection of these
+root hashes for all shares are collected into the 'share hash tree', which
+has one terminal leaf per share. After sending the subshares and the complete
+subshare hash tree to each shareholder, we send them the portion of the share
+hash tree that is necessary to validate their share. The root of the share
+hash tree is put into the URI.
+
+"""
+
+
+
+
+class Encoder(object):
+
+ def setup(self, infile):
+ self.infile = infile
+ infile.seek(0, 2)
+ self.file_size = infile.tell()
+ infile.seek(0, 0)
+ fsize = 1.0 * self.file_size
+ self.segment_size = 1024
+ self.num_segments = int(math.ceil(fsize / self.segment_size))
+
+ self.num_shares = 100
+ self.share_size = self.file_size / 25
+
+ def get_reservation_size(self):
+ self.num_shares = 100
+ self.share_size = self.file_size / 25
+ overhead = self.compute_overhead()
+ return self.share_size + overhead
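+
+    def compute_overhead(self):
+        # Placeholder (not in the original sketch, which leaves this method
+        # undefined): per-share hash-tree overhead. A fuller estimate would
+        # count the subshare hash tree (one 20-byte hash per node, roughly
+        # 2 * num_segments nodes) plus the share hash chain; returning 0
+        # keeps get_reservation_size runnable for now.
+        return 0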
+
+ def setup_encryption(self):
+ self.key = "\x00"*16
+        # PyCrypto's CTR mode takes a counter object, not a raw start value;
+        # a zero initial counter preserves the original "\x00"*16 intent.
+        self.cryptor = AES.new(key=self.key, mode=AES.MODE_CTR,
+                               counter=Counter.new(128, initial_value=0))
+ self.segment_num = 0
+        # [[]] * n would give n references to one shared list, so build a
+        # distinct accumulator per share.
+        self.subshare_hashes = [[] for _ in range(self.num_shares)]
+        # subshare_hashes[i] is a list that will be accumulated and then
+        # sent to landlord[i]. This list contains a hash of each
+        # segment_share that we sent to that landlord.
+ self.share_root_hashes = [None] * self.num_shares
+
+ def start(self):
+ self.setup_encryption()
+ d = defer.succeed(None)
+ for i in range(self.num_segments):
+            # bind i now; a bare closure would see only its final value
+            d.addCallback(lambda res, i=i: self.do_segment(i))
+ d.addCallback(lambda res: self.send_all_subshare_hash_trees())
+ d.addCallback(lambda res: self.send_all_share_hash_trees())
+ d.addCallback(lambda res: self.close_all_shareholders())
+ d.addCallback(lambda res: self.done())
+ return d
+
+ def encode_segment(self, crypttext):
+ shares = [crypttext] * self.num_shares
+ return shares
+
+ def do_segment(self, segnum):
+ segment_plaintext = self.infile.read(self.segment_size)
+ segment_crypttext = self.cryptor.encrypt(segment_plaintext)
+ del segment_plaintext
+ subshares_for_this_segment = self.encode_segment(segment_crypttext)
+ del segment_crypttext
+ dl = []
+ for share_num,subshare in enumerate(subshares_for_this_segment):
+ d = self.send_subshare(share_num, self.segment_num, subshare)
+ dl.append(d)
+ self.subshare_hashes[share_num].append(hash(subshare))
+ self.segment_num += 1
+ return defer.DeferredList(dl)
+
+ def send_subshare(self, share_num, segment_num, subshare):
+ #if False:
+ # offset = hash_size + segment_num * segment_size
+ # return self.send(share_num, "write", subshare, offset)
+ return self.send(share_num, "put_subshare", segment_num, subshare)
+
+ def send(self, share_num, methname, *args, **kwargs):
+ ll = self.landlords[share_num]
+ return ll.callRemote(methname, *args, **kwargs)
+
+ def send_all_subshare_hash_trees(self):
+ dl = []
+ for share_num,hashes in enumerate(self.subshare_hashes):
+ # hashes is a list of the hashes of all subshares that were sent
+ # to shareholder[share_num].
+ dl.append(self.send_one_subshare_hash_tree(share_num, hashes))
+ return defer.DeferredList(dl)
+
+ def send_one_subshare_hash_tree(self, share_num, subshare_hashes):
+ t = HashTree(subshare_hashes)
+ all_hashes = list(t)
+ # all_hashes[0] is the root hash, == hash(ah[1]+ah[2])
+ # all_hashes[1] is the left child, == hash(ah[3]+ah[4])
+ # all_hashes[n] == hash(all_hashes[2*n+1] + all_hashes[2*n+2])
+ self.share_root_hashes[share_num] = t[0]
+ ll = self.landlords[share_num]
+ if False:
+ block = "".join(all_hashes)
+ return ll.callRemote("write", block, offset=0)
+ return ll.callRemote("put_subshare_hashes", all_hashes)
+
+ def send_all_share_hash_trees(self):
+ dl = []
+ for h in self.share_root_hashes:
+ assert h
+ # create the share hash tree
+ t = HashTree(self.share_root_hashes)
+ # the root of this hash tree goes into our URI
+ self.root_hash = t[0]
+ # now send just the necessary pieces out to each shareholder
+        first_leaf = len(t) - (len(t) + 1) // 2  # index of leaf for share 0
+        for i in range(self.num_shares):
+            # needed() takes a node index; share i's leaf is first_leaf + i.
+            needed_hash_indices = t.needed(first_leaf + i)
+            # send (index, hash) pairs, matching put_share_hashes' schema
+            dl.append(self.send_one_share_hash_tree(
+                i, [(j, t[j]) for j in needed_hash_indices]))
+ return defer.DeferredList(dl)
+
+ def send_one_share_hash_tree(self, share_num, needed_hashes):
+ ll = self.landlords[share_num]
+ return ll.callRemote("put_share_hashes", needed_hashes)
+
+ def close_all_shareholders(self):
+ dl = []
+ for ll in self.landlords:
+ dl.append(ll.callRemote("close"))
+ return defer.DeferredList(dl)
+
+ def done(self):
+ return self.root_hash
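+
+# Illustrative driver (assumes a 'landlords' list of remote references
+# implementing RIStorageBucketWriter below, which this sketch does not
+# construct):
+#
+#   e = Encoder()
+#   e.setup(open('somefile', 'rb'))
+#   e.landlords = landlords     # one per share, indexed by share number
+#   d = e.start()               # a Deferred that fires with the root hash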
+
+
+from foolscap import RemoteInterface
+from foolscap.schema import ListOf, TupleOf, Nothing
+_None = Nothing()
+
+
+class RIStorageBucketWriter(RemoteInterface):
+ def put_subshare(segment_number=int, subshare=str):
+ return _None
+ def put_segment_hashes(all_hashes=ListOf(str)):
+ return _None
+ def put_share_hashes(needed_hashes=ListOf(TupleOf(int,str))):
+ return _None
+ #def write(data=str, offset=int):
+ # return _None
+class RIStorageBucketReader(RemoteInterface):
+ def get_share_hashes():
+ return ListOf(TupleOf(int,str))
+ def get_segment_hashes(which=ListOf(int)):
+ return ListOf(str)
+ def get_subshare(segment_number=int):
+ return str
+ #def read(size=int, offset=int):
+ # return str
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: MacRoman -*-
+"""
+A library for streaming and unstreaming of simple objects, designed
+for speed, compactness, and ease of implementation.
+
+The basic functions are bencode and bdecode. bencode takes an object
+and returns a string, bdecode takes a string and returns an object.
+bdecode raises a ValueError if you give it an invalid string.
+
+The objects passed in may be nested dicts, lists, ints, floats, strings,
+and Python boolean and None types. For example, all of the following
+may be bencoded -
+
+{'a': [0, 1], 'b': None}
+
+[None, ['a', 2, ['c', None]]]
+
+{'spam': (2,3,4)}
+
+{'name': 'Cronus', 'spouse': 'Rhea', 'children': ['Hades', 'Poseidon']}
+
+In general bdecode(bencode(spam)) == spam, but tuples and lists are
+encoded the same, so bdecode(bencode((0, 1))) is [0, 1] rather
+than (0, 1). Longs and ints are also encoded the same way, so
+bdecode(bencode(4)) is a long.
+
+Dict keys are required to be basestrings (byte strings or unicode objects),
+to avoid a mess of potential implementation incompatibilities. bencode is
+intended to be used for protocols which are going to be re-implemented many
+times, so it's very conservative in that regard.
+
+Which type is encoded is determined by the first character, 'i', 'n', 'f',
+'d', 'l', 'b', 'u', and any digit. They indicate integer, null, float,
+dict, list, boolean, unicode string, and string, respectively.
+
+Strings are length-prefixed in base 10, followed by a colon.
+
+bencode('spam') == '4:spam'
+
+Unicode string objects are indicated with an initial u, a base 10
+length-prefix, and the remaining bytes in utf-8 encoding.
+
+bencode(u'\u00bfHabla espa\u00f1ol?') == 'u17:\xc2\xbfHabla espa\xc3\xb1ol?'
+
+Nulls are indicated by a single 'n'.
+
+bencode(None) == 'n'
+
+Integers are encoded base 10 and terminated with an 'e' -
+
+bencode(3) == 'i3e'
+bencode(-20) == 'i-20e'
+
+Floats are encoded in base 10 and terminated with an 'e' -
+
+bencode(3.2) == 'f3.2e'
+bencode(-23.4532) == 'f-23.4532e'
+
+Lists are encoded in list order, terminated by an 'e' -
+
+bencode(['abc', 'd']) == 'l3:abc1:de'
+bencode([2, 'f']) == 'li2e1:fe'
+
+Dicts are encoded by containing alternating keys and values.
+The keys are encoded in sorted order, but sort order is not
+enforced on the decode. Dicts are terminated by an 'e'. Dict
+keys can be either bytestrings or unicode strings. For example -
+
+bencode({'spam': 'eggs'}) == 'd4:spam4:eggse'
+bencode({'ab': 2, 'a': None}) == 'd1:an2:abi2ee'
+bencode({'a': 1, u'\xa8': 2}) == 'd1:ai1eu2:\xc2\xa8i2ee'
+
+Truncated strings come first, so in sort order 'a' comes before 'abc'.
+"""
+
+# This file is licensed under the GNU Lesser General Public License v2.1.
+#
+# Originally written by Mojo Nation.
+# Rewritten by Bram Cohen.
+# Further enhanced by Allmydata to support additional Python types (Boolean,
+# None, Float, and Unicode strings).
+
+from types import IntType, LongType, FloatType, ListType, TupleType, DictType, StringType, UnicodeType, BooleanType, NoneType
+from cStringIO import StringIO
+import string
+
+def bencode(data):
+ """
+ encodes objects as strings, see module documentation for more info
+ """
+ result = StringIO()
+ bwrite(data, result)
+ return result.getvalue()
+
+def bwrite(data, result):
+    # a generic function using type dispatch (a la PJE's RuleDispatch)
+    # would be faster here
+ try:
+ encoder = encoders[type(data)]
+ except KeyError:
+ encoder = None
+ # Catch subclasses of built-in types
+ for t,coder in encoders.items():
+ if isinstance(data, t):
+ encoder = coder
+ break
+ if not encoder:
+ raise ValueError("unsupported data type: %s" % type(data))
+ encoder(data, result)
+
+encoders = {}
+
+def encode_int(data, result):
+ result.write('i' + str(data) + 'e')
+
+encoders[IntType] = encode_int
+encoders[LongType] = encode_int
+
+def encode_float(data, result):
+ result.write('f' + str(data) + 'e')
+
+encoders[FloatType] = encode_float
+
+def encode_bool(data, result):
+ if data:
+ result.write('b1')
+ else:
+ result.write('b0')
+
+encoders[BooleanType] = encode_bool
+
+def encode_list(data, result):
+ result.write('l')
+ _bwrite = bwrite
+ for item in data:
+ _bwrite(item, result)
+ result.write('e')
+
+encoders[TupleType] = encode_list
+encoders[ListType] = encode_list
+encoders[set] = encode_list
+
+def encode_string(data, result):
+ result.write(str(len(data)) + ':' + data)
+
+encoders[StringType] = encode_string
+
+def encode_unicode(data, result):
+ payload = data.encode('utf-8')
+ result.write('u' + str(len(payload)) + ':' + payload)
+
+encoders[UnicodeType] = encode_unicode
+
+def encode_dict(data, result):
+ result.write('d')
+ _bwrite = bwrite
+ keylist = data.keys()
+ keylist.sort()
+ for key in keylist:
+ _bwrite(key, result)
+ _bwrite(data[key], result)
+ result.write('e')
+
+encoders[DictType] = encode_dict
+
+encoders[NoneType] = lambda data, result: result.write('n')
+
+def bdecode(s):
+ """
+ Does the opposite of bencode. Raises a ValueError if there's a problem.
+ """
+ try:
+ result, index = bread(s, 0)
+ if index != len(s):
+ raise ValueError('left over stuff at end: %s' % s[index:])
+ return result
+ except IndexError, e:
+ raise ValueError(str(e))
+ except KeyError, e:
+ raise ValueError(str(e))
+
+def bread(s, index):
+ return decoders[s[index]](s, index)
+
+decoders = {}
+
+def decode_raw_string(s, index):
+ ci = s.index(":", index)
+ ei = ci + int(s[index:ci]) + 1
+ if ei > len(s):
+ raise ValueError('length encoding indicates premature end of string')
+ return (s[ci+1:ei], ei)
+
+for c in string.digits:
+ decoders[c] = decode_raw_string
+
+def decode_unicode_string(s, index):
+ ci = s.index(":", index)
+ ei = ci + int(s[index+1:ci]) + 1
+ if ei > len(s):
+ raise ValueError('length encoding indicates premature end of string')
+ return (unicode(s[ci+1:ei], 'utf-8'), ei)
+
+decoders['u'] = decode_unicode_string
+
+def decode_int(s, index):
+ ei = s.index('e', index)
+ return (long(s[index+1:ei]), ei+1)
+
+decoders['i'] = decode_int
+
+def decode_float(s, index):
+ ei = s.index('e', index)
+ return (float(s[index+1:ei]), ei+1)
+
+decoders['f'] = decode_float
+
+def decode_bool(s, index):
+ val = s[index+1]
+ if val == '1':
+ return True, index+2
+ elif val == '0':
+ return False, index+2
+ else:
+ raise ValueError('invalid boolean encoding: %s' % s[index:index+2])
+
+decoders['b'] = decode_bool
+
+# decoders['n'] = lambda s, index: decoders_n.inc('n') or (None, index + 1)
+decoders['n'] = lambda s, index: (None, index + 1)
+
+def decode_list(s, index):
+ # decoders_n.inc('l')
+ result = []
+ index += 1
+ _bread = bread
+ while s[index] != 'e':
+ next, index = _bread(s, index)
+ result.append(next)
+ return result, index + 1
+
+decoders['l'] = decode_list
+
+def decode_dict(s, index):
+ # decoders_n.inc('d')
+ result = {}
+ index += 1
+ _decode_string = decode_raw_string
+ _decode_unicode = decode_unicode_string
+ _bread = bread
+ while s[index] != 'e':
+ if s[index] in string.digits:
+ key, index = _decode_string(s, index)
+ elif s[index] == "u":
+ key, index = _decode_unicode(s, index)
+ else:
+ raise ValueError("dict key must be basestring")
+ if key in result:
+ raise ValueError("dict key was repeated")
+ value, index = _bread(s, index)
+ result[key] = value
+ return result, index + 1
+
+decoders['d'] = decode_dict
+
+def test_decode_raw_string():
+ assert decode_raw_string('1:a', 0) == ('a', 3)
+ assert decode_raw_string('0:', 0) == ('', 2)
+ assert decode_raw_string('10:aaaaaaaaaaaaaaaaaaaaaaaaa', 0) == ('aaaaaaaaaa', 13)
+ assert decode_raw_string('10:', 1) == ('', 3)
+# non-reexp version does not check for this case
+# try:
+# decode_raw_string('01:a', 0)
+# assert 0, 'failed'
+# except ValueError:
+# pass
+ try:
+ decode_raw_string('--1:a', 0)
+ assert 0, 'failed'
+ except ValueError:
+ pass
+ try:
+ decode_raw_string('h', 0)
+ assert 0, 'failed'
+ except ValueError:
+ pass
+ try:
+ decode_raw_string('h:', 0)
+ assert 0, 'failed'
+ except ValueError:
+ pass
+ try:
+ decode_raw_string('1', 0)
+ assert 0, 'failed'
+ except ValueError:
+ pass
+ try:
+ decode_raw_string('', 0)
+ assert 0, 'failed'
+ except ValueError:
+ pass
+ try:
+ decode_raw_string('5:a', 0)
+ assert 0, 'failed'
+ except ValueError:
+ pass
+
+def test_encode_and_decode_unicode_results_in_unicode_type():
+ assert bdecode(bencode(u'\u00bfHabla espa\u00f1ol?')) == u'\u00bfHabla espa\u00f1ol?'
+
+def test_encode_and_decode_unicode_at_least_preserves_the_content_even_if_it_flattens_the_type():
+ test_string = bdecode(bencode(u'\u00bfHabla espa\u00f1ol?'))
+ if isinstance(test_string, unicode):
+ assert test_string == u'\u00bfHabla espa\u00f1ol?'
+ elif isinstance(test_string, str):
+ assert test_string.decode('utf-8') == u'\u00bfHabla espa\u00f1ol?'
+ else:
+ assert 0, 'flunked'
+
+def test_dict_forbids_non_string_key():
+ try:
+ bdecode('di3ene')
+ assert 0, 'failed'
+ except ValueError:
+ pass
+
+def test_dict_forbids_key_repeat():
+ try:
+ bdecode('d1:an1:ane')
+ assert 0, 'failed'
+ except ValueError:
+ pass
+
+def test_empty_dict():
+ assert bdecode('de') == {}
+
+def test_dict_allows_unicode_keys():
+ assert bdecode(bencode({'a': 1, u'\xa8': 2})) == {'a': 1L, u'\xa8': 2L}
+
+def test_ValueError_in_decode_unknown():
+ try:
+ bdecode('x')
+ assert 0, 'flunked'
+ except ValueError:
+ pass
+
+def test_encode_and_decode_none():
+ assert bdecode(bencode(None)) == None
+
+def test_encode_and_decode_long():
+ assert bdecode(bencode(-23452422452342L)) == -23452422452342L
+
+def test_encode_and_decode_int():
+ assert bdecode(bencode(2)) == 2
+
+def test_encode_and_decode_float():
+ assert bdecode(bencode(3.4)) == 3.4
+ assert bdecode(bencode(0.0)) == 0.0
+ assert bdecode(bencode(-4.56)) == -4.56
+ assert bdecode(bencode(-0.0)) == -0.0
+
+def test_encode_and_decode_bool():
+ assert bdecode(bencode(True)) == True
+ assert bdecode(bencode(False)) == False
+
+# the non-regexp methods no longer check for canonical ints, but we
+# don't parse input we did not generate using bencode, so I will leave
+# these commented out for now
+#def test_decode_noncanonical_int():
+# try:
+# bdecode('i03e')
+# assert 0
+# except ValueError:
+# pass
+# try:
+# bdecode('i3 e')
+# assert 0
+# except ValueError:
+# pass
+# try:
+# bdecode('i 3e')
+# assert 0
+# except ValueError:
+# pass
+# try:
+# bdecode('i-0e')
+# assert 0
+# except ValueError:
+# pass
+
+def test_encode_and_decode_dict():
+ x = {'42': 3}
+ assert bdecode(bencode(x)) == x
+
+def test_encode_and_decode_list():
+ assert bdecode(bencode([])) == []
+
+def test_encode_and_decode_tuple():
+ assert bdecode(bencode(())) == []
+
+def test_encode_and_decode_empty_dict():
+ assert bdecode(bencode({})) == {}
+
+def test_encode_and_decode_complex_object():
+ spam = [[], 0, -3, -345234523543245234523L, {}, 'spam', None, {'a': [3]}, {}, {'a': 1L, u'\xa8': 2L}]
+ assert bencode(bdecode(bencode(spam))) == bencode(spam)
+ assert bdecode(bencode(spam)) == spam
+
+def test_unfinished_list():
+ try:
+ bdecode('ln')
+ assert 0
+ except ValueError:
+ pass
+
+def test_unfinished_dict():
+ try:
+ bdecode('d')
+ assert 0
+ except ValueError:
+ pass
+ try:
+ bdecode('d1:a')
+ assert 0
+ except ValueError:
+ pass
+
+def test_unsupported_type():
+ try:
+ bencode(lambda: None)
+ assert 0
+ except ValueError:
+ pass
\ No newline at end of file