import os
#import os.path
-from allmydata.util import bencode
+from allmydata.util.hashutil import tagged_hash, tagged_pair_hash
-__all__ = ['CompleteChunkFile', 'PartialChunkFile']
-
-__version__ = '1.0.0'
+__version__ = '1.0.0-allmydata'
BLOCK_SIZE = 65536
MAX_CHUNK_SIZE = BLOCK_SIZE + 4096
then this is used as the bottom row of the hash tree.
The augmenting is done so that if the augmented element is at
- index C{i}, then its value is C{hash(bencode.bencode((i, '')))}.
+ index C{i}, then its value is C{tagged_hash('Merkle tree empty leaf', '%d'%i)}.
"""
# Augment the list.
start = len(L)
end = roundup_pow2(len(L))
L = L + [None] * (end - start)
for i in range(start, end):
- L[i] = hash(bencode.bencode((i, '')))
+ L[i] = tagged_hash('Merkle tree empty leaf', "%d"%i)
# Form each row of the tree.
rows = [L]
while len(rows[-1]) != 1:
last = rows[-1]
- rows += [[hash(last[2*i] + last[2*i+1]) for i in xrange(len(last)//2)]]
+ rows += [[tagged_pair_hash('Merkle tree internal node', last[2*i], last[2*i+1]) for i in xrange(len(last)//2)]]
# Flatten the list of rows into a single list.
rows.reverse()
self[:] = sum(rows, [])
-
-class BlockFile:
- """
- Reads and writes blocks of data to a binary file.
-
- It is assumed that the binary file does not change in size.
-
- @ivar file_name: Full path to file.
- @ivar file_size: Size of file in bytes.
- @ivar block_size: Size of each block.
- """
- def __init__(self, file_name, mode, block_size, file_size=None):
- """
- Initialize block reader or writer on given file name.
-
- If mode is 'r', the file must already exist and it is opened for
- reading only. If mode is 'w', the file will be created with size
- C{file_size} if it does not exist, and it is opened for reading
- and writing.
-
- Note that C{file_size} is ignored if the file already exists.
- """
- self.mode = mode
- self.file_name = os.path.abspath(file_name)
- assert self.mode in ['r', 'w']
-
- if mode == 'r':
- f = open(self.file_name, 'rb')
- f.close()
-
- # Create file if it doesn't exist.
- created = False
- if mode == 'w' and not os.path.exists(self.file_name):
- created = True
- buf = ' ' * 1024
- f = open(self.file_name, 'wb')
- for i in xrange(file_size // len(buf)):
- f.write(buf)
- f.write(' ' * (file_size % len(buf)))
- f.close()
-
- self.file_size = os.stat(self.file_name).st_size
- if created:
- assert self.file_size == file_size
- self.block_size = block_size
- self.__block_count = self.file_size // self.block_size
- if self.file_size % self.block_size == 0:
- self.last_block_size = self.block_size
- else:
- self.last_block_size = self.file_size % self.block_size
- self.__block_count += 1
-
- def __getitem__(self, i):
- """
- Get block i.
- """
- if i < 0 or i >= len(self):
- raise IndexError('block index out of range: ' + repr(i))
- f = open(self.file_name, 'rb')
- try:
- f.seek(i * self.block_size)
- ans = f.read(self.block_size)
- finally:
- f.close()
- return ans
-
- def __setitem__(self, i, s):
- """
- Set block i.
- """
- if self.mode != 'w':
- raise ValueError('file opened for reading only')
- if i < 0 or i >= len(self):
- raise IndexError('block index out of range: ' + repr(i))
- if i < len(self) - 1:
- if len(s) != self.block_size:
- raise ValueError('length of value must equal block_size')
- else:
- if len(s) != self.last_block_size:
- raise ValueError('length of value must equal last_block_size')
- f = open(self.file_name, 'rb+')
- try:
- f.seek(i * self.block_size)
- f.write(s)
- finally:
- f.close()
-
- def __len__(self):
- """
- Get number of blocks.
- """
- return int(self.__block_count)
-
-
-class MetaFile(CompleteBinaryTreeMixin):
- """
- A L{HashTree} stored on disk, with a timestamp.
-
- The list of hashes can be accessed using subscripting and
- C{__len__}, in the same manner as for L{HashTree}.
-
- Note that the constructor takes the entire list associated with
- the L{HashTree}, not just the bottom row of the tree.
-
- @ivar meta_name: Full path to metafile.
- """
- def __init__(self, meta_name, mode, L=None):
- """
- Open an existing meta-file for reading or writing.
-
- If C{mode} is 'r', the meta-file must already exist and it is
- opened for reading only, and the list C{L} is ignored. If C{mode}
- is 'w', the file will be created if it does not exist (from the
- list of hashes given in C{L}), and it is opened for reading and
- writing.
- """
- self.meta_name = os.path.abspath(meta_name)
- self.mode = mode
- assert self.mode in ['r', 'w']
-
- # A timestamp is stored at index 0. The MetaFile instance
- # offsets all indices passed to __getitem__, __setitem__ by
- # this offset, and pretends it has length equal to
- # self.sublength.
- self.offset = 1
-
- if self.mode == 'w':
- suggested_length = len(hash('')) * (len(L)+self.offset)
- else:
- suggested_length = None
-
- created = False
- if self.mode == 'w' and not os.path.exists(self.meta_name):
- created = True
-
- self.block_file = BlockFile(self.meta_name, self.mode,
- len(hash('')),
- suggested_length)
- self.sublength = len(self.block_file) - self.offset
-
- if created:
- for i in xrange(len(L)):
- self.block_file[i + self.offset] = L[i]
-
- def __getitem__(self, i):
- if i < 0 or i >= self.sublength:
- raise IndexError('bad meta-file block index')
- return self.block_file[i + self.offset]
-
- def __setitem__(self, i, value):
- if i < 0 or i >= self.sublength:
- raise IndexError('bad meta-file block index')
- self.block_file[i + self.offset] = value
-
- def __len__(self):
- return self.sublength
-
- def set_timestamp(self, file_name):
- """
- Set meta file's timestamp equal to the timestamp for C{file_name}.
- """
- st = os.stat(file_name)
- timestamp = bencode.bencode((st.st_size, st.st_mtime))
- self.block_file[0] = sha.new(timestamp).digest()
-
- def check_timestamp(self, file_name):
- """
- True if meta file's timestamp equals timestamp for C{file_name}.
- """
- st = os.stat(file_name)
- timestamp = bencode.bencode((st.st_size, st.st_mtime))
- return self.block_file[0] == sha.new(timestamp).digest()
-
-
-class CompleteChunkFile(BlockFile):
- """
- Reads chunks from a fully-downloaded file.
-
- A chunk C{i} is created from block C{i}. Block C{i} is unencoded
- data read from the file by the L{BlockFile}. Chunk C{i} is
- an encoded string created from block C{i}.
-
- Chunks can be read using list subscripting. The total number of
- chunks (equals the total number of blocks) is given by L{__len__}.
-
- @ivar file_name: Full path to file.
- @ivar file_size: Size of file in bytes.
- @ivar file_hash: Hash of file.
- @ivar meta_name: Full path to metafile, or C{None}.
- @ivar tree: L{HashTree} or L{MetaFile} instance for the file.
- One can extract a hash from any node in the hash
- tree.
- """
-
- def __init__(self, file_name, meta_name=None, callback=None):
- """
- Initialize reader on the given file name.
-
- The entire file will be read and the hash will be computed from
- the file. This may take a long time, so C{callback()} is called
- frequently during this process. This allows you to reduce CPU
- usage if you wish.
-
- The C{meta_name} argument is optional. If it is specified, then the
- hashes for C{file_name} will be stored under the file
- C{meta_name}. If a C{CompleteChunkFile} is created on the same
- file and metafile in the future, then the hashes will not need to
- be recomputed and the constructor will return instantly. The
- metafile contains a file and date stamp, so that if the file stored
- in C{file_name} is modified, then the hashes will be recomputed.
- """
- BlockFile.__init__(self, file_name, 'r', block_size=65536)
-
- # Whether we need to compute the hash tree
- compute_tree = False
-
- self.meta_name = meta_name
- if self.meta_name != None:
- self.meta_name = os.path.abspath(self.meta_name)
- self.meta = None
- if self.meta_name == None:
- compute_tree = True
- else:
- try:
- meta = MetaFile(self.meta_name, 'r')
- assert meta.check_timestamp(self.file_name)
- except (IOError, AssertionError):
- compute_tree = True
-
- # Compute the hash tree if needed.
- if compute_tree:
- chunk_hashes = [None] * len(self)
- for i in xrange(len(self)):
- triple = (self.file_size, i, BlockFile.__getitem__(self, i))
- chunk_hashes[i] = hash(bencode.bencode(triple))
- if callback:
- callback()
- self.tree = HashTree(chunk_hashes)
- del chunk_hashes
-
- # If a meta-file was given, make self.tree be a MetaFile instance.
- if self.meta_name != None:
- if compute_tree:
- # Did we compute the hash tree? Then store it to disk.
- self.tree = MetaFile(self.meta_name, 'w', self.tree)
- # Update its timestamp to be consistent with the file we
- # just hashed.
- self.tree.set_timestamp(self.file_name)
- else:
- # Read existing file from disk.
- self.tree = MetaFile(self.meta_name, 'r')
-
- self.file_hash = self.tree[0]
-
- def __getitem__(self, i):
- """
- Get chunk C{i}.
-
- Raises C{ValueError} if the file's contents changed since the
- CompleteFileChunkReader was instantiated.
- """
- return encode_chunk(BlockFile.__getitem__(self, i), i,
- self.file_size, self.tree)
-
-
-def encode_chunk(block, index, file_size, tree):
- """
- Encode a chunk.
-
- Given a block at index C{index} in a file with size C{file_size},
- and a L{HashTree} or L{MetaFile} instance C{tree}, computes and
- returns a chunk string for the given block.
-
- The C{tree} argument needs to have correct hashes only at certain
- indices. Check out the code for details. In any case, if a hash
- is wrong an exception will be raised.
- """
- block_count = (len(tree) + 1) // 2
- if index < 0 or index >= block_count:
- raise IndexError('block index out of range: ' + repr(index))
-
- suffix = bencode.bencode((file_size, index, block))
- current = len(tree) - block_count + index
- prefix = []
- while current > 0:
- sibling = tree.sibling(current)
- prefix += [tree[current], tree[sibling]]
- current = tree.parent(current)
- prefix = ''.join(prefix)
-
- # Encode the chunk
- chunk = bencode.bencode((prefix, suffix))
-
- # Check to make sure it decodes properly.
- decode_chunk(chunk, file_size, tree)
- return chunk
-
-
-def decode_chunk(chunk, file_size, tree):
- """
- Decode a chunk.
-
- Given file with size C{file_size} and a L{HashTree} or L{MetaFile}
- instance C{tree}, return C{(index, block, tree_items)}. Here
- C{index} is the block index where string C{block} should be placed
- in the file. Also C{tree_items} is a dict mapping indices within
- the L{HashTree} or L{MetaFile} tree object associated with the
- given file to the corresponding hashes at those indices. These
- have been verified against the file's hash, so it is known that
- they are correct.
-
- Raises C{ValueError} if chunk verification fails.
- """
- file_hash = tree[0]
- block_count = (len(tree) + 1) // 2
- try:
- # Decode the chunk
- try:
- (prefix, suffix) = bencode.bdecode(chunk)
- except:
- raise AssertionError()
-
- assert isinstance(prefix, str)
- assert isinstance(suffix, str)
-
- # Verify the suffix against the hashes in the prefix.
- hash_len = len(hash(''))
- L = [prefix[hash_len*i:hash_len*(i+1)] for i in range(len(prefix)//hash_len)]
- L += [file_hash]
- assert L[0] == hash(suffix)
- branches = []
- for i in range(0, len(L)-1, 2):
- if hash(L[i] + L[i+1]) == L[i+2]:
- branches += [0]
- elif hash(L[i+1] + L[i]) == L[i+2]:
- branches += [1]
- else:
- raise AssertionError()
-
- # Decode the suffix
- try:
- (claim_file_size, claim_index, block) = bencode.bdecode(suffix)
- except:
- raise AssertionError()
-
- assert isinstance(claim_file_size, int) or isinstance(claim_file_size, long)
- assert isinstance(claim_index, int) or isinstance(claim_index, long)
- assert isinstance(block, str)
-
- assert file_size == claim_file_size
-
- # Compute the index of the block, and check it.
- found_index = sum([branches[i]*2**i for i in range(len(branches))])
- assert found_index == claim_index
-
- # Now fill in the tree_items dict.
- tree_items = {}
- current = (len(tree) - block_count) + found_index
- i = 0
- while current > 0 and i + 1 < len(L):
- tree_items[current] = L[i]
- # Next item is our sibling.
- tree_items[tree.sibling(current)] = L[i+1]
- i += 2
- current = tree.parent(current)
-
- return (found_index, block, tree_items)
- except AssertionError:
- raise ValueError('corrupt chunk')
-
-
-class PartialChunkFile(BlockFile):
- """
- Reads and writes chunks to a partially downloaded file.
-
- @ivar file_name: Full path to file.
- @ivar file_size: Size of file in bytes.
- @ivar file_hash: Hash of file.
- @ivar meta_name: Full path to metafile.
- @ivar tree: L{MetaFile} instance for the file.
- The hashes in this hash tree are valid only for
- nodes that we have been sent hashes for.
- """
- def __init__(self, file_name, meta_name, file_hash=None, file_size=None):
- """
- Initialize reader/writer for the given file name and metafile name.
-
- If neither C{file_name} nor C{meta_file} exist, then both are
- created. The C{file_hash} and C{file_size} arguments are used to
- initialize the two files.
-
- If both C{file_name} and C{meta_file} exist, then the hash and
- file size arguments are ignored, and those values are instead read
- from the files.
-
- If one file exists and the other does not, an C{IOError} is raised.
- """
- self.meta_name = os.path.abspath(meta_name)
- meta_exists = os.path.exists(self.meta_name)
- file_exists = os.path.exists(os.path.abspath(file_name))
-
- BlockFile.__init__(self, os.path.abspath(file_name), 'w',
- BLOCK_SIZE, file_size)
-
- if file_exists and not meta_exists:
- raise IOError('metafile ' + repr(self.meta_name) +
- ' missing for file ' + repr(self.file_name))
- if meta_exists and not file_exists:
- raise IOError('file ' + repr(self.file_name) +
- ' missing for metafile ' + repr(self.meta_name))
- tree_count = 2 * roundup_pow2(len(self)) - 1
- self.tree = MetaFile(self.meta_name, 'w', [hash('')] * tree_count)
-
- if not meta_exists and not file_exists:
- self.tree[0] = file_hash
-
- self.file_hash = self.tree[0]
-
- def __getitem__(self, i):
- """
- Get chunk C{i}.
-
- Raises C{ValueError} if chunk has not yet been downloaded or is
- corrupted.
- """
- return encode_chunk(BlockFile.__getitem__(self, i), i,
- self.file_size, self.tree)
-
- def __setitem__(self, i, chunk):
- """
- Set chunk C{i}.
-
- Raises C{ValueError} if the chunk is invalid.
- """
- (index, block, tree_items) = decode_chunk(chunk,
- self.file_size, self.tree)
- if index != i:
- raise ValueError('incorrect index for chunk')
- BlockFile.__setitem__(self, index, block)
- for (tree_index, tree_value) in tree_items.items():
- self.tree[tree_index] = tree_value
-
-
-def test(filename1='temp-out', metaname1='temp-out.meta',
- filename2='temp-out2', metaname2='temp-out2.meta'):
- """
- Unit tests.
- """
- print 'Testing:'
-
- import random
- ntests = 100
- max_file_size = 200000
-
- # Test CompleteChunkFile.
-
- if os.path.exists(metaname1):
- os.remove(metaname1)
-
- for i in range(ntests):
- fsize = random.randrange(max_file_size)
- # Make some random string of size 'fsize' to go in the file.
- s = ''.join([sha.new(str(j)).digest() for j in range(fsize//20+1)])
- assert len(s) >= fsize
- s = s[:fsize]
- f = open(filename1, 'wb')
- f.write(s)
- f.close()
- C = CompleteChunkFile(filename1)
- for j in range(len(C)):
- C[j]
- C = CompleteChunkFile(filename1, metaname1)
- for j in range(len(C)):
- C[j]
- C = CompleteChunkFile(filename1, metaname1)
- for j in range(len(C)):
- C[j]
- os.remove(metaname1)
-
- os.remove(filename1)
-
- print ' CompleteChunkFile: OK'
-
- # Test PartialChunkFile
-
- for i in range(ntests):
- fsize = random.randrange(max_file_size)
- # Make some random string of size 'fsize' to go in the file.
- s = ''.join([sha.new(str(j)).digest() for j in range(fsize//20+1)])
- assert len(s) >= fsize
- s = s[:fsize]
- f = open(filename1, 'wb')
- f.write(s)
- f.close()
- C1 = CompleteChunkFile(filename1)
- if os.path.exists(filename2):
- os.remove(filename2)
-
- if os.path.exists(metaname2):
- os.remove(metaname2)
- C2 = PartialChunkFile(filename2, metaname2, C1.file_hash, C1.file_size)
- assert len(C1) == len(C2)
- assert C2.tree[0] == C1.tree[0]
- for j in range(len(C2)):
- try:
- C2[j]
- ok = False
- except ValueError:
- ok = True
- if not ok:
- raise AssertionError()
- for j in range(len(C2)//2):
- k = random.randrange(len(C2))
- if len(C1) > 1:
- assert C1[k] != C1[(k+1)%len(C1)]
- try:
- C2[k] = C1[(k+1)%len(C1)]
- ok = False
- except ValueError:
- ok = True
- if not ok:
- raise AssertionError()
- C2[k] = C1[k]
- assert C2[k] == C1[k]
- for j in range(len(C2)):
- C2[j] = C1[j]
- assert C2[j] == C1[j]
-
- os.remove(filename1)
- os.remove(filename2)
- os.remove(metaname2)
-
- print ' PartialChunkFile: OK'
-
-
-if __name__ == '__main__':
- test()