From: Brian Warner Date: Wed, 16 Jul 2008 20:14:39 +0000 (-0700) Subject: move encode/upload/download/checker.py into a new immutable/ directory. No behavior... X-Git-Url: https://git.rkrishnan.org/?a=commitdiff_plain;h=7394607141e78127e895243fd90430f249679e87;p=tahoe-lafs%2Ftahoe-lafs.git move encode/upload/download/checker.py into a new immutable/ directory. No behavior changes expected. --- diff --git a/src/allmydata/checker.py b/src/allmydata/checker.py deleted file mode 100644 index 8e559c48..00000000 --- a/src/allmydata/checker.py +++ /dev/null @@ -1,204 +0,0 @@ - -""" -Given a StorageIndex, count how many shares we can find. - -This does no verification of the shares whatsoever. If the peer claims to -have the share, we believe them. -""" - -from zope.interface import implements -from twisted.internet import defer -from twisted.python import log -from allmydata.interfaces import IVerifierURI, ICheckerResults -from allmydata import download, storage -from allmydata.util import hashutil, base32 - -class Results: - implements(ICheckerResults) - - def __init__(self, storage_index): - # storage_index might be None for, say, LIT files - self.storage_index = storage_index - if storage_index is None: - self.storage_index_s = "" - else: - self.storage_index_s = base32.b2a(storage_index)[:6] - - def is_healthy(self): - return self.healthy - - def html_summary(self): - if self.healthy: - return "healthy" - return "NOT HEALTHY" - - def html(self): - s = "
\n" - s += "

Checker Results for Immutable SI=%s

\n" % self.storage_index_s - if self.healthy: - s += "

Healthy!

\n" - else: - s += "

Not Healthy!

\n" - s += "
\n" - return s - - -class SimpleCHKFileChecker: - """Return a list of (needed, total, found, sharemap), where sharemap maps - share number to a list of (binary) nodeids of the shareholders.""" - - def __init__(self, peer_getter, uri_to_check): - self.peer_getter = peer_getter - self.found_shares = set() - self.uri_to_check = IVerifierURI(uri_to_check) - self.sharemap = {} - - ''' - def check_synchronously(self, si): - # this is how we would write this class if we were using synchronous - # messages (or if we used promises). - found = set() - for (pmpeerid, peerid, connection) in self.peer_getter(storage_index): - buckets = connection.get_buckets(si) - found.update(buckets.keys()) - return len(found) - ''' - - def check(self): - d = self._get_all_shareholders(self.uri_to_check.storage_index) - d.addCallback(self._done) - return d - - def _get_all_shareholders(self, storage_index): - dl = [] - for (peerid, ss) in self.peer_getter("storage", storage_index): - d = ss.callRemote("get_buckets", storage_index) - d.addCallbacks(self._got_response, self._got_error, - callbackArgs=(peerid,)) - dl.append(d) - return defer.DeferredList(dl) - - def _got_response(self, buckets, peerid): - # buckets is a dict: maps shum to an rref of the server who holds it - self.found_shares.update(buckets.keys()) - for k in buckets: - if k not in self.sharemap: - self.sharemap[k] = [] - self.sharemap[k].append(peerid) - - def _got_error(self, f): - if f.check(KeyError): - pass - log.err(f) - pass - - def _done(self, res): - u = self.uri_to_check - r = Results(self.uri_to_check.storage_index) - r.healthy = bool(len(self.found_shares) >= u.needed_shares) - r.stuff = (u.needed_shares, u.total_shares, len(self.found_shares), - self.sharemap) - return r - -class VerifyingOutput: - def __init__(self, total_length, results): - self._crypttext_hasher = hashutil.crypttext_hasher() - self.length = 0 - self.total_length = total_length - self._segment_number = 0 - self._crypttext_hash_tree = None - 
self._opened = False - self._results = results - results.healthy = False - - def setup_hashtrees(self, plaintext_hashtree, crypttext_hashtree): - self._crypttext_hash_tree = crypttext_hashtree - - def write_segment(self, crypttext): - self.length += len(crypttext) - - self._crypttext_hasher.update(crypttext) - if self._crypttext_hash_tree: - ch = hashutil.crypttext_segment_hasher() - ch.update(crypttext) - crypttext_leaves = {self._segment_number: ch.digest()} - self._crypttext_hash_tree.set_hashes(leaves=crypttext_leaves) - - self._segment_number += 1 - - def close(self): - self.crypttext_hash = self._crypttext_hasher.digest() - - def finish(self): - self._results.healthy = True - return self._results - - -class SimpleCHKFileVerifier(download.FileDownloader): - # this reconstructs the crypttext, which verifies that at least 'k' of - # the shareholders are around and have valid data. It does not check the - # remaining shareholders, and it cannot verify the plaintext. - check_plaintext_hash = False - - def __init__(self, client, u): - self._client = client - - u = IVerifierURI(u) - self._storage_index = u.storage_index - self._uri_extension_hash = u.uri_extension_hash - self._total_shares = u.total_shares - self._size = u.size - self._num_needed_shares = u.needed_shares - - self._si_s = storage.si_b2a(self._storage_index) - self.init_logging() - - r = Results(self._storage_index) - self._output = VerifyingOutput(self._size, r) - self._paused = False - self._stopped = False - - self._results = None - self.active_buckets = {} # k: shnum, v: bucket - self._share_buckets = [] # list of (sharenum, bucket) tuples - self._share_vbuckets = {} # k: shnum, v: set of ValidatedBuckets - self._uri_extension_sources = [] - - self._uri_extension_data = None - - self._fetch_failures = {"uri_extension": 0, - "plaintext_hashroot": 0, - "plaintext_hashtree": 0, - "crypttext_hashroot": 0, - "crypttext_hashtree": 0, - } - - def init_logging(self): - self._log_prefix = prefix = 
storage.si_b2a(self._storage_index)[:5] - num = self._client.log("SimpleCHKFileVerifier(%s): starting" % prefix) - self._log_number = num - - def log(self, msg, parent=None): - if parent is None: - parent = self._log_number - return self._client.log("SimpleCHKFileVerifier(%s): %s" - % (self._log_prefix, msg), - parent=parent) - - - def start(self): - log.msg("starting download [%s]" % storage.si_b2a(self._storage_index)[:5]) - - # first step: who should we download from? - d = defer.maybeDeferred(self._get_all_shareholders) - d.addCallback(self._got_all_shareholders) - # now get the uri_extension block from somebody and validate it - d.addCallback(self._obtain_uri_extension) - d.addCallback(self._got_uri_extension) - d.addCallback(self._get_hashtrees) - d.addCallback(self._create_validated_buckets) - # once we know that, we can download blocks from everybody - d.addCallback(self._download_all_segments) - d.addCallback(self._done) - return d - diff --git a/src/allmydata/client.py b/src/allmydata/client.py index 4e1121a5..47f4fd20 100644 --- a/src/allmydata/client.py +++ b/src/allmydata/client.py @@ -12,13 +12,13 @@ from pycryptopp.publickey import rsa import allmydata from allmydata.storage import StorageServer -from allmydata.upload import Uploader -from allmydata.download import Downloader +from allmydata.immutable.upload import Uploader +from allmydata.immutable.download import Downloader +from allmydata.immutable.filenode import FileNode, LiteralFileNode from allmydata.offloaded import Helper from allmydata.control import ControlServer from allmydata.introducer.client import IntroducerClient from allmydata.util import hashutil, base32, testutil -from allmydata.filenode import FileNode, LiteralFileNode from allmydata.uri import LiteralFileURI from allmydata.dirnode import NewDirectoryNode from allmydata.mutable.node import MutableFileNode, MutableWatcher diff --git a/src/allmydata/control.py b/src/allmydata/control.py index 99a3139b..ce8590c9 100644 --- 
a/src/allmydata/control.py +++ b/src/allmydata/control.py @@ -6,7 +6,7 @@ from twisted.internet import defer from foolscap import Referenceable from allmydata.interfaces import RIControlClient from allmydata.util import testutil, fileutil, mathutil -from allmydata import upload, download +from allmydata.immutable import upload, download from twisted.python import log def get_memory_usage(): diff --git a/src/allmydata/download.py b/src/allmydata/download.py deleted file mode 100644 index a0bf0e19..00000000 --- a/src/allmydata/download.py +++ /dev/null @@ -1,1100 +0,0 @@ - -import os, random, weakref, itertools, time -from zope.interface import implements -from twisted.internet import defer -from twisted.internet.interfaces import IPushProducer, IConsumer -from twisted.application import service -from foolscap.eventual import eventually - -from allmydata.util import base32, mathutil, hashutil, log -from allmydata.util.assertutil import _assert -from allmydata import codec, hashtree, storage, uri -from allmydata.interfaces import IDownloadTarget, IDownloader, IFileURI, \ - IDownloadStatus, IDownloadResults -from allmydata.encode import NotEnoughSharesError -from pycryptopp.cipher.aes import AES - -class HaveAllPeersError(Exception): - # we use this to jump out of the loop - pass - -class BadURIExtensionHashValue(Exception): - pass -class BadPlaintextHashValue(Exception): - pass -class BadCrypttextHashValue(Exception): - pass - -class DownloadStopped(Exception): - pass - -class DownloadResults: - implements(IDownloadResults) - - def __init__(self): - self.servers_used = set() - self.server_problems = {} - self.servermap = {} - self.timings = {} - self.file_size = None - -class Output: - def __init__(self, downloadable, key, total_length, log_parent, - download_status): - self.downloadable = downloadable - self._decryptor = AES(key) - self._crypttext_hasher = hashutil.crypttext_hasher() - self._plaintext_hasher = hashutil.plaintext_hasher() - self.length = 0 - 
self.total_length = total_length - self._segment_number = 0 - self._plaintext_hash_tree = None - self._crypttext_hash_tree = None - self._opened = False - self._log_parent = log_parent - self._status = download_status - self._status.set_progress(0.0) - - def log(self, *args, **kwargs): - if "parent" not in kwargs: - kwargs["parent"] = self._log_parent - if "facility" not in kwargs: - kwargs["facility"] = "download.output" - return log.msg(*args, **kwargs) - - def setup_hashtrees(self, plaintext_hashtree, crypttext_hashtree): - self._plaintext_hash_tree = plaintext_hashtree - self._crypttext_hash_tree = crypttext_hashtree - - def write_segment(self, crypttext): - self.length += len(crypttext) - self._status.set_progress( float(self.length) / self.total_length ) - - # memory footprint: 'crypttext' is the only segment_size usage - # outstanding. While we decrypt it into 'plaintext', we hit - # 2*segment_size. - self._crypttext_hasher.update(crypttext) - if self._crypttext_hash_tree: - ch = hashutil.crypttext_segment_hasher() - ch.update(crypttext) - crypttext_leaves = {self._segment_number: ch.digest()} - self.log(format="crypttext leaf hash (%(bytes)sB) [%(segnum)d] is %(hash)s", - bytes=len(crypttext), - segnum=self._segment_number, hash=base32.b2a(ch.digest()), - level=log.NOISY) - self._crypttext_hash_tree.set_hashes(leaves=crypttext_leaves) - - plaintext = self._decryptor.process(crypttext) - del crypttext - - # now we're back down to 1*segment_size. - - self._plaintext_hasher.update(plaintext) - if self._plaintext_hash_tree: - ph = hashutil.plaintext_segment_hasher() - ph.update(plaintext) - plaintext_leaves = {self._segment_number: ph.digest()} - self.log(format="plaintext leaf hash (%(bytes)sB) [%(segnum)d] is %(hash)s", - bytes=len(plaintext), - segnum=self._segment_number, hash=base32.b2a(ph.digest()), - level=log.NOISY) - self._plaintext_hash_tree.set_hashes(leaves=plaintext_leaves) - - self._segment_number += 1 - # We're still at 1*segment_size. 
The Downloadable is responsible for - # any memory usage beyond this. - if not self._opened: - self._opened = True - self.downloadable.open(self.total_length) - self.downloadable.write(plaintext) - - def fail(self, why): - # this is really unusual, and deserves maximum forensics - if why.check(DownloadStopped): - # except DownloadStopped just means the consumer aborted the - # download, not so scary - self.log("download stopped", level=log.UNUSUAL) - else: - self.log("download failed!", failure=why, level=log.SCARY) - self.downloadable.fail(why) - - def close(self): - self.crypttext_hash = self._crypttext_hasher.digest() - self.plaintext_hash = self._plaintext_hasher.digest() - self.log("download finished, closing IDownloadable", level=log.NOISY) - self.downloadable.close() - - def finish(self): - return self.downloadable.finish() - -class ValidatedBucket: - """I am a front-end for a remote storage bucket, responsible for - retrieving and validating data from that bucket. - - My get_block() method is used by BlockDownloaders. - """ - - def __init__(self, sharenum, bucket, - share_hash_tree, roothash, - num_blocks): - self.sharenum = sharenum - self.bucket = bucket - self._share_hash = None # None means not validated yet - self.share_hash_tree = share_hash_tree - self._roothash = roothash - self.block_hash_tree = hashtree.IncompleteHashTree(num_blocks) - self.started = False - - def get_block(self, blocknum): - if not self.started: - d = self.bucket.start() - def _started(res): - self.started = True - return self.get_block(blocknum) - d.addCallback(_started) - return d - - # the first time we use this bucket, we need to fetch enough elements - # of the share hash tree to validate it from our share hash up to the - # hashroot. 
- if not self._share_hash: - d1 = self.bucket.get_share_hashes() - else: - d1 = defer.succeed([]) - - # we might need to grab some elements of our block hash tree, to - # validate the requested block up to the share hash - needed = self.block_hash_tree.needed_hashes(blocknum) - if needed: - # TODO: get fewer hashes, use get_block_hashes(needed) - d2 = self.bucket.get_block_hashes() - else: - d2 = defer.succeed([]) - - d3 = self.bucket.get_block(blocknum) - - d = defer.gatherResults([d1, d2, d3]) - d.addCallback(self._got_data, blocknum) - return d - - def _got_data(self, res, blocknum): - sharehashes, blockhashes, blockdata = res - blockhash = None # to make logging it safe - - try: - if not self._share_hash: - sh = dict(sharehashes) - sh[0] = self._roothash # always use our own root, from the URI - sht = self.share_hash_tree - if sht.get_leaf_index(self.sharenum) not in sh: - raise hashtree.NotEnoughHashesError - sht.set_hashes(sh) - self._share_hash = sht.get_leaf(self.sharenum) - - blockhash = hashutil.block_hash(blockdata) - #log.msg("checking block_hash(shareid=%d, blocknum=%d) len=%d " - # "%r .. %r: %s" % - # (self.sharenum, blocknum, len(blockdata), - # blockdata[:50], blockdata[-50:], base32.b2a(blockhash))) - - # we always validate the blockhash - bh = dict(enumerate(blockhashes)) - # replace blockhash root with validated value - bh[0] = self._share_hash - self.block_hash_tree.set_hashes(bh, {blocknum: blockhash}) - - except (hashtree.BadHashError, hashtree.NotEnoughHashesError): - # log.WEIRD: indicates undetected disk/network error, or more - # likely a programming error - log.msg("hash failure in block=%d, shnum=%d on %s" % - (blocknum, self.sharenum, self.bucket)) - if self._share_hash: - log.msg(""" failure occurred when checking the block_hash_tree. 
- This suggests that either the block data was bad, or that the - block hashes we received along with it were bad.""") - else: - log.msg(""" the failure probably occurred when checking the - share_hash_tree, which suggests that the share hashes we - received from the remote peer were bad.""") - log.msg(" have self._share_hash: %s" % bool(self._share_hash)) - log.msg(" block length: %d" % len(blockdata)) - log.msg(" block hash: %s" % base32.b2a_or_none(blockhash)) - if len(blockdata) < 100: - log.msg(" block data: %r" % (blockdata,)) - else: - log.msg(" block data start/end: %r .. %r" % - (blockdata[:50], blockdata[-50:])) - log.msg(" root hash: %s" % base32.b2a(self._roothash)) - log.msg(" share hash tree:\n" + self.share_hash_tree.dump()) - log.msg(" block hash tree:\n" + self.block_hash_tree.dump()) - lines = [] - for i,h in sorted(sharehashes): - lines.append("%3d: %s" % (i, base32.b2a_or_none(h))) - log.msg(" sharehashes:\n" + "\n".join(lines) + "\n") - lines = [] - for i,h in enumerate(blockhashes): - lines.append("%3d: %s" % (i, base32.b2a_or_none(h))) - log.msg(" blockhashes:\n" + "\n".join(lines) + "\n") - raise - - # If we made it here, the block is good. If the hash trees didn't - # like what they saw, they would have raised a BadHashError, causing - # our caller to see a Failure and thus ignore this block (as well as - # dropping this bucket). - return blockdata - - - -class BlockDownloader: - """I am responsible for downloading a single block (from a single bucket) - for a single segment. - - I am a child of the SegmentDownloader. 
- """ - - def __init__(self, vbucket, blocknum, parent, results): - self.vbucket = vbucket - self.blocknum = blocknum - self.parent = parent - self.results = results - self._log_number = self.parent.log("starting block %d" % blocknum) - - def log(self, msg, parent=None): - if parent is None: - parent = self._log_number - return self.parent.log(msg, parent=parent) - - def start(self, segnum): - lognum = self.log("get_block(segnum=%d)" % segnum) - started = time.time() - d = self.vbucket.get_block(segnum) - d.addCallbacks(self._hold_block, self._got_block_error, - callbackArgs=(started, lognum,), errbackArgs=(lognum,)) - return d - - def _hold_block(self, data, started, lognum): - if self.results: - elapsed = time.time() - started - peerid = self.vbucket.bucket.get_peerid() - if peerid not in self.results.timings["fetch_per_server"]: - self.results.timings["fetch_per_server"][peerid] = [] - self.results.timings["fetch_per_server"][peerid].append(elapsed) - self.log("got block", parent=lognum) - self.parent.hold_block(self.blocknum, data) - - def _got_block_error(self, f, lognum): - self.log("BlockDownloader[%d] got error: %s" % (self.blocknum, f), - parent=lognum) - if self.results: - peerid = self.vbucket.bucket.get_peerid() - self.results.server_problems[peerid] = str(f) - self.parent.bucket_failed(self.vbucket) - -class SegmentDownloader: - """I am responsible for downloading all the blocks for a single segment - of data. - - I am a child of the FileDownloader. 
- """ - - def __init__(self, parent, segmentnumber, needed_shares, results): - self.parent = parent - self.segmentnumber = segmentnumber - self.needed_blocks = needed_shares - self.blocks = {} # k: blocknum, v: data - self.results = results - self._log_number = self.parent.log("starting segment %d" % - segmentnumber) - - def log(self, msg, parent=None): - if parent is None: - parent = self._log_number - return self.parent.log(msg, parent=parent) - - def start(self): - return self._download() - - def _download(self): - d = self._try() - def _done(res): - if len(self.blocks) >= self.needed_blocks: - # we only need self.needed_blocks blocks - # we want to get the smallest blockids, because they are - # more likely to be fast "primary blocks" - blockids = sorted(self.blocks.keys())[:self.needed_blocks] - blocks = [] - for blocknum in blockids: - blocks.append(self.blocks[blocknum]) - return (blocks, blockids) - else: - return self._download() - d.addCallback(_done) - return d - - def _try(self): - # fill our set of active buckets, maybe raising NotEnoughSharesError - active_buckets = self.parent._activate_enough_buckets() - # Now we have enough buckets, in self.parent.active_buckets. - - # in test cases, bd.start might mutate active_buckets right away, so - # we need to put off calling start() until we've iterated all the way - # through it. 
- downloaders = [] - for blocknum, vbucket in active_buckets.iteritems(): - bd = BlockDownloader(vbucket, blocknum, self, self.results) - downloaders.append(bd) - if self.results: - self.results.servers_used.add(vbucket.bucket.get_peerid()) - l = [bd.start(self.segmentnumber) for bd in downloaders] - return defer.DeferredList(l, fireOnOneErrback=True) - - def hold_block(self, blocknum, data): - self.blocks[blocknum] = data - - def bucket_failed(self, vbucket): - self.parent.bucket_failed(vbucket) - -class DownloadStatus: - implements(IDownloadStatus) - statusid_counter = itertools.count(0) - - def __init__(self): - self.storage_index = None - self.size = None - self.helper = False - self.status = "Not started" - self.progress = 0.0 - self.paused = False - self.stopped = False - self.active = True - self.results = None - self.counter = self.statusid_counter.next() - self.started = time.time() - - def get_started(self): - return self.started - def get_storage_index(self): - return self.storage_index - def get_size(self): - return self.size - def using_helper(self): - return self.helper - def get_status(self): - status = self.status - if self.paused: - status += " (output paused)" - if self.stopped: - status += " (output stopped)" - return status - def get_progress(self): - return self.progress - def get_active(self): - return self.active - def get_results(self): - return self.results - def get_counter(self): - return self.counter - - def set_storage_index(self, si): - self.storage_index = si - def set_size(self, size): - self.size = size - def set_helper(self, helper): - self.helper = helper - def set_status(self, status): - self.status = status - def set_paused(self, paused): - self.paused = paused - def set_stopped(self, stopped): - self.stopped = stopped - def set_progress(self, value): - self.progress = value - def set_active(self, value): - self.active = value - def set_results(self, value): - self.results = value - -class FileDownloader: - 
implements(IPushProducer) - check_crypttext_hash = True - check_plaintext_hash = True - _status = None - - def __init__(self, client, u, downloadable): - self._client = client - - u = IFileURI(u) - self._storage_index = u.storage_index - self._uri_extension_hash = u.uri_extension_hash - self._total_shares = u.total_shares - self._size = u.size - self._num_needed_shares = u.needed_shares - - self._si_s = storage.si_b2a(self._storage_index) - self.init_logging() - - self._started = time.time() - self._status = s = DownloadStatus() - s.set_status("Starting") - s.set_storage_index(self._storage_index) - s.set_size(self._size) - s.set_helper(False) - s.set_active(True) - - self._results = DownloadResults() - s.set_results(self._results) - self._results.file_size = self._size - self._results.timings["servers_peer_selection"] = {} - self._results.timings["fetch_per_server"] = {} - self._results.timings["cumulative_fetch"] = 0.0 - self._results.timings["cumulative_decode"] = 0.0 - self._results.timings["cumulative_decrypt"] = 0.0 - self._results.timings["paused"] = 0.0 - - if IConsumer.providedBy(downloadable): - downloadable.registerProducer(self, True) - self._downloadable = downloadable - self._output = Output(downloadable, u.key, self._size, self._log_number, - self._status) - self._paused = False - self._stopped = False - - self.active_buckets = {} # k: shnum, v: bucket - self._share_buckets = [] # list of (sharenum, bucket) tuples - self._share_vbuckets = {} # k: shnum, v: set of ValidatedBuckets - self._uri_extension_sources = [] - - self._uri_extension_data = None - - self._fetch_failures = {"uri_extension": 0, - "plaintext_hashroot": 0, - "plaintext_hashtree": 0, - "crypttext_hashroot": 0, - "crypttext_hashtree": 0, - } - - def init_logging(self): - self._log_prefix = prefix = storage.si_b2a(self._storage_index)[:5] - num = self._client.log(format="FileDownloader(%(si)s): starting", - si=storage.si_b2a(self._storage_index)) - self._log_number = num - - def 
log(self, *args, **kwargs): - if "parent" not in kwargs: - kwargs["parent"] = self._log_number - if "facility" not in kwargs: - kwargs["facility"] = "tahoe.download" - return log.msg(*args, **kwargs) - - def pauseProducing(self): - if self._paused: - return - self._paused = defer.Deferred() - self._paused_at = time.time() - if self._status: - self._status.set_paused(True) - - def resumeProducing(self): - if self._paused: - paused_for = time.time() - self._paused_at - self._results.timings['paused'] += paused_for - p = self._paused - self._paused = None - eventually(p.callback, None) - if self._status: - self._status.set_paused(False) - - def stopProducing(self): - self.log("Download.stopProducing") - self._stopped = True - self.resumeProducing() - if self._status: - self._status.set_stopped(True) - self._status.set_active(False) - - def start(self): - self.log("starting download") - - # first step: who should we download from? - d = defer.maybeDeferred(self._get_all_shareholders) - d.addCallback(self._got_all_shareholders) - # now get the uri_extension block from somebody and validate it - d.addCallback(self._obtain_uri_extension) - d.addCallback(self._got_uri_extension) - d.addCallback(self._get_hashtrees) - d.addCallback(self._create_validated_buckets) - # once we know that, we can download blocks from everybody - d.addCallback(self._download_all_segments) - def _finished(res): - if self._status: - self._status.set_status("Finished") - self._status.set_active(False) - self._status.set_paused(False) - if IConsumer.providedBy(self._downloadable): - self._downloadable.unregisterProducer() - return res - d.addBoth(_finished) - def _failed(why): - if self._status: - self._status.set_status("Failed") - self._status.set_active(False) - self._output.fail(why) - return why - d.addErrback(_failed) - d.addCallback(self._done) - return d - - def _get_all_shareholders(self): - dl = [] - for (peerid,ss) in self._client.get_permuted_peers("storage", - self._storage_index): - d 
= ss.callRemote("get_buckets", self._storage_index) - d.addCallbacks(self._got_response, self._got_error, - callbackArgs=(peerid,)) - dl.append(d) - self._responses_received = 0 - self._queries_sent = len(dl) - if self._status: - self._status.set_status("Locating Shares (%d/%d)" % - (self._responses_received, - self._queries_sent)) - return defer.DeferredList(dl) - - def _got_response(self, buckets, peerid): - self._responses_received += 1 - if self._results: - elapsed = time.time() - self._started - self._results.timings["servers_peer_selection"][peerid] = elapsed - if self._status: - self._status.set_status("Locating Shares (%d/%d)" % - (self._responses_received, - self._queries_sent)) - for sharenum, bucket in buckets.iteritems(): - b = storage.ReadBucketProxy(bucket, peerid, self._si_s) - self.add_share_bucket(sharenum, b) - self._uri_extension_sources.append(b) - if self._results: - if peerid not in self._results.servermap: - self._results.servermap[peerid] = set() - self._results.servermap[peerid].add(sharenum) - - def add_share_bucket(self, sharenum, bucket): - # this is split out for the benefit of test_encode.py - self._share_buckets.append( (sharenum, bucket) ) - - def _got_error(self, f): - self._client.log("Somebody failed. -- %s" % (f,)) - - def bucket_failed(self, vbucket): - shnum = vbucket.sharenum - del self.active_buckets[shnum] - s = self._share_vbuckets[shnum] - # s is a set of ValidatedBucket instances - s.remove(vbucket) - # ... which might now be empty - if not s: - # there are no more buckets which can provide this share, so - # remove the key. This may prompt us to use a different share. 
- del self._share_vbuckets[shnum] - - def _got_all_shareholders(self, res): - if self._results: - now = time.time() - self._results.timings["peer_selection"] = now - self._started - - if len(self._share_buckets) < self._num_needed_shares: - raise NotEnoughSharesError - - #for s in self._share_vbuckets.values(): - # for vb in s: - # assert isinstance(vb, ValidatedBucket), \ - # "vb is %s but should be a ValidatedBucket" % (vb,) - - def _unpack_uri_extension_data(self, data): - return uri.unpack_extension(data) - - def _obtain_uri_extension(self, ignored): - # all shareholders are supposed to have a copy of uri_extension, and - # all are supposed to be identical. We compute the hash of the data - # that comes back, and compare it against the version in our URI. If - # they don't match, ignore their data and try someone else. - if self._status: - self._status.set_status("Obtaining URI Extension") - - self._uri_extension_fetch_started = time.time() - def _validate(proposal, bucket): - h = hashutil.uri_extension_hash(proposal) - if h != self._uri_extension_hash: - self._fetch_failures["uri_extension"] += 1 - msg = ("The copy of uri_extension we received from " - "%s was bad: wanted %s, got %s" % - (bucket, - base32.b2a(self._uri_extension_hash), - base32.b2a(h))) - self.log(msg, level=log.SCARY) - raise BadURIExtensionHashValue(msg) - return self._unpack_uri_extension_data(proposal) - return self._obtain_validated_thing(None, - self._uri_extension_sources, - "uri_extension", - "get_uri_extension", (), _validate) - - def _obtain_validated_thing(self, ignored, sources, name, methname, args, - validatorfunc): - if not sources: - raise NotEnoughSharesError("started with zero peers while fetching " - "%s" % name) - bucket = sources[0] - sources = sources[1:] - #d = bucket.callRemote(methname, *args) - d = bucket.startIfNecessary() - d.addCallback(lambda res: getattr(bucket, methname)(*args)) - d.addCallback(validatorfunc, bucket) - def _bad(f): - self.log("%s from vbucket %s 
failed:" % (name, bucket), - failure=f, level=log.WEIRD) - if not sources: - raise NotEnoughSharesError("ran out of peers, last error was %s" - % (f,)) - # try again with a different one - return self._obtain_validated_thing(None, sources, name, - methname, args, validatorfunc) - d.addErrback(_bad) - return d - - def _got_uri_extension(self, uri_extension_data): - if self._results: - elapsed = time.time() - self._uri_extension_fetch_started - self._results.timings["uri_extension"] = elapsed - - d = self._uri_extension_data = uri_extension_data - - self._codec = codec.get_decoder_by_name(d['codec_name']) - self._codec.set_serialized_params(d['codec_params']) - self._tail_codec = codec.get_decoder_by_name(d['codec_name']) - self._tail_codec.set_serialized_params(d['tail_codec_params']) - - crypttext_hash = d.get('crypttext_hash', None) # optional - if crypttext_hash: - assert isinstance(crypttext_hash, str) - assert len(crypttext_hash) == 32 - self._crypttext_hash = crypttext_hash - self._plaintext_hash = d.get('plaintext_hash', None) # optional - - self._roothash = d['share_root_hash'] - - self._segment_size = segment_size = d['segment_size'] - self._total_segments = mathutil.div_ceil(self._size, segment_size) - self._current_segnum = 0 - - self._share_hashtree = hashtree.IncompleteHashTree(d['total_shares']) - self._share_hashtree.set_hashes({0: self._roothash}) - - def _get_hashtrees(self, res): - self._get_hashtrees_started = time.time() - if self._status: - self._status.set_status("Retrieving Hash Trees") - d = defer.maybeDeferred(self._get_plaintext_hashtrees) - d.addCallback(self._get_crypttext_hashtrees) - d.addCallback(self._setup_hashtrees) - return d - - def _get_plaintext_hashtrees(self): - # plaintext hashes are optional. If the root isn't in the UEB, then - # the share will be holding an empty list. We don't even bother - # fetching it. 
- if "plaintext_root_hash" not in self._uri_extension_data: - self._plaintext_hashtree = None - return - def _validate_plaintext_hashtree(proposal, bucket): - if proposal[0] != self._uri_extension_data['plaintext_root_hash']: - self._fetch_failures["plaintext_hashroot"] += 1 - msg = ("The copy of the plaintext_root_hash we received from" - " %s was bad" % bucket) - raise BadPlaintextHashValue(msg) - pt_hashtree = hashtree.IncompleteHashTree(self._total_segments) - pt_hashes = dict(list(enumerate(proposal))) - try: - pt_hashtree.set_hashes(pt_hashes) - except hashtree.BadHashError: - # the hashes they gave us were not self-consistent, even - # though the root matched what we saw in the uri_extension - # block - self._fetch_failures["plaintext_hashtree"] += 1 - raise - self._plaintext_hashtree = pt_hashtree - d = self._obtain_validated_thing(None, - self._uri_extension_sources, - "plaintext_hashes", - "get_plaintext_hashes", (), - _validate_plaintext_hashtree) - return d - - def _get_crypttext_hashtrees(self, res): - # crypttext hashes are optional too - if "crypttext_root_hash" not in self._uri_extension_data: - self._crypttext_hashtree = None - return - def _validate_crypttext_hashtree(proposal, bucket): - if proposal[0] != self._uri_extension_data['crypttext_root_hash']: - self._fetch_failures["crypttext_hashroot"] += 1 - msg = ("The copy of the crypttext_root_hash we received from" - " %s was bad" % bucket) - raise BadCrypttextHashValue(msg) - ct_hashtree = hashtree.IncompleteHashTree(self._total_segments) - ct_hashes = dict(list(enumerate(proposal))) - try: - ct_hashtree.set_hashes(ct_hashes) - except hashtree.BadHashError: - self._fetch_failures["crypttext_hashtree"] += 1 - raise - ct_hashtree.set_hashes(ct_hashes) - self._crypttext_hashtree = ct_hashtree - d = self._obtain_validated_thing(None, - self._uri_extension_sources, - "crypttext_hashes", - "get_crypttext_hashes", (), - _validate_crypttext_hashtree) - return d - - def _setup_hashtrees(self, res): - 
self._output.setup_hashtrees(self._plaintext_hashtree, - self._crypttext_hashtree) - if self._results: - elapsed = time.time() - self._get_hashtrees_started - self._results.timings["hashtrees"] = elapsed - - def _create_validated_buckets(self, ignored=None): - self._share_vbuckets = {} - for sharenum, bucket in self._share_buckets: - vbucket = ValidatedBucket(sharenum, bucket, - self._share_hashtree, - self._roothash, - self._total_segments) - s = self._share_vbuckets.setdefault(sharenum, set()) - s.add(vbucket) - - def _activate_enough_buckets(self): - """either return a mapping from shnum to a ValidatedBucket that can - provide data for that share, or raise NotEnoughSharesError""" - - while len(self.active_buckets) < self._num_needed_shares: - # need some more - handled_shnums = set(self.active_buckets.keys()) - available_shnums = set(self._share_vbuckets.keys()) - potential_shnums = list(available_shnums - handled_shnums) - if not potential_shnums: - raise NotEnoughSharesError - # choose a random share - shnum = random.choice(potential_shnums) - # and a random bucket that will provide it - validated_bucket = random.choice(list(self._share_vbuckets[shnum])) - self.active_buckets[shnum] = validated_bucket - return self.active_buckets - - - def _download_all_segments(self, res): - # the promise: upon entry to this function, self._share_vbuckets - # contains enough buckets to complete the download, and some extra - # ones to tolerate some buckets dropping out or having errors. - # self._share_vbuckets is a dictionary that maps from shnum to a set - # of ValidatedBuckets, which themselves are wrappers around - # RIBucketReader references. 
- self.active_buckets = {} # k: shnum, v: ValidatedBucket instance - - self._started_fetching = time.time() - - d = defer.succeed(None) - for segnum in range(self._total_segments-1): - d.addCallback(self._download_segment, segnum) - # this pause, at the end of write, prevents pre-fetch from - # happening until the consumer is ready for more data. - d.addCallback(self._check_for_pause) - d.addCallback(self._download_tail_segment, self._total_segments-1) - return d - - def _check_for_pause(self, res): - if self._paused: - d = defer.Deferred() - self._paused.addCallback(lambda ignored: d.callback(res)) - return d - if self._stopped: - raise DownloadStopped("our Consumer called stopProducing()") - return res - - def _download_segment(self, res, segnum): - if self._status: - self._status.set_status("Downloading segment %d of %d" % - (segnum+1, self._total_segments)) - self.log("downloading seg#%d of %d (%d%%)" - % (segnum, self._total_segments, - 100.0 * segnum / self._total_segments)) - # memory footprint: when the SegmentDownloader finishes pulling down - # all shares, we have 1*segment_size of usage. - segmentdler = SegmentDownloader(self, segnum, self._num_needed_shares, - self._results) - started = time.time() - d = segmentdler.start() - def _finished_fetching(res): - elapsed = time.time() - started - self._results.timings["cumulative_fetch"] += elapsed - return res - if self._results: - d.addCallback(_finished_fetching) - # pause before using more memory - d.addCallback(self._check_for_pause) - # while the codec does its job, we hit 2*segment_size - def _started_decode(res): - self._started_decode = time.time() - return res - if self._results: - d.addCallback(_started_decode) - d.addCallback(lambda (shares, shareids): - self._codec.decode(shares, shareids)) - # once the codec is done, we drop back to 1*segment_size, because - # 'shares' goes out of scope. The memory usage is all in the - # plaintext now, spread out into a bunch of tiny buffers. 
- def _finished_decode(res): - elapsed = time.time() - self._started_decode - self._results.timings["cumulative_decode"] += elapsed - return res - if self._results: - d.addCallback(_finished_decode) - - # pause/check-for-stop just before writing, to honor stopProducing - d.addCallback(self._check_for_pause) - def _done(buffers): - # we start by joining all these buffers together into a single - # string. This makes Output.write easier, since it wants to hash - # data one segment at a time anyways, and doesn't impact our - # memory footprint since we're already peaking at 2*segment_size - # inside the codec a moment ago. - segment = "".join(buffers) - del buffers - # we're down to 1*segment_size right now, but write_segment() - # will decrypt a copy of the segment internally, which will push - # us up to 2*segment_size while it runs. - started_decrypt = time.time() - self._output.write_segment(segment) - if self._results: - elapsed = time.time() - started_decrypt - self._results.timings["cumulative_decrypt"] += elapsed - d.addCallback(_done) - return d - - def _download_tail_segment(self, res, segnum): - self.log("downloading seg#%d of %d (%d%%)" - % (segnum, self._total_segments, - 100.0 * segnum / self._total_segments)) - segmentdler = SegmentDownloader(self, segnum, self._num_needed_shares, - self._results) - started = time.time() - d = segmentdler.start() - def _finished_fetching(res): - elapsed = time.time() - started - self._results.timings["cumulative_fetch"] += elapsed - return res - if self._results: - d.addCallback(_finished_fetching) - # pause before using more memory - d.addCallback(self._check_for_pause) - def _started_decode(res): - self._started_decode = time.time() - return res - if self._results: - d.addCallback(_started_decode) - d.addCallback(lambda (shares, shareids): - self._tail_codec.decode(shares, shareids)) - def _finished_decode(res): - elapsed = time.time() - self._started_decode - self._results.timings["cumulative_decode"] += elapsed - 
return res - if self._results: - d.addCallback(_finished_decode) - # pause/check-for-stop just before writing, to honor stopProducing - d.addCallback(self._check_for_pause) - def _done(buffers): - # trim off any padding added by the upload side - segment = "".join(buffers) - del buffers - # we never send empty segments. If the data was an exact multiple - # of the segment size, the last segment will be full. - pad_size = mathutil.pad_size(self._size, self._segment_size) - tail_size = self._segment_size - pad_size - segment = segment[:tail_size] - started_decrypt = time.time() - self._output.write_segment(segment) - if self._results: - elapsed = time.time() - started_decrypt - self._results.timings["cumulative_decrypt"] += elapsed - d.addCallback(_done) - return d - - def _done(self, res): - self.log("download done") - if self._results: - now = time.time() - self._results.timings["total"] = now - self._started - self._results.timings["segments"] = now - self._started_fetching - self._output.close() - if self.check_crypttext_hash and self._crypttext_hash: - _assert(self._crypttext_hash == self._output.crypttext_hash, - "bad crypttext_hash: computed=%s, expected=%s" % - (base32.b2a(self._output.crypttext_hash), - base32.b2a(self._crypttext_hash))) - if self.check_plaintext_hash and self._plaintext_hash: - _assert(self._plaintext_hash == self._output.plaintext_hash, - "bad plaintext_hash: computed=%s, expected=%s" % - (base32.b2a(self._output.plaintext_hash), - base32.b2a(self._plaintext_hash))) - _assert(self._output.length == self._size, - got=self._output.length, expected=self._size) - return self._output.finish() - - def get_download_status(self): - return self._status - - -class LiteralDownloader: - def __init__(self, client, u, downloadable): - self._uri = IFileURI(u) - assert isinstance(self._uri, uri.LiteralFileURI) - self._downloadable = downloadable - self._status = s = DownloadStatus() - s.set_storage_index(None) - s.set_helper(False) - s.set_status("Done") 
- s.set_active(False) - s.set_progress(1.0) - - def start(self): - data = self._uri.data - self._status.set_size(len(data)) - self._downloadable.open(len(data)) - self._downloadable.write(data) - self._downloadable.close() - return defer.maybeDeferred(self._downloadable.finish) - - def get_download_status(self): - return self._status - -class FileName: - implements(IDownloadTarget) - def __init__(self, filename): - self._filename = filename - self.f = None - def open(self, size): - self.f = open(self._filename, "wb") - return self.f - def write(self, data): - self.f.write(data) - def close(self): - if self.f: - self.f.close() - def fail(self, why): - if self.f: - self.f.close() - os.unlink(self._filename) - def register_canceller(self, cb): - pass # we won't use it - def finish(self): - pass - -class Data: - implements(IDownloadTarget) - def __init__(self): - self._data = [] - def open(self, size): - pass - def write(self, data): - self._data.append(data) - def close(self): - self.data = "".join(self._data) - del self._data - def fail(self, why): - del self._data - def register_canceller(self, cb): - pass # we won't use it - def finish(self): - return self.data - -class FileHandle: - """Use me to download data to a pre-defined filehandle-like object. I - will use the target's write() method. I will *not* close the filehandle: - I leave that up to the originator of the filehandle. The download process - will return the filehandle when it completes. - """ - implements(IDownloadTarget) - def __init__(self, filehandle): - self._filehandle = filehandle - def open(self, size): - pass - def write(self, data): - self._filehandle.write(data) - def close(self): - # the originator of the filehandle reserves the right to close it - pass - def fail(self, why): - pass - def register_canceller(self, cb): - pass - def finish(self): - return self._filehandle - -class Downloader(service.MultiService): - """I am a service that allows file downloading. 
- """ - implements(IDownloader) - name = "downloader" - MAX_DOWNLOAD_STATUSES = 10 - - def __init__(self, stats_provider=None): - service.MultiService.__init__(self) - self.stats_provider = stats_provider - self._all_downloads = weakref.WeakKeyDictionary() # for debugging - self._all_download_statuses = weakref.WeakKeyDictionary() - self._recent_download_statuses = [] - - def download(self, u, t): - assert self.parent - assert self.running - u = IFileURI(u) - t = IDownloadTarget(t) - assert t.write - assert t.close - - - if isinstance(u, uri.LiteralFileURI): - dl = LiteralDownloader(self.parent, u, t) - elif isinstance(u, uri.CHKFileURI): - if self.stats_provider: - # these counters are meant for network traffic, and don't - # include LIT files - self.stats_provider.count('downloader.files_downloaded', 1) - self.stats_provider.count('downloader.bytes_downloaded', u.get_size()) - dl = FileDownloader(self.parent, u, t) - else: - raise RuntimeError("I don't know how to download a %s" % u) - self._add_download(dl) - d = dl.start() - return d - - # utility functions - def download_to_data(self, uri): - return self.download(uri, Data()) - def download_to_filename(self, uri, filename): - return self.download(uri, FileName(filename)) - def download_to_filehandle(self, uri, filehandle): - return self.download(uri, FileHandle(filehandle)) - - def _add_download(self, downloader): - self._all_downloads[downloader] = None - s = downloader.get_download_status() - self._all_download_statuses[s] = None - self._recent_download_statuses.append(s) - while len(self._recent_download_statuses) > self.MAX_DOWNLOAD_STATUSES: - self._recent_download_statuses.pop(0) - - def list_all_download_statuses(self): - for ds in self._all_download_statuses: - yield ds diff --git a/src/allmydata/encode.py b/src/allmydata/encode.py deleted file mode 100644 index 766292fe..00000000 --- a/src/allmydata/encode.py +++ /dev/null @@ -1,718 +0,0 @@ -# -*- test-case-name: allmydata.test.test_encode -*- - 
-import time -from zope.interface import implements -from twisted.internet import defer -from foolscap import eventual -from allmydata import storage, uri -from allmydata.hashtree import HashTree -from allmydata.util import mathutil, hashutil, base32, log -from allmydata.util.assertutil import _assert, precondition -from allmydata.codec import CRSEncoder -from allmydata.interfaces import IEncoder, IStorageBucketWriter, \ - IEncryptedUploadable, IUploadStatus - -""" -The goal of the encoder is to turn the original file into a series of -'shares'. Each share is going to a 'shareholder' (nominally each shareholder -is a different host, but for small grids there may be overlap). The number -of shares is chosen to hit our reliability goals (more shares on more -machines means more reliability), and is limited by overhead (proportional to -numshares or log(numshares)) and the encoding technology in use (zfec permits -only 256 shares total). It is also constrained by the amount of data -we want to send to each host. For estimating purposes, think of 10 shares -out of which we need 3 to reconstruct the file. - -The encoder starts by cutting the original file into segments. All segments -except the last are of equal size. The segment size is chosen to constrain -the memory footprint (which will probably vary between 1x and 4x segment -size) and to constrain the overhead (which will be proportional to -log(number of segments)). - - -Each segment (A,B,C) is read into memory, encrypted, and encoded into -blocks. The 'share' (say, share #1) that makes it out to a host is a -collection of these blocks (block A1, B1, C1), plus some hash-tree -information necessary to validate the data upon retrieval. Only one segment -is handled at a time: all blocks for segment A are delivered before any -work is begun on segment B. - -As blocks are created, we retain the hash of each one. 
The list of block hashes -for a single share (say, hash(A1), hash(B1), hash(C1)) is used to form the base -of a Merkle hash tree for that share, called the block hash tree. - -This hash tree has one terminal leaf per block. The complete block hash -tree is sent to the shareholder after all the data has been sent. At -retrieval time, before asking for blocks, the decoder will ask for the -specific pieces of this tree that it needs to validate those blocks. - -(Note: we don't really need to generate this whole block hash tree -ourselves. It would be sufficient to have the shareholder generate it and -just tell us the root. Generating it ourselves gives us an extra level of -validation on the transfer, though, and it is relatively cheap to compute.) - -Each of these block hash trees has a root hash. The root hashes for all -shares are collected into the 'share hash tree', which -has one terminal leaf per share. After sending the blocks and the complete -block hash tree to each shareholder, we send them the portion of the share -hash tree that is necessary to validate their share. The root of the share -hash tree is put into the URI. 
- -""" - -class NotEnoughSharesError(Exception): - servermap = None - pass - -class UploadAborted(Exception): - pass - -KiB=1024 -MiB=1024*KiB -GiB=1024*MiB -TiB=1024*GiB -PiB=1024*TiB - -class Encoder(object): - implements(IEncoder) - USE_PLAINTEXT_HASHES = False - - def __init__(self, log_parent=None, upload_status=None): - object.__init__(self) - self.uri_extension_data = {} - self._codec = None - self._status = None - if upload_status: - self._status = IUploadStatus(upload_status) - precondition(log_parent is None or isinstance(log_parent, int), - log_parent) - self._log_number = log.msg("creating Encoder %s" % self, - facility="tahoe.encoder", parent=log_parent) - self._aborted = False - - def __repr__(self): - if hasattr(self, "_storage_index"): - return "" % storage.si_b2a(self._storage_index)[:5] - return "" - - def log(self, *args, **kwargs): - if "parent" not in kwargs: - kwargs["parent"] = self._log_number - if "facility" not in kwargs: - kwargs["facility"] = "tahoe.encoder" - return log.msg(*args, **kwargs) - - def set_encrypted_uploadable(self, uploadable): - eu = self._uploadable = IEncryptedUploadable(uploadable) - d = eu.get_size() - def _got_size(size): - self.log(format="file size: %(size)d", size=size) - self.file_size = size - d.addCallback(_got_size) - d.addCallback(lambda res: eu.get_all_encoding_parameters()) - d.addCallback(self._got_all_encoding_parameters) - d.addCallback(lambda res: eu.get_storage_index()) - def _done(storage_index): - self._storage_index = storage_index - return self - d.addCallback(_done) - return d - - def _got_all_encoding_parameters(self, params): - assert not self._codec - k, happy, n, segsize = params - self.required_shares = k - self.shares_of_happiness = happy - self.num_shares = n - self.segment_size = segsize - self.log("got encoding parameters: %d/%d/%d %d" % (k,happy,n, segsize)) - self.log("now setting up codec") - - assert self.segment_size % self.required_shares == 0 - - self.num_segments = 
mathutil.div_ceil(self.file_size, - self.segment_size) - - self._codec = CRSEncoder() - self._codec.set_params(self.segment_size, - self.required_shares, self.num_shares) - - data = self.uri_extension_data - data['codec_name'] = self._codec.get_encoder_type() - data['codec_params'] = self._codec.get_serialized_params() - - data['size'] = self.file_size - data['segment_size'] = self.segment_size - self.share_size = mathutil.div_ceil(self.file_size, - self.required_shares) - data['num_segments'] = self.num_segments - data['needed_shares'] = self.required_shares - data['total_shares'] = self.num_shares - - # the "tail" is the last segment. This segment may or may not be - # shorter than all other segments. We use the "tail codec" to handle - # it. If the tail is short, we use a different codec instance. In - # addition, the tail codec must be fed data which has been padded out - # to the right size. - self.tail_size = self.file_size % self.segment_size - if not self.tail_size: - self.tail_size = self.segment_size - - # the tail codec is responsible for encoding tail_size bytes - padded_tail_size = mathutil.next_multiple(self.tail_size, - self.required_shares) - self._tail_codec = CRSEncoder() - self._tail_codec.set_params(padded_tail_size, - self.required_shares, self.num_shares) - data['tail_codec_params'] = self._tail_codec.get_serialized_params() - - def _get_share_size(self): - share_size = mathutil.div_ceil(self.file_size, self.required_shares) - overhead = self._compute_overhead() - return share_size + overhead - - def _compute_overhead(self): - return 0 - - def get_param(self, name): - assert self._codec - - if name == "storage_index": - return self._storage_index - elif name == "share_counts": - return (self.required_shares, self.shares_of_happiness, - self.num_shares) - elif name == "num_segments": - return self.num_segments - elif name == "segment_size": - return self.segment_size - elif name == "block_size": - return self._codec.get_block_size() - elif name 
== "share_size": - return self._get_share_size() - elif name == "serialized_params": - return self._codec.get_serialized_params() - else: - raise KeyError("unknown parameter name '%s'" % name) - - def set_shareholders(self, landlords): - assert isinstance(landlords, dict) - for k in landlords: - assert IStorageBucketWriter.providedBy(landlords[k]) - self.landlords = landlords.copy() - - def start(self): - self.log("%s starting" % (self,)) - #paddedsize = self._size + mathutil.pad_size(self._size, self.needed_shares) - assert self._codec - self._crypttext_hasher = hashutil.crypttext_hasher() - self._crypttext_hashes = [] - self.segment_num = 0 - self.subshare_hashes = [[] for x in range(self.num_shares)] - # subshare_hashes[i] is a list that will be accumulated and then send - # to landlord[i]. This list contains a hash of each segment_share - # that we sent to that landlord. - self.share_root_hashes = [None] * self.num_shares - - self._times = { - "cumulative_encoding": 0.0, - "cumulative_sending": 0.0, - "hashes_and_close": 0.0, - "total_encode_and_push": 0.0, - } - self._start_total_timestamp = time.time() - - d = eventual.fireEventually() - - d.addCallback(lambda res: self.start_all_shareholders()) - - for i in range(self.num_segments-1): - # note to self: this form doesn't work, because lambda only - # captures the slot, not the value - #d.addCallback(lambda res: self.do_segment(i)) - # use this form instead: - d.addCallback(lambda res, i=i: self._encode_segment(i)) - d.addCallback(self._send_segment, i) - d.addCallback(self._turn_barrier) - last_segnum = self.num_segments - 1 - d.addCallback(lambda res: self._encode_tail_segment(last_segnum)) - d.addCallback(self._send_segment, last_segnum) - d.addCallback(self._turn_barrier) - - d.addCallback(lambda res: self.finish_hashing()) - - if self.USE_PLAINTEXT_HASHES: - d.addCallback(lambda res: - self.send_plaintext_hash_tree_to_all_shareholders()) - d.addCallback(lambda res: - 
self.send_crypttext_hash_tree_to_all_shareholders()) - d.addCallback(lambda res: self.send_all_subshare_hash_trees()) - d.addCallback(lambda res: self.send_all_share_hash_trees()) - d.addCallback(lambda res: self.send_uri_extension_to_all_shareholders()) - - d.addCallback(lambda res: self.close_all_shareholders()) - d.addCallbacks(self.done, self.err) - return d - - def set_status(self, status): - if self._status: - self._status.set_status(status) - - def set_encode_and_push_progress(self, sent_segments=None, extra=0.0): - if self._status: - # we treat the final hash+close as an extra segment - if sent_segments is None: - sent_segments = self.num_segments - progress = float(sent_segments + extra) / (self.num_segments + 1) - self._status.set_progress(2, progress) - - def abort(self): - self.log("aborting upload", level=log.UNUSUAL) - assert self._codec, "don't call abort before start" - self._aborted = True - # the next segment read (in _gather_data inside _encode_segment) will - # raise UploadAborted(), which will bypass the rest of the upload - # chain. If we've sent the final segment's shares, it's too late to - # abort. TODO: allow abort any time up to close_all_shareholders. - - def _turn_barrier(self, res): - # putting this method in a Deferred chain imposes a guaranteed - # reactor turn between the pre- and post- portions of that chain. - # This can be useful to limit memory consumption: since Deferreds do - # not do tail recursion, code which uses defer.succeed(result) for - # consistency will cause objects to live for longer than you might - # normally expect. 
- - return eventual.fireEventually(res) - - - def start_all_shareholders(self): - self.log("starting shareholders", level=log.NOISY) - self.set_status("Starting shareholders") - dl = [] - for shareid in self.landlords: - d = self.landlords[shareid].start() - d.addErrback(self._remove_shareholder, shareid, "start") - dl.append(d) - return self._gather_responses(dl) - - def _encode_segment(self, segnum): - codec = self._codec - start = time.time() - - # the ICodecEncoder API wants to receive a total of self.segment_size - # bytes on each encode() call, broken up into a number of - # identically-sized pieces. Due to the way the codec algorithm works, - # these pieces need to be the same size as the share which the codec - # will generate. Therefore we must feed it with input_piece_size that - # equals the output share size. - input_piece_size = codec.get_block_size() - - # as a result, the number of input pieces per encode() call will be - # equal to the number of required shares with which the codec was - # constructed. You can think of the codec as chopping up a - # 'segment_size' of data into 'required_shares' shares (not doing any - # fancy math at all, just doing a split), then creating some number - # of additional shares which can be substituted if the primary ones - # are unavailable - - crypttext_segment_hasher = hashutil.crypttext_segment_hasher() - - # memory footprint: we only hold a tiny piece of the plaintext at any - # given time. We build up a segment's worth of cryptttext, then hand - # it to the encoder. Assuming 3-of-10 encoding (3.3x expansion) and - # 1MiB max_segment_size, we get a peak memory footprint of 4.3*1MiB = - # 4.3MiB. Lowering max_segment_size to, say, 100KiB would drop the - # footprint to 430KiB at the expense of more hash-tree overhead. 
- - d = self._gather_data(self.required_shares, input_piece_size, - crypttext_segment_hasher) - def _done_gathering(chunks): - for c in chunks: - assert len(c) == input_piece_size - self._crypttext_hashes.append(crypttext_segment_hasher.digest()) - # during this call, we hit 5*segsize memory - return codec.encode(chunks) - d.addCallback(_done_gathering) - def _done(res): - elapsed = time.time() - start - self._times["cumulative_encoding"] += elapsed - return res - d.addCallback(_done) - return d - - def _encode_tail_segment(self, segnum): - - start = time.time() - codec = self._tail_codec - input_piece_size = codec.get_block_size() - - crypttext_segment_hasher = hashutil.crypttext_segment_hasher() - - d = self._gather_data(self.required_shares, input_piece_size, - crypttext_segment_hasher, - allow_short=True) - def _done_gathering(chunks): - for c in chunks: - # a short trailing chunk will have been padded by - # _gather_data - assert len(c) == input_piece_size - self._crypttext_hashes.append(crypttext_segment_hasher.digest()) - return codec.encode(chunks) - d.addCallback(_done_gathering) - def _done(res): - elapsed = time.time() - start - self._times["cumulative_encoding"] += elapsed - return res - d.addCallback(_done) - return d - - def _gather_data(self, num_chunks, input_chunk_size, - crypttext_segment_hasher, - allow_short=False, - previous_chunks=[]): - """Return a Deferred that will fire when the required number of - chunks have been read (and hashed and encrypted). 
The Deferred fires - with the combination of any 'previous_chunks' and the new chunks - which were gathered.""" - - if self._aborted: - raise UploadAborted() - - if not num_chunks: - return defer.succeed(previous_chunks) - - d = self._uploadable.read_encrypted(input_chunk_size, False) - def _got(data): - if self._aborted: - raise UploadAborted() - encrypted_pieces = [] - length = 0 - while data: - encrypted_piece = data.pop(0) - length += len(encrypted_piece) - crypttext_segment_hasher.update(encrypted_piece) - self._crypttext_hasher.update(encrypted_piece) - encrypted_pieces.append(encrypted_piece) - - if allow_short: - if length < input_chunk_size: - # padding - pad_size = input_chunk_size - length - encrypted_pieces.append('\x00' * pad_size) - else: - # non-tail segments should be the full segment size - if length != input_chunk_size: - log.msg("non-tail segment should be full segment size: %d!=%d" - % (length, input_chunk_size), level=log.BAD) - precondition(length == input_chunk_size, - "length=%d != input_chunk_size=%d" % - (length, input_chunk_size)) - - encrypted_piece = "".join(encrypted_pieces) - return previous_chunks + [encrypted_piece] - - d.addCallback(_got) - d.addCallback(lambda chunks: - self._gather_data(num_chunks-1, input_chunk_size, - crypttext_segment_hasher, - allow_short, chunks)) - return d - - def _send_segment(self, (shares, shareids), segnum): - # To generate the URI, we must generate the roothash, so we must - # generate all shares, even if we aren't actually giving them to - # anybody. This means that the set of shares we create will be equal - # to or larger than the set of landlords. If we have any landlord who - # *doesn't* have a share, that's an error. 
- _assert(set(self.landlords.keys()).issubset(set(shareids)), - shareids=shareids, landlords=self.landlords) - start = time.time() - dl = [] - self.set_status("Sending segment %d of %d" % (segnum+1, - self.num_segments)) - self.set_encode_and_push_progress(segnum) - lognum = self.log("send_segment(%d)" % segnum, level=log.NOISY) - for i in range(len(shares)): - subshare = shares[i] - shareid = shareids[i] - d = self.send_subshare(shareid, segnum, subshare, lognum) - dl.append(d) - subshare_hash = hashutil.block_hash(subshare) - #from allmydata.util import base32 - #log.msg("creating block (shareid=%d, blocknum=%d) " - # "len=%d %r .. %r: %s" % - # (shareid, segnum, len(subshare), - # subshare[:50], subshare[-50:], base32.b2a(subshare_hash))) - self.subshare_hashes[shareid].append(subshare_hash) - - dl = self._gather_responses(dl) - def _logit(res): - self.log("%s uploaded %s / %s bytes (%d%%) of your file." % - (self, - self.segment_size*(segnum+1), - self.segment_size*self.num_segments, - 100 * (segnum+1) / self.num_segments, - ), - level=log.OPERATIONAL) - elapsed = time.time() - start - self._times["cumulative_sending"] += elapsed - return res - dl.addCallback(_logit) - return dl - - def send_subshare(self, shareid, segment_num, subshare, lognum): - if shareid not in self.landlords: - return defer.succeed(None) - sh = self.landlords[shareid] - lognum2 = self.log("put_block to %s" % self.landlords[shareid], - parent=lognum, level=log.NOISY) - d = sh.put_block(segment_num, subshare) - def _done(res): - self.log("put_block done", parent=lognum2, level=log.NOISY) - return res - d.addCallback(_done) - d.addErrback(self._remove_shareholder, shareid, - "segnum=%d" % segment_num) - return d - - def _remove_shareholder(self, why, shareid, where): - ln = self.log(format="error while sending %(method)s to shareholder=%(shnum)d", - method=where, shnum=shareid, - level=log.UNUSUAL, failure=why) - if shareid in self.landlords: - self.landlords[shareid].abort() - del 
self.landlords[shareid] - else: - # even more UNUSUAL - self.log("they weren't in our list of landlords", parent=ln, - level=log.WEIRD) - if len(self.landlords) < self.shares_of_happiness: - msg = "lost too many shareholders during upload: %s" % why - raise NotEnoughSharesError(msg) - self.log("but we can still continue with %s shares, we'll be happy " - "with at least %s" % (len(self.landlords), - self.shares_of_happiness), - parent=ln) - - def _gather_responses(self, dl): - d = defer.DeferredList(dl, fireOnOneErrback=True) - def _eatNotEnoughSharesError(f): - # all exceptions that occur while talking to a peer are handled - # in _remove_shareholder. That might raise NotEnoughSharesError, - # which will cause the DeferredList to errback but which should - # otherwise be consumed. Allow non-NotEnoughSharesError exceptions - # to pass through as an unhandled errback. We use this in lieu of - # consumeErrors=True to allow coding errors to be logged. - f.trap(NotEnoughSharesError) - return None - for d0 in dl: - d0.addErrback(_eatNotEnoughSharesError) - return d - - def finish_hashing(self): - self._start_hashing_and_close_timestamp = time.time() - self.set_status("Finishing hashes") - self.set_encode_and_push_progress(extra=0.0) - crypttext_hash = self._crypttext_hasher.digest() - self.uri_extension_data["crypttext_hash"] = crypttext_hash - d = self._uploadable.get_plaintext_hash() - def _got(plaintext_hash): - self.log(format="plaintext_hash=%(plaintext_hash)s, SI=%(SI)s, size=%(size)d", - plaintext_hash=base32.b2a(plaintext_hash), - SI=storage.si_b2a(self._storage_index), - size=self.file_size) - return plaintext_hash - d.addCallback(_got) - if self.USE_PLAINTEXT_HASHES: - def _use_plaintext_hash(plaintext_hash): - self.uri_extension_data["plaintext_hash"] = plaintext_hash - return self._uploadable.get_plaintext_hashtree_leaves(0, self.num_segments, self.num_segments) - d.addCallback(_use_plaintext_hash) - def _got_hashtree_leaves(leaves): - self.log("Encoder: got 
plaintext_hashtree_leaves: %s" % - (",".join([base32.b2a(h) for h in leaves]),), - level=log.NOISY) - ht = list(HashTree(list(leaves))) - self.uri_extension_data["plaintext_root_hash"] = ht[0] - self._plaintext_hashtree_nodes = ht - d.addCallback(_got_hashtree_leaves) - - d.addCallback(lambda res: self._uploadable.close()) - return d - - def send_plaintext_hash_tree_to_all_shareholders(self): - self.log("sending plaintext hash tree", level=log.NOISY) - self.set_status("Sending Plaintext Hash Tree") - self.set_encode_and_push_progress(extra=0.2) - dl = [] - for shareid in self.landlords.keys(): - d = self.send_plaintext_hash_tree(shareid, - self._plaintext_hashtree_nodes) - dl.append(d) - return self._gather_responses(dl) - - def send_plaintext_hash_tree(self, shareid, all_hashes): - if shareid not in self.landlords: - return defer.succeed(None) - sh = self.landlords[shareid] - d = sh.put_plaintext_hashes(all_hashes) - d.addErrback(self._remove_shareholder, shareid, "put_plaintext_hashes") - return d - - def send_crypttext_hash_tree_to_all_shareholders(self): - self.log("sending crypttext hash tree", level=log.NOISY) - self.set_status("Sending Crypttext Hash Tree") - self.set_encode_and_push_progress(extra=0.3) - t = HashTree(self._crypttext_hashes) - all_hashes = list(t) - self.uri_extension_data["crypttext_root_hash"] = t[0] - dl = [] - for shareid in self.landlords.keys(): - dl.append(self.send_crypttext_hash_tree(shareid, all_hashes)) - return self._gather_responses(dl) - - def send_crypttext_hash_tree(self, shareid, all_hashes): - if shareid not in self.landlords: - return defer.succeed(None) - sh = self.landlords[shareid] - d = sh.put_crypttext_hashes(all_hashes) - d.addErrback(self._remove_shareholder, shareid, "put_crypttext_hashes") - return d - - def send_all_subshare_hash_trees(self): - self.log("sending subshare hash trees", level=log.NOISY) - self.set_status("Sending Subshare Hash Trees") - self.set_encode_and_push_progress(extra=0.4) - dl = [] - for 
shareid,hashes in enumerate(self.subshare_hashes): - # hashes is a list of the hashes of all subshares that were sent - # to shareholder[shareid]. - dl.append(self.send_one_subshare_hash_tree(shareid, hashes)) - return self._gather_responses(dl) - - def send_one_subshare_hash_tree(self, shareid, subshare_hashes): - t = HashTree(subshare_hashes) - all_hashes = list(t) - # all_hashes[0] is the root hash, == hash(ah[1]+ah[2]) - # all_hashes[1] is the left child, == hash(ah[3]+ah[4]) - # all_hashes[n] == hash(all_hashes[2*n+1] + all_hashes[2*n+2]) - self.share_root_hashes[shareid] = t[0] - if shareid not in self.landlords: - return defer.succeed(None) - sh = self.landlords[shareid] - d = sh.put_block_hashes(all_hashes) - d.addErrback(self._remove_shareholder, shareid, "put_block_hashes") - return d - - def send_all_share_hash_trees(self): - # each bucket gets a set of share hash tree nodes that are needed to - # validate their share. This includes the share hash itself, but does - # not include the top-level hash root (which is stored securely in - # the URI instead). - self.log("sending all share hash trees", level=log.NOISY) - self.set_status("Sending Share Hash Trees") - self.set_encode_and_push_progress(extra=0.6) - dl = [] - for h in self.share_root_hashes: - assert h - # create the share hash tree - t = HashTree(self.share_root_hashes) - # the root of this hash tree goes into our URI - self.uri_extension_data['share_root_hash'] = t[0] - # now send just the necessary pieces out to each shareholder - for i in range(self.num_shares): - # the HashTree is given a list of leaves: 0,1,2,3..n . - # These become nodes A+0,A+1,A+2.. 
of the tree, where A=n-1 - needed_hash_indices = t.needed_hashes(i, include_leaf=True) - hashes = [(hi, t[hi]) for hi in needed_hash_indices] - dl.append(self.send_one_share_hash_tree(i, hashes)) - return self._gather_responses(dl) - - def send_one_share_hash_tree(self, shareid, needed_hashes): - if shareid not in self.landlords: - return defer.succeed(None) - sh = self.landlords[shareid] - d = sh.put_share_hashes(needed_hashes) - d.addErrback(self._remove_shareholder, shareid, "put_share_hashes") - return d - - def send_uri_extension_to_all_shareholders(self): - lp = self.log("sending uri_extension", level=log.NOISY) - self.set_status("Sending URI Extensions") - self.set_encode_and_push_progress(extra=0.8) - for k in ('crypttext_root_hash', 'crypttext_hash', - ): - assert k in self.uri_extension_data - if self.USE_PLAINTEXT_HASHES: - for k in ('plaintext_root_hash', 'plaintext_hash', - ): - assert k in self.uri_extension_data - uri_extension = uri.pack_extension(self.uri_extension_data) - ed = {} - for k,v in self.uri_extension_data.items(): - if k.endswith("hash"): - ed[k] = base32.b2a(v) - else: - ed[k] = v - self.log("uri_extension_data is %s" % (ed,), level=log.NOISY, parent=lp) - self.uri_extension_hash = hashutil.uri_extension_hash(uri_extension) - dl = [] - for shareid in self.landlords.keys(): - dl.append(self.send_uri_extension(shareid, uri_extension)) - return self._gather_responses(dl) - - def send_uri_extension(self, shareid, uri_extension): - sh = self.landlords[shareid] - d = sh.put_uri_extension(uri_extension) - d.addErrback(self._remove_shareholder, shareid, "put_uri_extension") - return d - - def close_all_shareholders(self): - self.log("closing shareholders", level=log.NOISY) - self.set_status("Closing Shareholders") - self.set_encode_and_push_progress(extra=0.9) - dl = [] - for shareid in self.landlords: - d = self.landlords[shareid].close() - d.addErrback(self._remove_shareholder, shareid, "close") - dl.append(d) - return 
self._gather_responses(dl) - - def done(self, res): - self.log("upload done", level=log.OPERATIONAL) - self.set_status("Done") - self.set_encode_and_push_progress(extra=1.0) # done - now = time.time() - h_and_c_elapsed = now - self._start_hashing_and_close_timestamp - self._times["hashes_and_close"] = h_and_c_elapsed - total_elapsed = now - self._start_total_timestamp - self._times["total_encode_and_push"] = total_elapsed - - # update our sharemap - self._shares_placed = set(self.landlords.keys()) - return (self.uri_extension_hash, self.required_shares, - self.num_shares, self.file_size) - - def err(self, f): - self.log("upload failed", failure=f, level=log.UNUSUAL) - self.set_status("Failed") - # we need to abort any remaining shareholders, so they'll delete the - # partial share, allowing someone else to upload it again. - self.log("aborting shareholders", level=log.UNUSUAL) - for shareid in list(self.landlords.keys()): - self.landlords[shareid].abort() - if f.check(defer.FirstError): - return f.value.subFailure - return f - - def get_shares_placed(self): - # return a set of share numbers that were successfully placed. 
- return self._shares_placed - - def get_times(self): - # return a dictionary of encode+push timings - return self._times - - def get_uri_extension_data(self): - return self.uri_extension_data diff --git a/src/allmydata/filenode.py b/src/allmydata/filenode.py deleted file mode 100644 index 2d0f2a36..00000000 --- a/src/allmydata/filenode.py +++ /dev/null @@ -1,118 +0,0 @@ - -from zope.interface import implements -from twisted.internet import defer -from allmydata.interfaces import IFileNode, IFileURI, IURI, ICheckable -from allmydata import uri -from allmydata.checker import SimpleCHKFileChecker, SimpleCHKFileVerifier, \ - Results - -class FileNode: - implements(IFileNode, ICheckable) - - def __init__(self, uri, client): - u = IFileURI(uri) - self.uri = u.to_string() - self._client = client - - def get_uri(self): - return self.uri - - def is_mutable(self): - return False - - def is_readonly(self): - return True - - def get_readonly_uri(self): - return self.uri - - def get_size(self): - return IFileURI(self.uri).get_size() - - def __hash__(self): - return hash((self.__class__, self.uri)) - def __cmp__(self, them): - if cmp(type(self), type(them)): - return cmp(type(self), type(them)) - if cmp(self.__class__, them.__class__): - return cmp(self.__class__, them.__class__) - return cmp(self.uri, them.uri) - - def get_verifier(self): - return IFileURI(self.uri).get_verifier() - - def check(self, verify=False, repair=False): - assert repair is False # not implemented yet - vcap = self.get_verifier() - if verify: - v = SimpleCHKFileVerifier(self._client, vcap) - return v.start() - else: - peer_getter = self._client.get_permuted_peers - v = SimpleCHKFileChecker(peer_getter, vcap) - return v.check() - - def download(self, target): - downloader = self._client.getServiceNamed("downloader") - return downloader.download(self.uri, target) - - def download_to_data(self): - downloader = self._client.getServiceNamed("downloader") - return downloader.download_to_data(self.uri) - - - 
-class LiteralFileNode: - implements(IFileNode, ICheckable) - - def __init__(self, my_uri, client): - u = IFileURI(my_uri) - assert isinstance(u, uri.LiteralFileURI) - self.uri = u.to_string() - self._client = client - - def get_uri(self): - return self.uri - - def is_mutable(self): - return False - - def is_readonly(self): - return True - - def get_readonly_uri(self): - return self.uri - - def get_size(self): - return len(IURI(self.uri).data) - - def __hash__(self): - return hash((self.__class__, self.uri)) - def __cmp__(self, them): - if cmp(type(self), type(them)): - return cmp(type(self), type(them)) - if cmp(self.__class__, them.__class__): - return cmp(self.__class__, them.__class__) - return cmp(self.uri, them.uri) - - def get_verifier(self): - return None - - def check(self, verify=False, repair=False): - # neither verify= nor repair= affect LIT files - r = Results(None) - r.healthy = True - r.problems = [] - return defer.succeed(r) - - def download(self, target): - # note that this does not update the stats_provider - data = IURI(self.uri).data - target.open(len(data)) - target.write(data) - target.close() - return defer.maybeDeferred(target.finish) - - def download_to_data(self): - data = IURI(self.uri).data - return defer.succeed(data) diff --git a/src/allmydata/immutable/__init__.py b/src/allmydata/immutable/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/allmydata/immutable/checker.py b/src/allmydata/immutable/checker.py new file mode 100644 index 00000000..c49aeb28 --- /dev/null +++ b/src/allmydata/immutable/checker.py @@ -0,0 +1,205 @@ + +""" +Given a StorageIndex, count how many shares we can find. + +This does no verification of the shares whatsoever. If the peer claims to +have the share, we believe them. 
+""" + +from zope.interface import implements +from twisted.internet import defer +from twisted.python import log +from allmydata import storage +from allmydata.interfaces import IVerifierURI, ICheckerResults +from allmydata.immutable import download +from allmydata.util import hashutil, base32 + +class Results: + implements(ICheckerResults) + + def __init__(self, storage_index): + # storage_index might be None for, say, LIT files + self.storage_index = storage_index + if storage_index is None: + self.storage_index_s = "" + else: + self.storage_index_s = base32.b2a(storage_index)[:6] + + def is_healthy(self): + return self.healthy + + def html_summary(self): + if self.healthy: + return "healthy" + return "NOT HEALTHY" + + def html(self): + s = "
\n" + s += "

Checker Results for Immutable SI=%s

\n" % self.storage_index_s + if self.healthy: + s += "

Healthy!

\n" + else: + s += "

Not Healthy!

\n" + s += "
\n" + return s + + +class SimpleCHKFileChecker: + """Return a list of (needed, total, found, sharemap), where sharemap maps + share number to a list of (binary) nodeids of the shareholders.""" + + def __init__(self, peer_getter, uri_to_check): + self.peer_getter = peer_getter + self.found_shares = set() + self.uri_to_check = IVerifierURI(uri_to_check) + self.sharemap = {} + + ''' + def check_synchronously(self, si): + # this is how we would write this class if we were using synchronous + # messages (or if we used promises). + found = set() + for (pmpeerid, peerid, connection) in self.peer_getter(storage_index): + buckets = connection.get_buckets(si) + found.update(buckets.keys()) + return len(found) + ''' + + def check(self): + d = self._get_all_shareholders(self.uri_to_check.storage_index) + d.addCallback(self._done) + return d + + def _get_all_shareholders(self, storage_index): + dl = [] + for (peerid, ss) in self.peer_getter("storage", storage_index): + d = ss.callRemote("get_buckets", storage_index) + d.addCallbacks(self._got_response, self._got_error, + callbackArgs=(peerid,)) + dl.append(d) + return defer.DeferredList(dl) + + def _got_response(self, buckets, peerid): + # buckets is a dict: maps shum to an rref of the server who holds it + self.found_shares.update(buckets.keys()) + for k in buckets: + if k not in self.sharemap: + self.sharemap[k] = [] + self.sharemap[k].append(peerid) + + def _got_error(self, f): + if f.check(KeyError): + pass + log.err(f) + pass + + def _done(self, res): + u = self.uri_to_check + r = Results(self.uri_to_check.storage_index) + r.healthy = bool(len(self.found_shares) >= u.needed_shares) + r.stuff = (u.needed_shares, u.total_shares, len(self.found_shares), + self.sharemap) + return r + +class VerifyingOutput: + def __init__(self, total_length, results): + self._crypttext_hasher = hashutil.crypttext_hasher() + self.length = 0 + self.total_length = total_length + self._segment_number = 0 + self._crypttext_hash_tree = None + 
self._opened = False + self._results = results + results.healthy = False + + def setup_hashtrees(self, plaintext_hashtree, crypttext_hashtree): + self._crypttext_hash_tree = crypttext_hashtree + + def write_segment(self, crypttext): + self.length += len(crypttext) + + self._crypttext_hasher.update(crypttext) + if self._crypttext_hash_tree: + ch = hashutil.crypttext_segment_hasher() + ch.update(crypttext) + crypttext_leaves = {self._segment_number: ch.digest()} + self._crypttext_hash_tree.set_hashes(leaves=crypttext_leaves) + + self._segment_number += 1 + + def close(self): + self.crypttext_hash = self._crypttext_hasher.digest() + + def finish(self): + self._results.healthy = True + return self._results + + +class SimpleCHKFileVerifier(download.FileDownloader): + # this reconstructs the crypttext, which verifies that at least 'k' of + # the shareholders are around and have valid data. It does not check the + # remaining shareholders, and it cannot verify the plaintext. + check_plaintext_hash = False + + def __init__(self, client, u): + self._client = client + + u = IVerifierURI(u) + self._storage_index = u.storage_index + self._uri_extension_hash = u.uri_extension_hash + self._total_shares = u.total_shares + self._size = u.size + self._num_needed_shares = u.needed_shares + + self._si_s = storage.si_b2a(self._storage_index) + self.init_logging() + + r = Results(self._storage_index) + self._output = VerifyingOutput(self._size, r) + self._paused = False + self._stopped = False + + self._results = None + self.active_buckets = {} # k: shnum, v: bucket + self._share_buckets = [] # list of (sharenum, bucket) tuples + self._share_vbuckets = {} # k: shnum, v: set of ValidatedBuckets + self._uri_extension_sources = [] + + self._uri_extension_data = None + + self._fetch_failures = {"uri_extension": 0, + "plaintext_hashroot": 0, + "plaintext_hashtree": 0, + "crypttext_hashroot": 0, + "crypttext_hashtree": 0, + } + + def init_logging(self): + self._log_prefix = prefix = 
storage.si_b2a(self._storage_index)[:5] + num = self._client.log("SimpleCHKFileVerifier(%s): starting" % prefix) + self._log_number = num + + def log(self, msg, parent=None): + if parent is None: + parent = self._log_number + return self._client.log("SimpleCHKFileVerifier(%s): %s" + % (self._log_prefix, msg), + parent=parent) + + + def start(self): + log.msg("starting download [%s]" % storage.si_b2a(self._storage_index)[:5]) + + # first step: who should we download from? + d = defer.maybeDeferred(self._get_all_shareholders) + d.addCallback(self._got_all_shareholders) + # now get the uri_extension block from somebody and validate it + d.addCallback(self._obtain_uri_extension) + d.addCallback(self._got_uri_extension) + d.addCallback(self._get_hashtrees) + d.addCallback(self._create_validated_buckets) + # once we know that, we can download blocks from everybody + d.addCallback(self._download_all_segments) + d.addCallback(self._done) + return d + diff --git a/src/allmydata/immutable/download.py b/src/allmydata/immutable/download.py new file mode 100644 index 00000000..983575ed --- /dev/null +++ b/src/allmydata/immutable/download.py @@ -0,0 +1,1100 @@ + +import os, random, weakref, itertools, time +from zope.interface import implements +from twisted.internet import defer +from twisted.internet.interfaces import IPushProducer, IConsumer +from twisted.application import service +from foolscap.eventual import eventually + +from allmydata.util import base32, mathutil, hashutil, log +from allmydata.util.assertutil import _assert +from allmydata import codec, hashtree, storage, uri +from allmydata.interfaces import IDownloadTarget, IDownloader, IFileURI, \ + IDownloadStatus, IDownloadResults +from allmydata.immutable.encode import NotEnoughSharesError +from pycryptopp.cipher.aes import AES + +class HaveAllPeersError(Exception): + # we use this to jump out of the loop + pass + +class BadURIExtensionHashValue(Exception): + pass +class BadPlaintextHashValue(Exception): + pass 
+class BadCrypttextHashValue(Exception): + pass + +class DownloadStopped(Exception): + pass + +class DownloadResults: + implements(IDownloadResults) + + def __init__(self): + self.servers_used = set() + self.server_problems = {} + self.servermap = {} + self.timings = {} + self.file_size = None + +class Output: + def __init__(self, downloadable, key, total_length, log_parent, + download_status): + self.downloadable = downloadable + self._decryptor = AES(key) + self._crypttext_hasher = hashutil.crypttext_hasher() + self._plaintext_hasher = hashutil.plaintext_hasher() + self.length = 0 + self.total_length = total_length + self._segment_number = 0 + self._plaintext_hash_tree = None + self._crypttext_hash_tree = None + self._opened = False + self._log_parent = log_parent + self._status = download_status + self._status.set_progress(0.0) + + def log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_parent + if "facility" not in kwargs: + kwargs["facility"] = "download.output" + return log.msg(*args, **kwargs) + + def setup_hashtrees(self, plaintext_hashtree, crypttext_hashtree): + self._plaintext_hash_tree = plaintext_hashtree + self._crypttext_hash_tree = crypttext_hashtree + + def write_segment(self, crypttext): + self.length += len(crypttext) + self._status.set_progress( float(self.length) / self.total_length ) + + # memory footprint: 'crypttext' is the only segment_size usage + # outstanding. While we decrypt it into 'plaintext', we hit + # 2*segment_size. 
+ self._crypttext_hasher.update(crypttext) + if self._crypttext_hash_tree: + ch = hashutil.crypttext_segment_hasher() + ch.update(crypttext) + crypttext_leaves = {self._segment_number: ch.digest()} + self.log(format="crypttext leaf hash (%(bytes)sB) [%(segnum)d] is %(hash)s", + bytes=len(crypttext), + segnum=self._segment_number, hash=base32.b2a(ch.digest()), + level=log.NOISY) + self._crypttext_hash_tree.set_hashes(leaves=crypttext_leaves) + + plaintext = self._decryptor.process(crypttext) + del crypttext + + # now we're back down to 1*segment_size. + + self._plaintext_hasher.update(plaintext) + if self._plaintext_hash_tree: + ph = hashutil.plaintext_segment_hasher() + ph.update(plaintext) + plaintext_leaves = {self._segment_number: ph.digest()} + self.log(format="plaintext leaf hash (%(bytes)sB) [%(segnum)d] is %(hash)s", + bytes=len(plaintext), + segnum=self._segment_number, hash=base32.b2a(ph.digest()), + level=log.NOISY) + self._plaintext_hash_tree.set_hashes(leaves=plaintext_leaves) + + self._segment_number += 1 + # We're still at 1*segment_size. The Downloadable is responsible for + # any memory usage beyond this. 
+ if not self._opened: + self._opened = True + self.downloadable.open(self.total_length) + self.downloadable.write(plaintext) + + def fail(self, why): + # this is really unusual, and deserves maximum forensics + if why.check(DownloadStopped): + # except DownloadStopped just means the consumer aborted the + # download, not so scary + self.log("download stopped", level=log.UNUSUAL) + else: + self.log("download failed!", failure=why, level=log.SCARY) + self.downloadable.fail(why) + + def close(self): + self.crypttext_hash = self._crypttext_hasher.digest() + self.plaintext_hash = self._plaintext_hasher.digest() + self.log("download finished, closing IDownloadable", level=log.NOISY) + self.downloadable.close() + + def finish(self): + return self.downloadable.finish() + +class ValidatedBucket: + """I am a front-end for a remote storage bucket, responsible for + retrieving and validating data from that bucket. + + My get_block() method is used by BlockDownloaders. + """ + + def __init__(self, sharenum, bucket, + share_hash_tree, roothash, + num_blocks): + self.sharenum = sharenum + self.bucket = bucket + self._share_hash = None # None means not validated yet + self.share_hash_tree = share_hash_tree + self._roothash = roothash + self.block_hash_tree = hashtree.IncompleteHashTree(num_blocks) + self.started = False + + def get_block(self, blocknum): + if not self.started: + d = self.bucket.start() + def _started(res): + self.started = True + return self.get_block(blocknum) + d.addCallback(_started) + return d + + # the first time we use this bucket, we need to fetch enough elements + # of the share hash tree to validate it from our share hash up to the + # hashroot. 
+ if not self._share_hash: + d1 = self.bucket.get_share_hashes() + else: + d1 = defer.succeed([]) + + # we might need to grab some elements of our block hash tree, to + # validate the requested block up to the share hash + needed = self.block_hash_tree.needed_hashes(blocknum) + if needed: + # TODO: get fewer hashes, use get_block_hashes(needed) + d2 = self.bucket.get_block_hashes() + else: + d2 = defer.succeed([]) + + d3 = self.bucket.get_block(blocknum) + + d = defer.gatherResults([d1, d2, d3]) + d.addCallback(self._got_data, blocknum) + return d + + def _got_data(self, res, blocknum): + sharehashes, blockhashes, blockdata = res + blockhash = None # to make logging it safe + + try: + if not self._share_hash: + sh = dict(sharehashes) + sh[0] = self._roothash # always use our own root, from the URI + sht = self.share_hash_tree + if sht.get_leaf_index(self.sharenum) not in sh: + raise hashtree.NotEnoughHashesError + sht.set_hashes(sh) + self._share_hash = sht.get_leaf(self.sharenum) + + blockhash = hashutil.block_hash(blockdata) + #log.msg("checking block_hash(shareid=%d, blocknum=%d) len=%d " + # "%r .. %r: %s" % + # (self.sharenum, blocknum, len(blockdata), + # blockdata[:50], blockdata[-50:], base32.b2a(blockhash))) + + # we always validate the blockhash + bh = dict(enumerate(blockhashes)) + # replace blockhash root with validated value + bh[0] = self._share_hash + self.block_hash_tree.set_hashes(bh, {blocknum: blockhash}) + + except (hashtree.BadHashError, hashtree.NotEnoughHashesError): + # log.WEIRD: indicates undetected disk/network error, or more + # likely a programming error + log.msg("hash failure in block=%d, shnum=%d on %s" % + (blocknum, self.sharenum, self.bucket)) + if self._share_hash: + log.msg(""" failure occurred when checking the block_hash_tree. 
+ This suggests that either the block data was bad, or that the + block hashes we received along with it were bad.""") + else: + log.msg(""" the failure probably occurred when checking the + share_hash_tree, which suggests that the share hashes we + received from the remote peer were bad.""") + log.msg(" have self._share_hash: %s" % bool(self._share_hash)) + log.msg(" block length: %d" % len(blockdata)) + log.msg(" block hash: %s" % base32.b2a_or_none(blockhash)) + if len(blockdata) < 100: + log.msg(" block data: %r" % (blockdata,)) + else: + log.msg(" block data start/end: %r .. %r" % + (blockdata[:50], blockdata[-50:])) + log.msg(" root hash: %s" % base32.b2a(self._roothash)) + log.msg(" share hash tree:\n" + self.share_hash_tree.dump()) + log.msg(" block hash tree:\n" + self.block_hash_tree.dump()) + lines = [] + for i,h in sorted(sharehashes): + lines.append("%3d: %s" % (i, base32.b2a_or_none(h))) + log.msg(" sharehashes:\n" + "\n".join(lines) + "\n") + lines = [] + for i,h in enumerate(blockhashes): + lines.append("%3d: %s" % (i, base32.b2a_or_none(h))) + log.msg(" blockhashes:\n" + "\n".join(lines) + "\n") + raise + + # If we made it here, the block is good. If the hash trees didn't + # like what they saw, they would have raised a BadHashError, causing + # our caller to see a Failure and thus ignore this block (as well as + # dropping this bucket). + return blockdata + + + +class BlockDownloader: + """I am responsible for downloading a single block (from a single bucket) + for a single segment. + + I am a child of the SegmentDownloader. 
+ """ + + def __init__(self, vbucket, blocknum, parent, results): + self.vbucket = vbucket + self.blocknum = blocknum + self.parent = parent + self.results = results + self._log_number = self.parent.log("starting block %d" % blocknum) + + def log(self, msg, parent=None): + if parent is None: + parent = self._log_number + return self.parent.log(msg, parent=parent) + + def start(self, segnum): + lognum = self.log("get_block(segnum=%d)" % segnum) + started = time.time() + d = self.vbucket.get_block(segnum) + d.addCallbacks(self._hold_block, self._got_block_error, + callbackArgs=(started, lognum,), errbackArgs=(lognum,)) + return d + + def _hold_block(self, data, started, lognum): + if self.results: + elapsed = time.time() - started + peerid = self.vbucket.bucket.get_peerid() + if peerid not in self.results.timings["fetch_per_server"]: + self.results.timings["fetch_per_server"][peerid] = [] + self.results.timings["fetch_per_server"][peerid].append(elapsed) + self.log("got block", parent=lognum) + self.parent.hold_block(self.blocknum, data) + + def _got_block_error(self, f, lognum): + self.log("BlockDownloader[%d] got error: %s" % (self.blocknum, f), + parent=lognum) + if self.results: + peerid = self.vbucket.bucket.get_peerid() + self.results.server_problems[peerid] = str(f) + self.parent.bucket_failed(self.vbucket) + +class SegmentDownloader: + """I am responsible for downloading all the blocks for a single segment + of data. + + I am a child of the FileDownloader. 
+ """ + + def __init__(self, parent, segmentnumber, needed_shares, results): + self.parent = parent + self.segmentnumber = segmentnumber + self.needed_blocks = needed_shares + self.blocks = {} # k: blocknum, v: data + self.results = results + self._log_number = self.parent.log("starting segment %d" % + segmentnumber) + + def log(self, msg, parent=None): + if parent is None: + parent = self._log_number + return self.parent.log(msg, parent=parent) + + def start(self): + return self._download() + + def _download(self): + d = self._try() + def _done(res): + if len(self.blocks) >= self.needed_blocks: + # we only need self.needed_blocks blocks + # we want to get the smallest blockids, because they are + # more likely to be fast "primary blocks" + blockids = sorted(self.blocks.keys())[:self.needed_blocks] + blocks = [] + for blocknum in blockids: + blocks.append(self.blocks[blocknum]) + return (blocks, blockids) + else: + return self._download() + d.addCallback(_done) + return d + + def _try(self): + # fill our set of active buckets, maybe raising NotEnoughSharesError + active_buckets = self.parent._activate_enough_buckets() + # Now we have enough buckets, in self.parent.active_buckets. + + # in test cases, bd.start might mutate active_buckets right away, so + # we need to put off calling start() until we've iterated all the way + # through it. 
+ downloaders = [] + for blocknum, vbucket in active_buckets.iteritems(): + bd = BlockDownloader(vbucket, blocknum, self, self.results) + downloaders.append(bd) + if self.results: + self.results.servers_used.add(vbucket.bucket.get_peerid()) + l = [bd.start(self.segmentnumber) for bd in downloaders] + return defer.DeferredList(l, fireOnOneErrback=True) + + def hold_block(self, blocknum, data): + self.blocks[blocknum] = data + + def bucket_failed(self, vbucket): + self.parent.bucket_failed(vbucket) + +class DownloadStatus: + implements(IDownloadStatus) + statusid_counter = itertools.count(0) + + def __init__(self): + self.storage_index = None + self.size = None + self.helper = False + self.status = "Not started" + self.progress = 0.0 + self.paused = False + self.stopped = False + self.active = True + self.results = None + self.counter = self.statusid_counter.next() + self.started = time.time() + + def get_started(self): + return self.started + def get_storage_index(self): + return self.storage_index + def get_size(self): + return self.size + def using_helper(self): + return self.helper + def get_status(self): + status = self.status + if self.paused: + status += " (output paused)" + if self.stopped: + status += " (output stopped)" + return status + def get_progress(self): + return self.progress + def get_active(self): + return self.active + def get_results(self): + return self.results + def get_counter(self): + return self.counter + + def set_storage_index(self, si): + self.storage_index = si + def set_size(self, size): + self.size = size + def set_helper(self, helper): + self.helper = helper + def set_status(self, status): + self.status = status + def set_paused(self, paused): + self.paused = paused + def set_stopped(self, stopped): + self.stopped = stopped + def set_progress(self, value): + self.progress = value + def set_active(self, value): + self.active = value + def set_results(self, value): + self.results = value + +class FileDownloader: + 
implements(IPushProducer) + check_crypttext_hash = True + check_plaintext_hash = True + _status = None + + def __init__(self, client, u, downloadable): + self._client = client + + u = IFileURI(u) + self._storage_index = u.storage_index + self._uri_extension_hash = u.uri_extension_hash + self._total_shares = u.total_shares + self._size = u.size + self._num_needed_shares = u.needed_shares + + self._si_s = storage.si_b2a(self._storage_index) + self.init_logging() + + self._started = time.time() + self._status = s = DownloadStatus() + s.set_status("Starting") + s.set_storage_index(self._storage_index) + s.set_size(self._size) + s.set_helper(False) + s.set_active(True) + + self._results = DownloadResults() + s.set_results(self._results) + self._results.file_size = self._size + self._results.timings["servers_peer_selection"] = {} + self._results.timings["fetch_per_server"] = {} + self._results.timings["cumulative_fetch"] = 0.0 + self._results.timings["cumulative_decode"] = 0.0 + self._results.timings["cumulative_decrypt"] = 0.0 + self._results.timings["paused"] = 0.0 + + if IConsumer.providedBy(downloadable): + downloadable.registerProducer(self, True) + self._downloadable = downloadable + self._output = Output(downloadable, u.key, self._size, self._log_number, + self._status) + self._paused = False + self._stopped = False + + self.active_buckets = {} # k: shnum, v: bucket + self._share_buckets = [] # list of (sharenum, bucket) tuples + self._share_vbuckets = {} # k: shnum, v: set of ValidatedBuckets + self._uri_extension_sources = [] + + self._uri_extension_data = None + + self._fetch_failures = {"uri_extension": 0, + "plaintext_hashroot": 0, + "plaintext_hashtree": 0, + "crypttext_hashroot": 0, + "crypttext_hashtree": 0, + } + + def init_logging(self): + self._log_prefix = prefix = storage.si_b2a(self._storage_index)[:5] + num = self._client.log(format="FileDownloader(%(si)s): starting", + si=storage.si_b2a(self._storage_index)) + self._log_number = num + + def 
log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + if "facility" not in kwargs: + kwargs["facility"] = "tahoe.download" + return log.msg(*args, **kwargs) + + def pauseProducing(self): + if self._paused: + return + self._paused = defer.Deferred() + self._paused_at = time.time() + if self._status: + self._status.set_paused(True) + + def resumeProducing(self): + if self._paused: + paused_for = time.time() - self._paused_at + self._results.timings['paused'] += paused_for + p = self._paused + self._paused = None + eventually(p.callback, None) + if self._status: + self._status.set_paused(False) + + def stopProducing(self): + self.log("Download.stopProducing") + self._stopped = True + self.resumeProducing() + if self._status: + self._status.set_stopped(True) + self._status.set_active(False) + + def start(self): + self.log("starting download") + + # first step: who should we download from? + d = defer.maybeDeferred(self._get_all_shareholders) + d.addCallback(self._got_all_shareholders) + # now get the uri_extension block from somebody and validate it + d.addCallback(self._obtain_uri_extension) + d.addCallback(self._got_uri_extension) + d.addCallback(self._get_hashtrees) + d.addCallback(self._create_validated_buckets) + # once we know that, we can download blocks from everybody + d.addCallback(self._download_all_segments) + def _finished(res): + if self._status: + self._status.set_status("Finished") + self._status.set_active(False) + self._status.set_paused(False) + if IConsumer.providedBy(self._downloadable): + self._downloadable.unregisterProducer() + return res + d.addBoth(_finished) + def _failed(why): + if self._status: + self._status.set_status("Failed") + self._status.set_active(False) + self._output.fail(why) + return why + d.addErrback(_failed) + d.addCallback(self._done) + return d + + def _get_all_shareholders(self): + dl = [] + for (peerid,ss) in self._client.get_permuted_peers("storage", + self._storage_index): + d 
= ss.callRemote("get_buckets", self._storage_index) + d.addCallbacks(self._got_response, self._got_error, + callbackArgs=(peerid,)) + dl.append(d) + self._responses_received = 0 + self._queries_sent = len(dl) + if self._status: + self._status.set_status("Locating Shares (%d/%d)" % + (self._responses_received, + self._queries_sent)) + return defer.DeferredList(dl) + + def _got_response(self, buckets, peerid): + self._responses_received += 1 + if self._results: + elapsed = time.time() - self._started + self._results.timings["servers_peer_selection"][peerid] = elapsed + if self._status: + self._status.set_status("Locating Shares (%d/%d)" % + (self._responses_received, + self._queries_sent)) + for sharenum, bucket in buckets.iteritems(): + b = storage.ReadBucketProxy(bucket, peerid, self._si_s) + self.add_share_bucket(sharenum, b) + self._uri_extension_sources.append(b) + if self._results: + if peerid not in self._results.servermap: + self._results.servermap[peerid] = set() + self._results.servermap[peerid].add(sharenum) + + def add_share_bucket(self, sharenum, bucket): + # this is split out for the benefit of test_encode.py + self._share_buckets.append( (sharenum, bucket) ) + + def _got_error(self, f): + self._client.log("Somebody failed. -- %s" % (f,)) + + def bucket_failed(self, vbucket): + shnum = vbucket.sharenum + del self.active_buckets[shnum] + s = self._share_vbuckets[shnum] + # s is a set of ValidatedBucket instances + s.remove(vbucket) + # ... which might now be empty + if not s: + # there are no more buckets which can provide this share, so + # remove the key. This may prompt us to use a different share. 
+ del self._share_vbuckets[shnum] + + def _got_all_shareholders(self, res): + if self._results: + now = time.time() + self._results.timings["peer_selection"] = now - self._started + + if len(self._share_buckets) < self._num_needed_shares: + raise NotEnoughSharesError + + #for s in self._share_vbuckets.values(): + # for vb in s: + # assert isinstance(vb, ValidatedBucket), \ + # "vb is %s but should be a ValidatedBucket" % (vb,) + + def _unpack_uri_extension_data(self, data): + return uri.unpack_extension(data) + + def _obtain_uri_extension(self, ignored): + # all shareholders are supposed to have a copy of uri_extension, and + # all are supposed to be identical. We compute the hash of the data + # that comes back, and compare it against the version in our URI. If + # they don't match, ignore their data and try someone else. + if self._status: + self._status.set_status("Obtaining URI Extension") + + self._uri_extension_fetch_started = time.time() + def _validate(proposal, bucket): + h = hashutil.uri_extension_hash(proposal) + if h != self._uri_extension_hash: + self._fetch_failures["uri_extension"] += 1 + msg = ("The copy of uri_extension we received from " + "%s was bad: wanted %s, got %s" % + (bucket, + base32.b2a(self._uri_extension_hash), + base32.b2a(h))) + self.log(msg, level=log.SCARY) + raise BadURIExtensionHashValue(msg) + return self._unpack_uri_extension_data(proposal) + return self._obtain_validated_thing(None, + self._uri_extension_sources, + "uri_extension", + "get_uri_extension", (), _validate) + + def _obtain_validated_thing(self, ignored, sources, name, methname, args, + validatorfunc): + if not sources: + raise NotEnoughSharesError("started with zero peers while fetching " + "%s" % name) + bucket = sources[0] + sources = sources[1:] + #d = bucket.callRemote(methname, *args) + d = bucket.startIfNecessary() + d.addCallback(lambda res: getattr(bucket, methname)(*args)) + d.addCallback(validatorfunc, bucket) + def _bad(f): + self.log("%s from vbucket %s 
failed:" % (name, bucket), + failure=f, level=log.WEIRD) + if not sources: + raise NotEnoughSharesError("ran out of peers, last error was %s" + % (f,)) + # try again with a different one + return self._obtain_validated_thing(None, sources, name, + methname, args, validatorfunc) + d.addErrback(_bad) + return d + + def _got_uri_extension(self, uri_extension_data): + if self._results: + elapsed = time.time() - self._uri_extension_fetch_started + self._results.timings["uri_extension"] = elapsed + + d = self._uri_extension_data = uri_extension_data + + self._codec = codec.get_decoder_by_name(d['codec_name']) + self._codec.set_serialized_params(d['codec_params']) + self._tail_codec = codec.get_decoder_by_name(d['codec_name']) + self._tail_codec.set_serialized_params(d['tail_codec_params']) + + crypttext_hash = d.get('crypttext_hash', None) # optional + if crypttext_hash: + assert isinstance(crypttext_hash, str) + assert len(crypttext_hash) == 32 + self._crypttext_hash = crypttext_hash + self._plaintext_hash = d.get('plaintext_hash', None) # optional + + self._roothash = d['share_root_hash'] + + self._segment_size = segment_size = d['segment_size'] + self._total_segments = mathutil.div_ceil(self._size, segment_size) + self._current_segnum = 0 + + self._share_hashtree = hashtree.IncompleteHashTree(d['total_shares']) + self._share_hashtree.set_hashes({0: self._roothash}) + + def _get_hashtrees(self, res): + self._get_hashtrees_started = time.time() + if self._status: + self._status.set_status("Retrieving Hash Trees") + d = defer.maybeDeferred(self._get_plaintext_hashtrees) + d.addCallback(self._get_crypttext_hashtrees) + d.addCallback(self._setup_hashtrees) + return d + + def _get_plaintext_hashtrees(self): + # plaintext hashes are optional. If the root isn't in the UEB, then + # the share will be holding an empty list. We don't even bother + # fetching it. 
+ if "plaintext_root_hash" not in self._uri_extension_data: + self._plaintext_hashtree = None + return + def _validate_plaintext_hashtree(proposal, bucket): + if proposal[0] != self._uri_extension_data['plaintext_root_hash']: + self._fetch_failures["plaintext_hashroot"] += 1 + msg = ("The copy of the plaintext_root_hash we received from" + " %s was bad" % bucket) + raise BadPlaintextHashValue(msg) + pt_hashtree = hashtree.IncompleteHashTree(self._total_segments) + pt_hashes = dict(list(enumerate(proposal))) + try: + pt_hashtree.set_hashes(pt_hashes) + except hashtree.BadHashError: + # the hashes they gave us were not self-consistent, even + # though the root matched what we saw in the uri_extension + # block + self._fetch_failures["plaintext_hashtree"] += 1 + raise + self._plaintext_hashtree = pt_hashtree + d = self._obtain_validated_thing(None, + self._uri_extension_sources, + "plaintext_hashes", + "get_plaintext_hashes", (), + _validate_plaintext_hashtree) + return d + + def _get_crypttext_hashtrees(self, res): + # crypttext hashes are optional too + if "crypttext_root_hash" not in self._uri_extension_data: + self._crypttext_hashtree = None + return + def _validate_crypttext_hashtree(proposal, bucket): + if proposal[0] != self._uri_extension_data['crypttext_root_hash']: + self._fetch_failures["crypttext_hashroot"] += 1 + msg = ("The copy of the crypttext_root_hash we received from" + " %s was bad" % bucket) + raise BadCrypttextHashValue(msg) + ct_hashtree = hashtree.IncompleteHashTree(self._total_segments) + ct_hashes = dict(list(enumerate(proposal))) + try: + ct_hashtree.set_hashes(ct_hashes) + except hashtree.BadHashError: + self._fetch_failures["crypttext_hashtree"] += 1 + raise + ct_hashtree.set_hashes(ct_hashes) + self._crypttext_hashtree = ct_hashtree + d = self._obtain_validated_thing(None, + self._uri_extension_sources, + "crypttext_hashes", + "get_crypttext_hashes", (), + _validate_crypttext_hashtree) + return d + + def _setup_hashtrees(self, res): + 
self._output.setup_hashtrees(self._plaintext_hashtree, + self._crypttext_hashtree) + if self._results: + elapsed = time.time() - self._get_hashtrees_started + self._results.timings["hashtrees"] = elapsed + + def _create_validated_buckets(self, ignored=None): + self._share_vbuckets = {} + for sharenum, bucket in self._share_buckets: + vbucket = ValidatedBucket(sharenum, bucket, + self._share_hashtree, + self._roothash, + self._total_segments) + s = self._share_vbuckets.setdefault(sharenum, set()) + s.add(vbucket) + + def _activate_enough_buckets(self): + """either return a mapping from shnum to a ValidatedBucket that can + provide data for that share, or raise NotEnoughSharesError""" + + while len(self.active_buckets) < self._num_needed_shares: + # need some more + handled_shnums = set(self.active_buckets.keys()) + available_shnums = set(self._share_vbuckets.keys()) + potential_shnums = list(available_shnums - handled_shnums) + if not potential_shnums: + raise NotEnoughSharesError + # choose a random share + shnum = random.choice(potential_shnums) + # and a random bucket that will provide it + validated_bucket = random.choice(list(self._share_vbuckets[shnum])) + self.active_buckets[shnum] = validated_bucket + return self.active_buckets + + + def _download_all_segments(self, res): + # the promise: upon entry to this function, self._share_vbuckets + # contains enough buckets to complete the download, and some extra + # ones to tolerate some buckets dropping out or having errors. + # self._share_vbuckets is a dictionary that maps from shnum to a set + # of ValidatedBuckets, which themselves are wrappers around + # RIBucketReader references. 
+ self.active_buckets = {} # k: shnum, v: ValidatedBucket instance + + self._started_fetching = time.time() + + d = defer.succeed(None) + for segnum in range(self._total_segments-1): + d.addCallback(self._download_segment, segnum) + # this pause, at the end of write, prevents pre-fetch from + # happening until the consumer is ready for more data. + d.addCallback(self._check_for_pause) + d.addCallback(self._download_tail_segment, self._total_segments-1) + return d + + def _check_for_pause(self, res): + if self._paused: + d = defer.Deferred() + self._paused.addCallback(lambda ignored: d.callback(res)) + return d + if self._stopped: + raise DownloadStopped("our Consumer called stopProducing()") + return res + + def _download_segment(self, res, segnum): + if self._status: + self._status.set_status("Downloading segment %d of %d" % + (segnum+1, self._total_segments)) + self.log("downloading seg#%d of %d (%d%%)" + % (segnum, self._total_segments, + 100.0 * segnum / self._total_segments)) + # memory footprint: when the SegmentDownloader finishes pulling down + # all shares, we have 1*segment_size of usage. + segmentdler = SegmentDownloader(self, segnum, self._num_needed_shares, + self._results) + started = time.time() + d = segmentdler.start() + def _finished_fetching(res): + elapsed = time.time() - started + self._results.timings["cumulative_fetch"] += elapsed + return res + if self._results: + d.addCallback(_finished_fetching) + # pause before using more memory + d.addCallback(self._check_for_pause) + # while the codec does its job, we hit 2*segment_size + def _started_decode(res): + self._started_decode = time.time() + return res + if self._results: + d.addCallback(_started_decode) + d.addCallback(lambda (shares, shareids): + self._codec.decode(shares, shareids)) + # once the codec is done, we drop back to 1*segment_size, because + # 'shares' goes out of scope. The memory usage is all in the + # plaintext now, spread out into a bunch of tiny buffers. 
+ def _finished_decode(res): + elapsed = time.time() - self._started_decode + self._results.timings["cumulative_decode"] += elapsed + return res + if self._results: + d.addCallback(_finished_decode) + + # pause/check-for-stop just before writing, to honor stopProducing + d.addCallback(self._check_for_pause) + def _done(buffers): + # we start by joining all these buffers together into a single + # string. This makes Output.write easier, since it wants to hash + # data one segment at a time anyways, and doesn't impact our + # memory footprint since we're already peaking at 2*segment_size + # inside the codec a moment ago. + segment = "".join(buffers) + del buffers + # we're down to 1*segment_size right now, but write_segment() + # will decrypt a copy of the segment internally, which will push + # us up to 2*segment_size while it runs. + started_decrypt = time.time() + self._output.write_segment(segment) + if self._results: + elapsed = time.time() - started_decrypt + self._results.timings["cumulative_decrypt"] += elapsed + d.addCallback(_done) + return d + + def _download_tail_segment(self, res, segnum): + self.log("downloading seg#%d of %d (%d%%)" + % (segnum, self._total_segments, + 100.0 * segnum / self._total_segments)) + segmentdler = SegmentDownloader(self, segnum, self._num_needed_shares, + self._results) + started = time.time() + d = segmentdler.start() + def _finished_fetching(res): + elapsed = time.time() - started + self._results.timings["cumulative_fetch"] += elapsed + return res + if self._results: + d.addCallback(_finished_fetching) + # pause before using more memory + d.addCallback(self._check_for_pause) + def _started_decode(res): + self._started_decode = time.time() + return res + if self._results: + d.addCallback(_started_decode) + d.addCallback(lambda (shares, shareids): + self._tail_codec.decode(shares, shareids)) + def _finished_decode(res): + elapsed = time.time() - self._started_decode + self._results.timings["cumulative_decode"] += elapsed + 
return res + if self._results: + d.addCallback(_finished_decode) + # pause/check-for-stop just before writing, to honor stopProducing + d.addCallback(self._check_for_pause) + def _done(buffers): + # trim off any padding added by the upload side + segment = "".join(buffers) + del buffers + # we never send empty segments. If the data was an exact multiple + # of the segment size, the last segment will be full. + pad_size = mathutil.pad_size(self._size, self._segment_size) + tail_size = self._segment_size - pad_size + segment = segment[:tail_size] + started_decrypt = time.time() + self._output.write_segment(segment) + if self._results: + elapsed = time.time() - started_decrypt + self._results.timings["cumulative_decrypt"] += elapsed + d.addCallback(_done) + return d + + def _done(self, res): + self.log("download done") + if self._results: + now = time.time() + self._results.timings["total"] = now - self._started + self._results.timings["segments"] = now - self._started_fetching + self._output.close() + if self.check_crypttext_hash and self._crypttext_hash: + _assert(self._crypttext_hash == self._output.crypttext_hash, + "bad crypttext_hash: computed=%s, expected=%s" % + (base32.b2a(self._output.crypttext_hash), + base32.b2a(self._crypttext_hash))) + if self.check_plaintext_hash and self._plaintext_hash: + _assert(self._plaintext_hash == self._output.plaintext_hash, + "bad plaintext_hash: computed=%s, expected=%s" % + (base32.b2a(self._output.plaintext_hash), + base32.b2a(self._plaintext_hash))) + _assert(self._output.length == self._size, + got=self._output.length, expected=self._size) + return self._output.finish() + + def get_download_status(self): + return self._status + + +class LiteralDownloader: + def __init__(self, client, u, downloadable): + self._uri = IFileURI(u) + assert isinstance(self._uri, uri.LiteralFileURI) + self._downloadable = downloadable + self._status = s = DownloadStatus() + s.set_storage_index(None) + s.set_helper(False) + s.set_status("Done") 
+ s.set_active(False) + s.set_progress(1.0) + + def start(self): + data = self._uri.data + self._status.set_size(len(data)) + self._downloadable.open(len(data)) + self._downloadable.write(data) + self._downloadable.close() + return defer.maybeDeferred(self._downloadable.finish) + + def get_download_status(self): + return self._status + +class FileName: + implements(IDownloadTarget) + def __init__(self, filename): + self._filename = filename + self.f = None + def open(self, size): + self.f = open(self._filename, "wb") + return self.f + def write(self, data): + self.f.write(data) + def close(self): + if self.f: + self.f.close() + def fail(self, why): + if self.f: + self.f.close() + os.unlink(self._filename) + def register_canceller(self, cb): + pass # we won't use it + def finish(self): + pass + +class Data: + implements(IDownloadTarget) + def __init__(self): + self._data = [] + def open(self, size): + pass + def write(self, data): + self._data.append(data) + def close(self): + self.data = "".join(self._data) + del self._data + def fail(self, why): + del self._data + def register_canceller(self, cb): + pass # we won't use it + def finish(self): + return self.data + +class FileHandle: + """Use me to download data to a pre-defined filehandle-like object. I + will use the target's write() method. I will *not* close the filehandle: + I leave that up to the originator of the filehandle. The download process + will return the filehandle when it completes. + """ + implements(IDownloadTarget) + def __init__(self, filehandle): + self._filehandle = filehandle + def open(self, size): + pass + def write(self, data): + self._filehandle.write(data) + def close(self): + # the originator of the filehandle reserves the right to close it + pass + def fail(self, why): + pass + def register_canceller(self, cb): + pass + def finish(self): + return self._filehandle + +class Downloader(service.MultiService): + """I am a service that allows file downloading. 
+ """ + implements(IDownloader) + name = "downloader" + MAX_DOWNLOAD_STATUSES = 10 + + def __init__(self, stats_provider=None): + service.MultiService.__init__(self) + self.stats_provider = stats_provider + self._all_downloads = weakref.WeakKeyDictionary() # for debugging + self._all_download_statuses = weakref.WeakKeyDictionary() + self._recent_download_statuses = [] + + def download(self, u, t): + assert self.parent + assert self.running + u = IFileURI(u) + t = IDownloadTarget(t) + assert t.write + assert t.close + + + if isinstance(u, uri.LiteralFileURI): + dl = LiteralDownloader(self.parent, u, t) + elif isinstance(u, uri.CHKFileURI): + if self.stats_provider: + # these counters are meant for network traffic, and don't + # include LIT files + self.stats_provider.count('downloader.files_downloaded', 1) + self.stats_provider.count('downloader.bytes_downloaded', u.get_size()) + dl = FileDownloader(self.parent, u, t) + else: + raise RuntimeError("I don't know how to download a %s" % u) + self._add_download(dl) + d = dl.start() + return d + + # utility functions + def download_to_data(self, uri): + return self.download(uri, Data()) + def download_to_filename(self, uri, filename): + return self.download(uri, FileName(filename)) + def download_to_filehandle(self, uri, filehandle): + return self.download(uri, FileHandle(filehandle)) + + def _add_download(self, downloader): + self._all_downloads[downloader] = None + s = downloader.get_download_status() + self._all_download_statuses[s] = None + self._recent_download_statuses.append(s) + while len(self._recent_download_statuses) > self.MAX_DOWNLOAD_STATUSES: + self._recent_download_statuses.pop(0) + + def list_all_download_statuses(self): + for ds in self._all_download_statuses: + yield ds diff --git a/src/allmydata/immutable/encode.py b/src/allmydata/immutable/encode.py new file mode 100644 index 00000000..766292fe --- /dev/null +++ b/src/allmydata/immutable/encode.py @@ -0,0 +1,718 @@ +# -*- test-case-name: 
allmydata.test.test_encode -*- + +import time +from zope.interface import implements +from twisted.internet import defer +from foolscap import eventual +from allmydata import storage, uri +from allmydata.hashtree import HashTree +from allmydata.util import mathutil, hashutil, base32, log +from allmydata.util.assertutil import _assert, precondition +from allmydata.codec import CRSEncoder +from allmydata.interfaces import IEncoder, IStorageBucketWriter, \ + IEncryptedUploadable, IUploadStatus + +""" +The goal of the encoder is to turn the original file into a series of +'shares'. Each share is going to a 'shareholder' (nominally each shareholder +is a different host, but for small grids there may be overlap). The number +of shares is chosen to hit our reliability goals (more shares on more +machines means more reliability), and is limited by overhead (proportional to +numshares or log(numshares)) and the encoding technology in use (zfec permits +only 256 shares total). It is also constrained by the amount of data +we want to send to each host. For estimating purposes, think of 10 shares +out of which we need 3 to reconstruct the file. + +The encoder starts by cutting the original file into segments. All segments +except the last are of equal size. The segment size is chosen to constrain +the memory footprint (which will probably vary between 1x and 4x segment +size) and to constrain the overhead (which will be proportional to +log(number of segments)). + + +Each segment (A,B,C) is read into memory, encrypted, and encoded into +blocks. The 'share' (say, share #1) that makes it out to a host is a +collection of these blocks (block A1, B1, C1), plus some hash-tree +information necessary to validate the data upon retrieval. Only one segment +is handled at a time: all blocks for segment A are delivered before any +work is begun on segment B. + +As blocks are created, we retain the hash of each one. 
The list of block hashes +for a single share (say, hash(A1), hash(B1), hash(C1)) is used to form the base +of a Merkle hash tree for that share, called the block hash tree. + +This hash tree has one terminal leaf per block. The complete block hash +tree is sent to the shareholder after all the data has been sent. At +retrieval time, before asking for blocks, the decoder will ask for +whichever pieces of this tree it needs to validate those blocks. + +(Note: we don't really need to generate this whole block hash tree +ourselves. It would be sufficient to have the shareholder generate it and +just tell us the root. This gives us an extra level of validation on the +transfer, though, and it is relatively cheap to compute.) + +Each of these block hash trees has a root hash. The root hashes for +all shares are collected into the 'share hash tree', which +has one terminal leaf per share. After sending the blocks and the complete +block hash tree to each shareholder, we send them the portion of the share +hash tree that is necessary to validate their share. The root of the share +hash tree is put into the URI. 
+ +""" + +class NotEnoughSharesError(Exception): + servermap = None + pass + +class UploadAborted(Exception): + pass + +KiB=1024 +MiB=1024*KiB +GiB=1024*MiB +TiB=1024*GiB +PiB=1024*TiB + +class Encoder(object): + implements(IEncoder) + USE_PLAINTEXT_HASHES = False + + def __init__(self, log_parent=None, upload_status=None): + object.__init__(self) + self.uri_extension_data = {} + self._codec = None + self._status = None + if upload_status: + self._status = IUploadStatus(upload_status) + precondition(log_parent is None or isinstance(log_parent, int), + log_parent) + self._log_number = log.msg("creating Encoder %s" % self, + facility="tahoe.encoder", parent=log_parent) + self._aborted = False + + def __repr__(self): + if hasattr(self, "_storage_index"): + return "" % storage.si_b2a(self._storage_index)[:5] + return "" + + def log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + if "facility" not in kwargs: + kwargs["facility"] = "tahoe.encoder" + return log.msg(*args, **kwargs) + + def set_encrypted_uploadable(self, uploadable): + eu = self._uploadable = IEncryptedUploadable(uploadable) + d = eu.get_size() + def _got_size(size): + self.log(format="file size: %(size)d", size=size) + self.file_size = size + d.addCallback(_got_size) + d.addCallback(lambda res: eu.get_all_encoding_parameters()) + d.addCallback(self._got_all_encoding_parameters) + d.addCallback(lambda res: eu.get_storage_index()) + def _done(storage_index): + self._storage_index = storage_index + return self + d.addCallback(_done) + return d + + def _got_all_encoding_parameters(self, params): + assert not self._codec + k, happy, n, segsize = params + self.required_shares = k + self.shares_of_happiness = happy + self.num_shares = n + self.segment_size = segsize + self.log("got encoding parameters: %d/%d/%d %d" % (k,happy,n, segsize)) + self.log("now setting up codec") + + assert self.segment_size % self.required_shares == 0 + + self.num_segments = 
mathutil.div_ceil(self.file_size, + self.segment_size) + + self._codec = CRSEncoder() + self._codec.set_params(self.segment_size, + self.required_shares, self.num_shares) + + data = self.uri_extension_data + data['codec_name'] = self._codec.get_encoder_type() + data['codec_params'] = self._codec.get_serialized_params() + + data['size'] = self.file_size + data['segment_size'] = self.segment_size + self.share_size = mathutil.div_ceil(self.file_size, + self.required_shares) + data['num_segments'] = self.num_segments + data['needed_shares'] = self.required_shares + data['total_shares'] = self.num_shares + + # the "tail" is the last segment. This segment may or may not be + # shorter than all other segments. We use the "tail codec" to handle + # it. If the tail is short, we use a different codec instance. In + # addition, the tail codec must be fed data which has been padded out + # to the right size. + self.tail_size = self.file_size % self.segment_size + if not self.tail_size: + self.tail_size = self.segment_size + + # the tail codec is responsible for encoding tail_size bytes + padded_tail_size = mathutil.next_multiple(self.tail_size, + self.required_shares) + self._tail_codec = CRSEncoder() + self._tail_codec.set_params(padded_tail_size, + self.required_shares, self.num_shares) + data['tail_codec_params'] = self._tail_codec.get_serialized_params() + + def _get_share_size(self): + share_size = mathutil.div_ceil(self.file_size, self.required_shares) + overhead = self._compute_overhead() + return share_size + overhead + + def _compute_overhead(self): + return 0 + + def get_param(self, name): + assert self._codec + + if name == "storage_index": + return self._storage_index + elif name == "share_counts": + return (self.required_shares, self.shares_of_happiness, + self.num_shares) + elif name == "num_segments": + return self.num_segments + elif name == "segment_size": + return self.segment_size + elif name == "block_size": + return self._codec.get_block_size() + elif name 
== "share_size": + return self._get_share_size() + elif name == "serialized_params": + return self._codec.get_serialized_params() + else: + raise KeyError("unknown parameter name '%s'" % name) + + def set_shareholders(self, landlords): + assert isinstance(landlords, dict) + for k in landlords: + assert IStorageBucketWriter.providedBy(landlords[k]) + self.landlords = landlords.copy() + + def start(self): + self.log("%s starting" % (self,)) + #paddedsize = self._size + mathutil.pad_size(self._size, self.needed_shares) + assert self._codec + self._crypttext_hasher = hashutil.crypttext_hasher() + self._crypttext_hashes = [] + self.segment_num = 0 + self.subshare_hashes = [[] for x in range(self.num_shares)] + # subshare_hashes[i] is a list that will be accumulated and then send + # to landlord[i]. This list contains a hash of each segment_share + # that we sent to that landlord. + self.share_root_hashes = [None] * self.num_shares + + self._times = { + "cumulative_encoding": 0.0, + "cumulative_sending": 0.0, + "hashes_and_close": 0.0, + "total_encode_and_push": 0.0, + } + self._start_total_timestamp = time.time() + + d = eventual.fireEventually() + + d.addCallback(lambda res: self.start_all_shareholders()) + + for i in range(self.num_segments-1): + # note to self: this form doesn't work, because lambda only + # captures the slot, not the value + #d.addCallback(lambda res: self.do_segment(i)) + # use this form instead: + d.addCallback(lambda res, i=i: self._encode_segment(i)) + d.addCallback(self._send_segment, i) + d.addCallback(self._turn_barrier) + last_segnum = self.num_segments - 1 + d.addCallback(lambda res: self._encode_tail_segment(last_segnum)) + d.addCallback(self._send_segment, last_segnum) + d.addCallback(self._turn_barrier) + + d.addCallback(lambda res: self.finish_hashing()) + + if self.USE_PLAINTEXT_HASHES: + d.addCallback(lambda res: + self.send_plaintext_hash_tree_to_all_shareholders()) + d.addCallback(lambda res: + 
self.send_crypttext_hash_tree_to_all_shareholders()) + d.addCallback(lambda res: self.send_all_subshare_hash_trees()) + d.addCallback(lambda res: self.send_all_share_hash_trees()) + d.addCallback(lambda res: self.send_uri_extension_to_all_shareholders()) + + d.addCallback(lambda res: self.close_all_shareholders()) + d.addCallbacks(self.done, self.err) + return d + + def set_status(self, status): + if self._status: + self._status.set_status(status) + + def set_encode_and_push_progress(self, sent_segments=None, extra=0.0): + if self._status: + # we treat the final hash+close as an extra segment + if sent_segments is None: + sent_segments = self.num_segments + progress = float(sent_segments + extra) / (self.num_segments + 1) + self._status.set_progress(2, progress) + + def abort(self): + self.log("aborting upload", level=log.UNUSUAL) + assert self._codec, "don't call abort before start" + self._aborted = True + # the next segment read (in _gather_data inside _encode_segment) will + # raise UploadAborted(), which will bypass the rest of the upload + # chain. If we've sent the final segment's shares, it's too late to + # abort. TODO: allow abort any time up to close_all_shareholders. + + def _turn_barrier(self, res): + # putting this method in a Deferred chain imposes a guaranteed + # reactor turn between the pre- and post- portions of that chain. + # This can be useful to limit memory consumption: since Deferreds do + # not do tail recursion, code which uses defer.succeed(result) for + # consistency will cause objects to live for longer than you might + # normally expect. 
+ + return eventual.fireEventually(res) + + + def start_all_shareholders(self): + self.log("starting shareholders", level=log.NOISY) + self.set_status("Starting shareholders") + dl = [] + for shareid in self.landlords: + d = self.landlords[shareid].start() + d.addErrback(self._remove_shareholder, shareid, "start") + dl.append(d) + return self._gather_responses(dl) + + def _encode_segment(self, segnum): + codec = self._codec + start = time.time() + + # the ICodecEncoder API wants to receive a total of self.segment_size + # bytes on each encode() call, broken up into a number of + # identically-sized pieces. Due to the way the codec algorithm works, + # these pieces need to be the same size as the share which the codec + # will generate. Therefore we must feed it with input_piece_size that + # equals the output share size. + input_piece_size = codec.get_block_size() + + # as a result, the number of input pieces per encode() call will be + # equal to the number of required shares with which the codec was + # constructed. You can think of the codec as chopping up a + # 'segment_size' of data into 'required_shares' shares (not doing any + # fancy math at all, just doing a split), then creating some number + # of additional shares which can be substituted if the primary ones + # are unavailable + + crypttext_segment_hasher = hashutil.crypttext_segment_hasher() + + # memory footprint: we only hold a tiny piece of the plaintext at any + # given time. We build up a segment's worth of cryptttext, then hand + # it to the encoder. Assuming 3-of-10 encoding (3.3x expansion) and + # 1MiB max_segment_size, we get a peak memory footprint of 4.3*1MiB = + # 4.3MiB. Lowering max_segment_size to, say, 100KiB would drop the + # footprint to 430KiB at the expense of more hash-tree overhead. 
+ + d = self._gather_data(self.required_shares, input_piece_size, + crypttext_segment_hasher) + def _done_gathering(chunks): + for c in chunks: + assert len(c) == input_piece_size + self._crypttext_hashes.append(crypttext_segment_hasher.digest()) + # during this call, we hit 5*segsize memory + return codec.encode(chunks) + d.addCallback(_done_gathering) + def _done(res): + elapsed = time.time() - start + self._times["cumulative_encoding"] += elapsed + return res + d.addCallback(_done) + return d + + def _encode_tail_segment(self, segnum): + + start = time.time() + codec = self._tail_codec + input_piece_size = codec.get_block_size() + + crypttext_segment_hasher = hashutil.crypttext_segment_hasher() + + d = self._gather_data(self.required_shares, input_piece_size, + crypttext_segment_hasher, + allow_short=True) + def _done_gathering(chunks): + for c in chunks: + # a short trailing chunk will have been padded by + # _gather_data + assert len(c) == input_piece_size + self._crypttext_hashes.append(crypttext_segment_hasher.digest()) + return codec.encode(chunks) + d.addCallback(_done_gathering) + def _done(res): + elapsed = time.time() - start + self._times["cumulative_encoding"] += elapsed + return res + d.addCallback(_done) + return d + + def _gather_data(self, num_chunks, input_chunk_size, + crypttext_segment_hasher, + allow_short=False, + previous_chunks=[]): + """Return a Deferred that will fire when the required number of + chunks have been read (and hashed and encrypted). 
The Deferred fires + with the combination of any 'previous_chunks' and the new chunks + which were gathered.""" + + if self._aborted: + raise UploadAborted() + + if not num_chunks: + return defer.succeed(previous_chunks) + + d = self._uploadable.read_encrypted(input_chunk_size, False) + def _got(data): + if self._aborted: + raise UploadAborted() + encrypted_pieces = [] + length = 0 + while data: + encrypted_piece = data.pop(0) + length += len(encrypted_piece) + crypttext_segment_hasher.update(encrypted_piece) + self._crypttext_hasher.update(encrypted_piece) + encrypted_pieces.append(encrypted_piece) + + if allow_short: + if length < input_chunk_size: + # padding + pad_size = input_chunk_size - length + encrypted_pieces.append('\x00' * pad_size) + else: + # non-tail segments should be the full segment size + if length != input_chunk_size: + log.msg("non-tail segment should be full segment size: %d!=%d" + % (length, input_chunk_size), level=log.BAD) + precondition(length == input_chunk_size, + "length=%d != input_chunk_size=%d" % + (length, input_chunk_size)) + + encrypted_piece = "".join(encrypted_pieces) + return previous_chunks + [encrypted_piece] + + d.addCallback(_got) + d.addCallback(lambda chunks: + self._gather_data(num_chunks-1, input_chunk_size, + crypttext_segment_hasher, + allow_short, chunks)) + return d + + def _send_segment(self, (shares, shareids), segnum): + # To generate the URI, we must generate the roothash, so we must + # generate all shares, even if we aren't actually giving them to + # anybody. This means that the set of shares we create will be equal + # to or larger than the set of landlords. If we have any landlord who + # *doesn't* have a share, that's an error. 
+ _assert(set(self.landlords.keys()).issubset(set(shareids)), + shareids=shareids, landlords=self.landlords) + start = time.time() + dl = [] + self.set_status("Sending segment %d of %d" % (segnum+1, + self.num_segments)) + self.set_encode_and_push_progress(segnum) + lognum = self.log("send_segment(%d)" % segnum, level=log.NOISY) + for i in range(len(shares)): + subshare = shares[i] + shareid = shareids[i] + d = self.send_subshare(shareid, segnum, subshare, lognum) + dl.append(d) + subshare_hash = hashutil.block_hash(subshare) + #from allmydata.util import base32 + #log.msg("creating block (shareid=%d, blocknum=%d) " + # "len=%d %r .. %r: %s" % + # (shareid, segnum, len(subshare), + # subshare[:50], subshare[-50:], base32.b2a(subshare_hash))) + self.subshare_hashes[shareid].append(subshare_hash) + + dl = self._gather_responses(dl) + def _logit(res): + self.log("%s uploaded %s / %s bytes (%d%%) of your file." % + (self, + self.segment_size*(segnum+1), + self.segment_size*self.num_segments, + 100 * (segnum+1) / self.num_segments, + ), + level=log.OPERATIONAL) + elapsed = time.time() - start + self._times["cumulative_sending"] += elapsed + return res + dl.addCallback(_logit) + return dl + + def send_subshare(self, shareid, segment_num, subshare, lognum): + if shareid not in self.landlords: + return defer.succeed(None) + sh = self.landlords[shareid] + lognum2 = self.log("put_block to %s" % self.landlords[shareid], + parent=lognum, level=log.NOISY) + d = sh.put_block(segment_num, subshare) + def _done(res): + self.log("put_block done", parent=lognum2, level=log.NOISY) + return res + d.addCallback(_done) + d.addErrback(self._remove_shareholder, shareid, + "segnum=%d" % segment_num) + return d + + def _remove_shareholder(self, why, shareid, where): + ln = self.log(format="error while sending %(method)s to shareholder=%(shnum)d", + method=where, shnum=shareid, + level=log.UNUSUAL, failure=why) + if shareid in self.landlords: + self.landlords[shareid].abort() + del 
self.landlords[shareid] + else: + # even more UNUSUAL + self.log("they weren't in our list of landlords", parent=ln, + level=log.WEIRD) + if len(self.landlords) < self.shares_of_happiness: + msg = "lost too many shareholders during upload: %s" % why + raise NotEnoughSharesError(msg) + self.log("but we can still continue with %s shares, we'll be happy " + "with at least %s" % (len(self.landlords), + self.shares_of_happiness), + parent=ln) + + def _gather_responses(self, dl): + d = defer.DeferredList(dl, fireOnOneErrback=True) + def _eatNotEnoughSharesError(f): + # all exceptions that occur while talking to a peer are handled + # in _remove_shareholder. That might raise NotEnoughSharesError, + # which will cause the DeferredList to errback but which should + # otherwise be consumed. Allow non-NotEnoughSharesError exceptions + # to pass through as an unhandled errback. We use this in lieu of + # consumeErrors=True to allow coding errors to be logged. + f.trap(NotEnoughSharesError) + return None + for d0 in dl: + d0.addErrback(_eatNotEnoughSharesError) + return d + + def finish_hashing(self): + self._start_hashing_and_close_timestamp = time.time() + self.set_status("Finishing hashes") + self.set_encode_and_push_progress(extra=0.0) + crypttext_hash = self._crypttext_hasher.digest() + self.uri_extension_data["crypttext_hash"] = crypttext_hash + d = self._uploadable.get_plaintext_hash() + def _got(plaintext_hash): + self.log(format="plaintext_hash=%(plaintext_hash)s, SI=%(SI)s, size=%(size)d", + plaintext_hash=base32.b2a(plaintext_hash), + SI=storage.si_b2a(self._storage_index), + size=self.file_size) + return plaintext_hash + d.addCallback(_got) + if self.USE_PLAINTEXT_HASHES: + def _use_plaintext_hash(plaintext_hash): + self.uri_extension_data["plaintext_hash"] = plaintext_hash + return self._uploadable.get_plaintext_hashtree_leaves(0, self.num_segments, self.num_segments) + d.addCallback(_use_plaintext_hash) + def _got_hashtree_leaves(leaves): + self.log("Encoder: got 
plaintext_hashtree_leaves: %s" % + (",".join([base32.b2a(h) for h in leaves]),), + level=log.NOISY) + ht = list(HashTree(list(leaves))) + self.uri_extension_data["plaintext_root_hash"] = ht[0] + self._plaintext_hashtree_nodes = ht + d.addCallback(_got_hashtree_leaves) + + d.addCallback(lambda res: self._uploadable.close()) + return d + + def send_plaintext_hash_tree_to_all_shareholders(self): + self.log("sending plaintext hash tree", level=log.NOISY) + self.set_status("Sending Plaintext Hash Tree") + self.set_encode_and_push_progress(extra=0.2) + dl = [] + for shareid in self.landlords.keys(): + d = self.send_plaintext_hash_tree(shareid, + self._plaintext_hashtree_nodes) + dl.append(d) + return self._gather_responses(dl) + + def send_plaintext_hash_tree(self, shareid, all_hashes): + if shareid not in self.landlords: + return defer.succeed(None) + sh = self.landlords[shareid] + d = sh.put_plaintext_hashes(all_hashes) + d.addErrback(self._remove_shareholder, shareid, "put_plaintext_hashes") + return d + + def send_crypttext_hash_tree_to_all_shareholders(self): + self.log("sending crypttext hash tree", level=log.NOISY) + self.set_status("Sending Crypttext Hash Tree") + self.set_encode_and_push_progress(extra=0.3) + t = HashTree(self._crypttext_hashes) + all_hashes = list(t) + self.uri_extension_data["crypttext_root_hash"] = t[0] + dl = [] + for shareid in self.landlords.keys(): + dl.append(self.send_crypttext_hash_tree(shareid, all_hashes)) + return self._gather_responses(dl) + + def send_crypttext_hash_tree(self, shareid, all_hashes): + if shareid not in self.landlords: + return defer.succeed(None) + sh = self.landlords[shareid] + d = sh.put_crypttext_hashes(all_hashes) + d.addErrback(self._remove_shareholder, shareid, "put_crypttext_hashes") + return d + + def send_all_subshare_hash_trees(self): + self.log("sending subshare hash trees", level=log.NOISY) + self.set_status("Sending Subshare Hash Trees") + self.set_encode_and_push_progress(extra=0.4) + dl = [] + for 
shareid,hashes in enumerate(self.subshare_hashes): + # hashes is a list of the hashes of all subshares that were sent + # to shareholder[shareid]. + dl.append(self.send_one_subshare_hash_tree(shareid, hashes)) + return self._gather_responses(dl) + + def send_one_subshare_hash_tree(self, shareid, subshare_hashes): + t = HashTree(subshare_hashes) + all_hashes = list(t) + # all_hashes[0] is the root hash, == hash(ah[1]+ah[2]) + # all_hashes[1] is the left child, == hash(ah[3]+ah[4]) + # all_hashes[n] == hash(all_hashes[2*n+1] + all_hashes[2*n+2]) + self.share_root_hashes[shareid] = t[0] + if shareid not in self.landlords: + return defer.succeed(None) + sh = self.landlords[shareid] + d = sh.put_block_hashes(all_hashes) + d.addErrback(self._remove_shareholder, shareid, "put_block_hashes") + return d + + def send_all_share_hash_trees(self): + # each bucket gets a set of share hash tree nodes that are needed to + # validate their share. This includes the share hash itself, but does + # not include the top-level hash root (which is stored securely in + # the URI instead). + self.log("sending all share hash trees", level=log.NOISY) + self.set_status("Sending Share Hash Trees") + self.set_encode_and_push_progress(extra=0.6) + dl = [] + for h in self.share_root_hashes: + assert h + # create the share hash tree + t = HashTree(self.share_root_hashes) + # the root of this hash tree goes into our URI + self.uri_extension_data['share_root_hash'] = t[0] + # now send just the necessary pieces out to each shareholder + for i in range(self.num_shares): + # the HashTree is given a list of leaves: 0,1,2,3..n . + # These become nodes A+0,A+1,A+2.. 
of the tree, where A=n-1 + needed_hash_indices = t.needed_hashes(i, include_leaf=True) + hashes = [(hi, t[hi]) for hi in needed_hash_indices] + dl.append(self.send_one_share_hash_tree(i, hashes)) + return self._gather_responses(dl) + + def send_one_share_hash_tree(self, shareid, needed_hashes): + if shareid not in self.landlords: + return defer.succeed(None) + sh = self.landlords[shareid] + d = sh.put_share_hashes(needed_hashes) + d.addErrback(self._remove_shareholder, shareid, "put_share_hashes") + return d + + def send_uri_extension_to_all_shareholders(self): + lp = self.log("sending uri_extension", level=log.NOISY) + self.set_status("Sending URI Extensions") + self.set_encode_and_push_progress(extra=0.8) + for k in ('crypttext_root_hash', 'crypttext_hash', + ): + assert k in self.uri_extension_data + if self.USE_PLAINTEXT_HASHES: + for k in ('plaintext_root_hash', 'plaintext_hash', + ): + assert k in self.uri_extension_data + uri_extension = uri.pack_extension(self.uri_extension_data) + ed = {} + for k,v in self.uri_extension_data.items(): + if k.endswith("hash"): + ed[k] = base32.b2a(v) + else: + ed[k] = v + self.log("uri_extension_data is %s" % (ed,), level=log.NOISY, parent=lp) + self.uri_extension_hash = hashutil.uri_extension_hash(uri_extension) + dl = [] + for shareid in self.landlords.keys(): + dl.append(self.send_uri_extension(shareid, uri_extension)) + return self._gather_responses(dl) + + def send_uri_extension(self, shareid, uri_extension): + sh = self.landlords[shareid] + d = sh.put_uri_extension(uri_extension) + d.addErrback(self._remove_shareholder, shareid, "put_uri_extension") + return d + + def close_all_shareholders(self): + self.log("closing shareholders", level=log.NOISY) + self.set_status("Closing Shareholders") + self.set_encode_and_push_progress(extra=0.9) + dl = [] + for shareid in self.landlords: + d = self.landlords[shareid].close() + d.addErrback(self._remove_shareholder, shareid, "close") + dl.append(d) + return 
self._gather_responses(dl) + + def done(self, res): + self.log("upload done", level=log.OPERATIONAL) + self.set_status("Done") + self.set_encode_and_push_progress(extra=1.0) # done + now = time.time() + h_and_c_elapsed = now - self._start_hashing_and_close_timestamp + self._times["hashes_and_close"] = h_and_c_elapsed + total_elapsed = now - self._start_total_timestamp + self._times["total_encode_and_push"] = total_elapsed + + # update our sharemap + self._shares_placed = set(self.landlords.keys()) + return (self.uri_extension_hash, self.required_shares, + self.num_shares, self.file_size) + + def err(self, f): + self.log("upload failed", failure=f, level=log.UNUSUAL) + self.set_status("Failed") + # we need to abort any remaining shareholders, so they'll delete the + # partial share, allowing someone else to upload it again. + self.log("aborting shareholders", level=log.UNUSUAL) + for shareid in list(self.landlords.keys()): + self.landlords[shareid].abort() + if f.check(defer.FirstError): + return f.value.subFailure + return f + + def get_shares_placed(self): + # return a set of share numbers that were successfully placed. 
+ return self._shares_placed + + def get_times(self): + # return a dictionary of encode+push timings + return self._times + + def get_uri_extension_data(self): + return self.uri_extension_data diff --git a/src/allmydata/immutable/filenode.py b/src/allmydata/immutable/filenode.py new file mode 100644 index 00000000..c3caae73 --- /dev/null +++ b/src/allmydata/immutable/filenode.py @@ -0,0 +1,118 @@ + +from zope.interface import implements +from twisted.internet import defer +from allmydata.interfaces import IFileNode, IFileURI, IURI, ICheckable +from allmydata import uri +from allmydata.immutable.checker import Results, \ + SimpleCHKFileChecker, SimpleCHKFileVerifier + +class FileNode: + implements(IFileNode, ICheckable) + + def __init__(self, uri, client): + u = IFileURI(uri) + self.uri = u.to_string() + self._client = client + + def get_uri(self): + return self.uri + + def is_mutable(self): + return False + + def is_readonly(self): + return True + + def get_readonly_uri(self): + return self.uri + + def get_size(self): + return IFileURI(self.uri).get_size() + + def __hash__(self): + return hash((self.__class__, self.uri)) + def __cmp__(self, them): + if cmp(type(self), type(them)): + return cmp(type(self), type(them)) + if cmp(self.__class__, them.__class__): + return cmp(self.__class__, them.__class__) + return cmp(self.uri, them.uri) + + def get_verifier(self): + return IFileURI(self.uri).get_verifier() + + def check(self, verify=False, repair=False): + assert repair is False # not implemented yet + vcap = self.get_verifier() + if verify: + v = SimpleCHKFileVerifier(self._client, vcap) + return v.start() + else: + peer_getter = self._client.get_permuted_peers + v = SimpleCHKFileChecker(peer_getter, vcap) + return v.check() + + def download(self, target): + downloader = self._client.getServiceNamed("downloader") + return downloader.download(self.uri, target) + + def download_to_data(self): + downloader = self._client.getServiceNamed("downloader") + return 
downloader.download_to_data(self.uri) + + + +class LiteralFileNode: + implements(IFileNode, ICheckable) + + def __init__(self, my_uri, client): + u = IFileURI(my_uri) + assert isinstance(u, uri.LiteralFileURI) + self.uri = u.to_string() + self._client = client + + def get_uri(self): + return self.uri + + def is_mutable(self): + return False + + def is_readonly(self): + return True + + def get_readonly_uri(self): + return self.uri + + def get_size(self): + return len(IURI(self.uri).data) + + def __hash__(self): + return hash((self.__class__, self.uri)) + def __cmp__(self, them): + if cmp(type(self), type(them)): + return cmp(type(self), type(them)) + if cmp(self.__class__, them.__class__): + return cmp(self.__class__, them.__class__) + return cmp(self.uri, them.uri) + + def get_verifier(self): + return None + + def check(self, verify=False, repair=False): + # neither verify= nor repair= affect LIT files + r = Results(None) + r.healthy = True + r.problems = [] + return defer.succeed(r) + + def download(self, target): + # note that this does not update the stats_provider + data = IURI(self.uri).data + target.open(len(data)) + target.write(data) + target.close() + return defer.maybeDeferred(target.finish) + + def download_to_data(self): + data = IURI(self.uri).data + return defer.succeed(data) diff --git a/src/allmydata/immutable/upload.py b/src/allmydata/immutable/upload.py new file mode 100644 index 00000000..2f641922 --- /dev/null +++ b/src/allmydata/immutable/upload.py @@ -0,0 +1,1270 @@ + +import os, time, weakref, itertools +from zope.interface import implements +from twisted.python import failure +from twisted.internet import defer +from twisted.application import service +from foolscap import Referenceable, Copyable, RemoteCopy +from foolscap import eventual +from foolscap.logging import log + +from allmydata.util.hashutil import file_renewal_secret_hash, \ + file_cancel_secret_hash, bucket_renewal_secret_hash, \ + bucket_cancel_secret_hash, plaintext_hasher, \ 
+ storage_index_hash, plaintext_segment_hasher, convergence_hasher +from allmydata import storage, hashtree, uri +from allmydata.immutable import encode +from allmydata.util import base32, idlib, mathutil +from allmydata.util.assertutil import precondition +from allmydata.interfaces import IUploadable, IUploader, IUploadResults, \ + IEncryptedUploadable, RIEncryptedUploadable, IUploadStatus +from pycryptopp.cipher.aes import AES + +from cStringIO import StringIO + + +KiB=1024 +MiB=1024*KiB +GiB=1024*MiB +TiB=1024*GiB +PiB=1024*TiB + +class HaveAllPeersError(Exception): + # we use this to jump out of the loop + pass + +# this wants to live in storage, not here +class TooFullError(Exception): + pass + +class UploadResults(Copyable, RemoteCopy): + implements(IUploadResults) + # note: don't change this string, it needs to match the value used on the + # helper, and it does *not* need to match the fully-qualified + # package/module/class name + typeToCopy = "allmydata.upload.UploadResults.tahoe.allmydata.com" + copytype = typeToCopy + + def __init__(self): + self.timings = {} # dict of name to number of seconds + self.sharemap = {} # dict of shnum to placement string + self.servermap = {} # dict of peerid to set(shnums) + self.file_size = None + self.ciphertext_fetched = None # how much the helper fetched + self.uri = None + self.preexisting_shares = None # count of shares already present + self.pushed_shares = None # count of shares we pushed + + +# our current uri_extension is 846 bytes for small files, a few bytes +# more for larger ones (since the filesize is encoded in decimal in a +# few places). Ask for a little bit more just in case we need it. If +# the extension changes size, we can change EXTENSION_SIZE to +# allocate a more accurate amount of space. +EXTENSION_SIZE = 1000 +# TODO: actual extensions are closer to 419 bytes, so we can probably lower +# this. 
+ +class PeerTracker: + def __init__(self, peerid, storage_server, + sharesize, blocksize, num_segments, num_share_hashes, + storage_index, + bucket_renewal_secret, bucket_cancel_secret): + precondition(isinstance(peerid, str), peerid) + precondition(len(peerid) == 20, peerid) + self.peerid = peerid + self._storageserver = storage_server # to an RIStorageServer + self.buckets = {} # k: shareid, v: IRemoteBucketWriter + self.sharesize = sharesize + as = storage.allocated_size(sharesize, + num_segments, + num_share_hashes, + EXTENSION_SIZE) + self.allocated_size = as + + self.blocksize = blocksize + self.num_segments = num_segments + self.num_share_hashes = num_share_hashes + self.storage_index = storage_index + + self.renew_secret = bucket_renewal_secret + self.cancel_secret = bucket_cancel_secret + + def __repr__(self): + return ("" + % (idlib.shortnodeid_b2a(self.peerid), + storage.si_b2a(self.storage_index)[:5])) + + def query(self, sharenums): + d = self._storageserver.callRemote("allocate_buckets", + self.storage_index, + self.renew_secret, + self.cancel_secret, + sharenums, + self.allocated_size, + canary=Referenceable()) + d.addCallback(self._got_reply) + return d + + def _got_reply(self, (alreadygot, buckets)): + #log.msg("%s._got_reply(%s)" % (self, (alreadygot, buckets))) + b = {} + for sharenum, rref in buckets.iteritems(): + bp = storage.WriteBucketProxy(rref, self.sharesize, + self.blocksize, + self.num_segments, + self.num_share_hashes, + EXTENSION_SIZE, + self.peerid) + b[sharenum] = bp + self.buckets.update(b) + return (alreadygot, set(b.keys())) + +class Tahoe2PeerSelector: + + def __init__(self, upload_id, logparent=None, upload_status=None): + self.upload_id = upload_id + self.query_count, self.good_query_count, self.bad_query_count = 0,0,0 + self.error_count = 0 + self.num_peers_contacted = 0 + self.last_failure_msg = None + self._status = IUploadStatus(upload_status) + self._log_parent = log.msg("%s starting" % self, parent=logparent) + + def 
__repr__(self): + return "" % self.upload_id + + def get_shareholders(self, client, + storage_index, share_size, block_size, + num_segments, total_shares, shares_of_happiness): + """ + @return: (used_peers, already_peers), where used_peers is a set of + PeerTracker instances that have agreed to hold some shares + for us (the shnum is stashed inside the PeerTracker), + and already_peers is a dict mapping shnum to a peer + which claims to already have the share. + """ + + if self._status: + self._status.set_status("Contacting Peers..") + + self.total_shares = total_shares + self.shares_of_happiness = shares_of_happiness + + self.homeless_shares = range(total_shares) + # self.uncontacted_peers = list() # peers we haven't asked yet + self.contacted_peers = [] # peers worth asking again + self.contacted_peers2 = [] # peers that we have asked again + self._started_second_pass = False + self.use_peers = set() # PeerTrackers that have shares assigned to them + self.preexisting_shares = {} # sharenum -> peerid holding the share + + peers = client.get_permuted_peers("storage", storage_index) + if not peers: + raise encode.NotEnoughSharesError("client gave us zero peers") + + # figure out how much space to ask for + + # this needed_hashes computation should mirror + # Encoder.send_all_share_hash_trees. We use an IncompleteHashTree + # (instead of a HashTree) because we don't require actual hashing + # just to count the levels. + ht = hashtree.IncompleteHashTree(total_shares) + num_share_hashes = len(ht.needed_hashes(0, include_leaf=True)) + + # decide upon the renewal/cancel secrets, to include them in the + # allocat_buckets query. 
+ client_renewal_secret = client.get_renewal_secret() + client_cancel_secret = client.get_cancel_secret() + + file_renewal_secret = file_renewal_secret_hash(client_renewal_secret, + storage_index) + file_cancel_secret = file_cancel_secret_hash(client_cancel_secret, + storage_index) + + trackers = [ PeerTracker(peerid, conn, + share_size, block_size, + num_segments, num_share_hashes, + storage_index, + bucket_renewal_secret_hash(file_renewal_secret, + peerid), + bucket_cancel_secret_hash(file_cancel_secret, + peerid), + ) + for (peerid, conn) in peers ] + self.uncontacted_peers = trackers + + d = defer.maybeDeferred(self._loop) + return d + + def _loop(self): + if not self.homeless_shares: + # all done + msg = ("placed all %d shares, " + "sent %d queries to %d peers, " + "%d queries placed some shares, %d placed none, " + "got %d errors" % + (self.total_shares, + self.query_count, self.num_peers_contacted, + self.good_query_count, self.bad_query_count, + self.error_count)) + log.msg("peer selection successful for %s: %s" % (self, msg), + parent=self._log_parent) + return (self.use_peers, self.preexisting_shares) + + if self.uncontacted_peers: + peer = self.uncontacted_peers.pop(0) + # TODO: don't pre-convert all peerids to PeerTrackers + assert isinstance(peer, PeerTracker) + + shares_to_ask = set([self.homeless_shares.pop(0)]) + self.query_count += 1 + self.num_peers_contacted += 1 + if self._status: + self._status.set_status("Contacting Peers [%s] (first query)," + " %d shares left.." + % (idlib.shortnodeid_b2a(peer.peerid), + len(self.homeless_shares))) + d = peer.query(shares_to_ask) + d.addBoth(self._got_response, peer, shares_to_ask, + self.contacted_peers) + return d + elif self.contacted_peers: + # ask a peer that we've already asked. 
+ if not self._started_second_pass: + log.msg("starting second pass", parent=self._log_parent, + level=log.NOISY) + self._started_second_pass = True + num_shares = mathutil.div_ceil(len(self.homeless_shares), + len(self.contacted_peers)) + peer = self.contacted_peers.pop(0) + shares_to_ask = set(self.homeless_shares[:num_shares]) + self.homeless_shares[:num_shares] = [] + self.query_count += 1 + if self._status: + self._status.set_status("Contacting Peers [%s] (second query)," + " %d shares left.." + % (idlib.shortnodeid_b2a(peer.peerid), + len(self.homeless_shares))) + d = peer.query(shares_to_ask) + d.addBoth(self._got_response, peer, shares_to_ask, + self.contacted_peers2) + return d + elif self.contacted_peers2: + # we've finished the second-or-later pass. Move all the remaining + # peers back into self.contacted_peers for the next pass. + self.contacted_peers.extend(self.contacted_peers2) + self.contacted_peers[:] = [] + return self._loop() + else: + # no more peers. If we haven't placed enough shares, we fail. 
+ placed_shares = self.total_shares - len(self.homeless_shares) + if placed_shares < self.shares_of_happiness: + msg = ("placed %d shares out of %d total (%d homeless), " + "sent %d queries to %d peers, " + "%d queries placed some shares, %d placed none, " + "got %d errors" % + (self.total_shares - len(self.homeless_shares), + self.total_shares, len(self.homeless_shares), + self.query_count, self.num_peers_contacted, + self.good_query_count, self.bad_query_count, + self.error_count)) + msg = "peer selection failed for %s: %s" % (self, msg) + if self.last_failure_msg: + msg += " (%s)" % (self.last_failure_msg,) + log.msg(msg, level=log.UNUSUAL, parent=self._log_parent) + raise encode.NotEnoughSharesError(msg) + else: + # we placed enough to be happy, so we're done + if self._status: + self._status.set_status("Placed all shares") + return self.use_peers + + def _got_response(self, res, peer, shares_to_ask, put_peer_here): + if isinstance(res, failure.Failure): + # This is unusual, and probably indicates a bug or a network + # problem. + log.msg("%s got error during peer selection: %s" % (peer, res), + level=log.UNUSUAL, parent=self._log_parent) + self.error_count += 1 + self.homeless_shares = list(shares_to_ask) + self.homeless_shares + if (self.uncontacted_peers + or self.contacted_peers + or self.contacted_peers2): + # there is still hope, so just loop + pass + else: + # No more peers, so this upload might fail (it depends upon + # whether we've hit shares_of_happiness or not). 
Log the last + # failure we got: if a coding error causes all peers to fail + # in the same way, this allows the common failure to be seen + # by the uploader and should help with debugging + msg = ("last failure (from %s) was: %s" % (peer, res)) + self.last_failure_msg = msg + else: + (alreadygot, allocated) = res + log.msg("response from peer %s: alreadygot=%s, allocated=%s" + % (idlib.shortnodeid_b2a(peer.peerid), + tuple(sorted(alreadygot)), tuple(sorted(allocated))), + level=log.NOISY, parent=self._log_parent) + progress = False + for s in alreadygot: + self.preexisting_shares[s] = peer.peerid + if s in self.homeless_shares: + self.homeless_shares.remove(s) + progress = True + + # the PeerTracker will remember which shares were allocated on + # that peer. We just have to remember to use them. + if allocated: + self.use_peers.add(peer) + progress = True + + not_yet_present = set(shares_to_ask) - set(alreadygot) + still_homeless = not_yet_present - set(allocated) + + if progress: + # they accepted or already had at least one share, so + # progress has been made + self.good_query_count += 1 + else: + self.bad_query_count += 1 + + if still_homeless: + # In networks with lots of space, this is very unusual and + # probably indicates an error. In networks with peers that + # are full, it is merely unusual. In networks that are very + # full, it is common, and many uploads will fail. In most + # cases, this is obviously not fatal, and we'll just use some + # other peers. + + # some shares are still homeless, keep trying to find them a + # home. The ones that were rejected get first priority. + self.homeless_shares = (list(still_homeless) + + self.homeless_shares) + # Since they were unable to accept all of our requests, so it + # is safe to assume that asking them again won't help. + else: + # if they *were* able to accept everything, they might be + # willing to accept even more. 
+ put_peer_here.append(peer) + + # now loop + return self._loop() + + +class EncryptAnUploadable: + """This is a wrapper that takes an IUploadable and provides + IEncryptedUploadable.""" + implements(IEncryptedUploadable) + CHUNKSIZE = 50*1024 + + def __init__(self, original, log_parent=None): + self.original = IUploadable(original) + self._log_number = log_parent + self._encryptor = None + self._plaintext_hasher = plaintext_hasher() + self._plaintext_segment_hasher = None + self._plaintext_segment_hashes = [] + self._encoding_parameters = None + self._file_size = None + self._ciphertext_bytes_read = 0 + self._status = None + + def set_upload_status(self, upload_status): + self._status = IUploadStatus(upload_status) + self.original.set_upload_status(upload_status) + + def log(self, *args, **kwargs): + if "facility" not in kwargs: + kwargs["facility"] = "upload.encryption" + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + return log.msg(*args, **kwargs) + + def get_size(self): + if self._file_size is not None: + return defer.succeed(self._file_size) + d = self.original.get_size() + def _got_size(size): + self._file_size = size + if self._status: + self._status.set_size(size) + return size + d.addCallback(_got_size) + return d + + def get_all_encoding_parameters(self): + if self._encoding_parameters is not None: + return defer.succeed(self._encoding_parameters) + d = self.original.get_all_encoding_parameters() + def _got(encoding_parameters): + (k, happy, n, segsize) = encoding_parameters + self._segment_size = segsize # used by segment hashers + self._encoding_parameters = encoding_parameters + self.log("my encoding parameters: %s" % (encoding_parameters,), + level=log.NOISY) + return encoding_parameters + d.addCallback(_got) + return d + + def _get_encryptor(self): + if self._encryptor: + return defer.succeed(self._encryptor) + + d = self.original.get_encryption_key() + def _got(key): + e = AES(key) + self._encryptor = e + + storage_index = 
storage_index_hash(key) + assert isinstance(storage_index, str) + # There's no point to having the SI be longer than the key, so we + # specify that it is truncated to the same 128 bits as the AES key. + assert len(storage_index) == 16 # SHA-256 truncated to 128b + self._storage_index = storage_index + if self._status: + self._status.set_storage_index(storage_index) + return e + d.addCallback(_got) + return d + + def get_storage_index(self): + d = self._get_encryptor() + d.addCallback(lambda res: self._storage_index) + return d + + def _get_segment_hasher(self): + p = self._plaintext_segment_hasher + if p: + left = self._segment_size - self._plaintext_segment_hashed_bytes + return p, left + p = plaintext_segment_hasher() + self._plaintext_segment_hasher = p + self._plaintext_segment_hashed_bytes = 0 + return p, self._segment_size + + def _update_segment_hash(self, chunk): + offset = 0 + while offset < len(chunk): + p, segment_left = self._get_segment_hasher() + chunk_left = len(chunk) - offset + this_segment = min(chunk_left, segment_left) + p.update(chunk[offset:offset+this_segment]) + self._plaintext_segment_hashed_bytes += this_segment + + if self._plaintext_segment_hashed_bytes == self._segment_size: + # we've filled this segment + self._plaintext_segment_hashes.append(p.digest()) + self._plaintext_segment_hasher = None + self.log("closed hash [%d]: %dB" % + (len(self._plaintext_segment_hashes)-1, + self._plaintext_segment_hashed_bytes), + level=log.NOISY) + self.log(format="plaintext leaf hash [%(segnum)d] is %(hash)s", + segnum=len(self._plaintext_segment_hashes)-1, + hash=base32.b2a(p.digest()), + level=log.NOISY) + + offset += this_segment + + + def read_encrypted(self, length, hash_only): + # make sure our parameters have been set up first + d = self.get_all_encoding_parameters() + # and size + d.addCallback(lambda ignored: self.get_size()) + d.addCallback(lambda ignored: self._get_encryptor()) + # then fetch and encrypt the plaintext. 
The unusual structure here + # (passing a Deferred *into* a function) is needed to avoid + # overflowing the stack: Deferreds don't optimize out tail recursion. + # We also pass in a list, to which _read_encrypted will append + # ciphertext. + ciphertext = [] + d2 = defer.Deferred() + d.addCallback(lambda ignored: + self._read_encrypted(length, ciphertext, hash_only, d2)) + d.addCallback(lambda ignored: d2) + return d + + def _read_encrypted(self, remaining, ciphertext, hash_only, fire_when_done): + if not remaining: + fire_when_done.callback(ciphertext) + return None + # tolerate large length= values without consuming a lot of RAM by + # reading just a chunk (say 50kB) at a time. This only really matters + # when hash_only==True (i.e. resuming an interrupted upload), since + # that's the case where we will be skipping over a lot of data. + size = min(remaining, self.CHUNKSIZE) + remaining = remaining - size + # read a chunk of plaintext.. + d = defer.maybeDeferred(self.original.read, size) + # N.B.: if read() is synchronous, then since everything else is + # actually synchronous too, we'd blow the stack unless we stall for a + # tick. Once you accept a Deferred from IUploadable.read(), you must + # be prepared to have it fire immediately too. + d.addCallback(eventual.fireEventually) + def _good(plaintext): + # and encrypt it.. + # o/' over the fields we go, hashing all the way, sHA! sHA! sHA! o/' + ct = self._hash_and_encrypt_plaintext(plaintext, hash_only) + ciphertext.extend(ct) + self._read_encrypted(remaining, ciphertext, hash_only, + fire_when_done) + def _err(why): + fire_when_done.errback(why) + d.addCallback(_good) + d.addErrback(_err) + return None + + def _hash_and_encrypt_plaintext(self, data, hash_only): + assert isinstance(data, (tuple, list)), type(data) + data = list(data) + cryptdata = [] + # we use data.pop(0) instead of 'for chunk in data' to save + # memory: each chunk is destroyed as soon as we're done with it. 
+ bytes_processed = 0 + while data: + chunk = data.pop(0) + self.log(" read_encrypted handling %dB-sized chunk" % len(chunk), + level=log.NOISY) + bytes_processed += len(chunk) + self._plaintext_hasher.update(chunk) + self._update_segment_hash(chunk) + # TODO: we have to encrypt the data (even if hash_only==True) + # because pycryptopp's AES-CTR implementation doesn't offer a + # way to change the counter value. Once pycryptopp acquires + # this ability, change this to simply update the counter + # before each call to (hash_only==False) _encryptor.process() + ciphertext = self._encryptor.process(chunk) + if hash_only: + self.log(" skipping encryption", level=log.NOISY) + else: + cryptdata.append(ciphertext) + del ciphertext + del chunk + self._ciphertext_bytes_read += bytes_processed + if self._status: + progress = float(self._ciphertext_bytes_read) / self._file_size + self._status.set_progress(1, progress) + return cryptdata + + + def get_plaintext_hashtree_leaves(self, first, last, num_segments): + if len(self._plaintext_segment_hashes) < num_segments: + # close out the last one + assert len(self._plaintext_segment_hashes) == num_segments-1 + p, segment_left = self._get_segment_hasher() + self._plaintext_segment_hashes.append(p.digest()) + del self._plaintext_segment_hasher + self.log("closing plaintext leaf hasher, hashed %d bytes" % + self._plaintext_segment_hashed_bytes, + level=log.NOISY) + self.log(format="plaintext leaf hash [%(segnum)d] is %(hash)s", + segnum=len(self._plaintext_segment_hashes)-1, + hash=base32.b2a(p.digest()), + level=log.NOISY) + assert len(self._plaintext_segment_hashes) == num_segments + return defer.succeed(tuple(self._plaintext_segment_hashes[first:last])) + + def get_plaintext_hash(self): + h = self._plaintext_hasher.digest() + return defer.succeed(h) + + def close(self): + return self.original.close() + +class UploadStatus: + implements(IUploadStatus) + statusid_counter = itertools.count(0) + + def __init__(self): + 
self.storage_index = None + self.size = None + self.helper = False + self.status = "Not started" + self.progress = [0.0, 0.0, 0.0] + self.active = True + self.results = None + self.counter = self.statusid_counter.next() + self.started = time.time() + + def get_started(self): + return self.started + def get_storage_index(self): + return self.storage_index + def get_size(self): + return self.size + def using_helper(self): + return self.helper + def get_status(self): + return self.status + def get_progress(self): + return tuple(self.progress) + def get_active(self): + return self.active + def get_results(self): + return self.results + def get_counter(self): + return self.counter + + def set_storage_index(self, si): + self.storage_index = si + def set_size(self, size): + self.size = size + def set_helper(self, helper): + self.helper = helper + def set_status(self, status): + self.status = status + def set_progress(self, which, value): + # [0]: chk, [1]: ciphertext, [2]: encode+push + self.progress[which] = value + def set_active(self, value): + self.active = value + def set_results(self, value): + self.results = value + +class CHKUploader: + peer_selector_class = Tahoe2PeerSelector + + def __init__(self, client): + self._client = client + self._log_number = self._client.log("CHKUploader starting") + self._encoder = None + self._results = UploadResults() + self._storage_index = None + self._upload_status = UploadStatus() + self._upload_status.set_helper(False) + self._upload_status.set_active(True) + self._upload_status.set_results(self._results) + + def log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + if "facility" not in kwargs: + kwargs["facility"] = "tahoe.upload" + return self._client.log(*args, **kwargs) + + def start(self, uploadable): + """Start uploading the file. 
+ + This method returns a Deferred that will fire with the URI (a + string).""" + + self._started = time.time() + uploadable = IUploadable(uploadable) + self.log("starting upload of %s" % uploadable) + + eu = EncryptAnUploadable(uploadable, self._log_number) + eu.set_upload_status(self._upload_status) + d = self.start_encrypted(eu) + def _uploaded(res): + d1 = uploadable.get_encryption_key() + d1.addCallback(lambda key: self._compute_uri(res, key)) + return d1 + d.addCallback(_uploaded) + def _done(res): + self._upload_status.set_active(False) + return res + d.addBoth(_done) + return d + + def abort(self): + """Call this is the upload must be abandoned before it completes. + This will tell the shareholders to delete their partial shares. I + return a Deferred that fires when these messages have been acked.""" + if not self._encoder: + # how did you call abort() before calling start() ? + return defer.succeed(None) + return self._encoder.abort() + + def start_encrypted(self, encrypted): + eu = IEncryptedUploadable(encrypted) + + started = time.time() + self._encoder = e = encode.Encoder(self._log_number, + self._upload_status) + d = e.set_encrypted_uploadable(eu) + d.addCallback(self.locate_all_shareholders, started) + d.addCallback(self.set_shareholders, e) + d.addCallback(lambda res: e.start()) + d.addCallback(self._encrypted_done) + # this fires with the uri_extension_hash and other data + return d + + def locate_all_shareholders(self, encoder, started): + peer_selection_started = now = time.time() + self._storage_index_elapsed = now - started + storage_index = encoder.get_param("storage_index") + self._storage_index = storage_index + upload_id = storage.si_b2a(storage_index)[:5] + self.log("using storage index %s" % upload_id) + peer_selector = self.peer_selector_class(upload_id, self._log_number, + self._upload_status) + + share_size = encoder.get_param("share_size") + block_size = encoder.get_param("block_size") + num_segments = 
encoder.get_param("num_segments") + k,desired,n = encoder.get_param("share_counts") + + self._peer_selection_started = time.time() + d = peer_selector.get_shareholders(self._client, storage_index, + share_size, block_size, + num_segments, n, desired) + def _done(res): + self._peer_selection_elapsed = time.time() - peer_selection_started + return res + d.addCallback(_done) + return d + + def set_shareholders(self, (used_peers, already_peers), encoder): + """ + @param used_peers: a sequence of PeerTracker objects + @paran already_peers: a dict mapping sharenum to a peerid that + claims to already have this share + """ + self.log("_send_shares, used_peers is %s" % (used_peers,)) + # record already-present shares in self._results + for (shnum, peerid) in already_peers.items(): + peerid_s = idlib.shortnodeid_b2a(peerid) + self._results.sharemap[shnum] = "Found on [%s]" % peerid_s + if peerid not in self._results.servermap: + self._results.servermap[peerid] = set() + self._results.servermap[peerid].add(shnum) + self._results.preexisting_shares = len(already_peers) + + self._sharemap = {} + for peer in used_peers: + assert isinstance(peer, PeerTracker) + buckets = {} + for peer in used_peers: + buckets.update(peer.buckets) + for shnum in peer.buckets: + self._sharemap[shnum] = peer + assert len(buckets) == sum([len(peer.buckets) for peer in used_peers]) + encoder.set_shareholders(buckets) + + def _encrypted_done(self, res): + r = self._results + for shnum in self._encoder.get_shares_placed(): + peer_tracker = self._sharemap[shnum] + peerid = peer_tracker.peerid + peerid_s = idlib.shortnodeid_b2a(peerid) + r.sharemap[shnum] = "Placed on [%s]" % peerid_s + if peerid not in r.servermap: + r.servermap[peerid] = set() + r.servermap[peerid].add(shnum) + r.pushed_shares = len(self._encoder.get_shares_placed()) + now = time.time() + r.file_size = self._encoder.file_size + r.timings["total"] = now - self._started + r.timings["storage_index"] = self._storage_index_elapsed + 
r.timings["peer_selection"] = self._peer_selection_elapsed + r.timings.update(self._encoder.get_times()) + r.uri_extension_data = self._encoder.get_uri_extension_data() + return res + + def _compute_uri(self, (uri_extension_hash, + needed_shares, total_shares, size), + key): + u = uri.CHKFileURI(key=key, + uri_extension_hash=uri_extension_hash, + needed_shares=needed_shares, + total_shares=total_shares, + size=size, + ) + r = self._results + r.uri = u.to_string() + return r + + def get_upload_status(self): + return self._upload_status + +def read_this_many_bytes(uploadable, size, prepend_data=[]): + if size == 0: + return defer.succeed([]) + d = uploadable.read(size) + def _got(data): + assert isinstance(data, list) + bytes = sum([len(piece) for piece in data]) + assert bytes > 0 + assert bytes <= size + remaining = size - bytes + if remaining: + return read_this_many_bytes(uploadable, remaining, + prepend_data + data) + return prepend_data + data + d.addCallback(_got) + return d + +class LiteralUploader: + + def __init__(self, client): + self._client = client + self._results = UploadResults() + self._status = s = UploadStatus() + s.set_storage_index(None) + s.set_helper(False) + s.set_progress(0, 1.0) + s.set_active(False) + s.set_results(self._results) + + def start(self, uploadable): + uploadable = IUploadable(uploadable) + d = uploadable.get_size() + def _got_size(size): + self._size = size + self._status.set_size(size) + self._results.file_size = size + return read_this_many_bytes(uploadable, size) + d.addCallback(_got_size) + d.addCallback(lambda data: uri.LiteralFileURI("".join(data))) + d.addCallback(lambda u: u.to_string()) + d.addCallback(self._build_results) + return d + + def _build_results(self, uri): + self._results.uri = uri + self._status.set_status("Done") + self._status.set_progress(1, 1.0) + self._status.set_progress(2, 1.0) + return self._results + + def close(self): + pass + + def get_upload_status(self): + return self._status + +class 
RemoteEncryptedUploadable(Referenceable): + implements(RIEncryptedUploadable) + + def __init__(self, encrypted_uploadable, upload_status): + self._eu = IEncryptedUploadable(encrypted_uploadable) + self._offset = 0 + self._bytes_sent = 0 + self._status = IUploadStatus(upload_status) + # we are responsible for updating the status string while we run, and + # for setting the ciphertext-fetch progress. + self._size = None + + def get_size(self): + if self._size is not None: + return defer.succeed(self._size) + d = self._eu.get_size() + def _got_size(size): + self._size = size + return size + d.addCallback(_got_size) + return d + + def remote_get_size(self): + return self.get_size() + def remote_get_all_encoding_parameters(self): + return self._eu.get_all_encoding_parameters() + + def _read_encrypted(self, length, hash_only): + d = self._eu.read_encrypted(length, hash_only) + def _read(strings): + if hash_only: + self._offset += length + else: + size = sum([len(data) for data in strings]) + self._offset += size + return strings + d.addCallback(_read) + return d + + def remote_read_encrypted(self, offset, length): + # we don't support seek backwards, but we allow skipping forwards + precondition(offset >= 0, offset) + precondition(length >= 0, length) + lp = log.msg("remote_read_encrypted(%d-%d)" % (offset, offset+length), + level=log.NOISY) + precondition(offset >= self._offset, offset, self._offset) + if offset > self._offset: + # read the data from disk anyways, to build up the hash tree + skip = offset - self._offset + log.msg("remote_read_encrypted skipping ahead from %d to %d, skip=%d" % + (self._offset, offset, skip), level=log.UNUSUAL, parent=lp) + d = self._read_encrypted(skip, hash_only=True) + else: + d = defer.succeed(None) + + def _at_correct_offset(res): + assert offset == self._offset, "%d != %d" % (offset, self._offset) + return self._read_encrypted(length, hash_only=False) + d.addCallback(_at_correct_offset) + + def _read(strings): + size = 
sum([len(data) for data in strings]) + self._bytes_sent += size + return strings + d.addCallback(_read) + return d + + def remote_get_plaintext_hashtree_leaves(self, first, last, num_segments): + log.msg("remote_get_plaintext_hashtree_leaves: %d-%d of %d" % + (first, last-1, num_segments), + level=log.NOISY) + d = self._eu.get_plaintext_hashtree_leaves(first, last, num_segments) + d.addCallback(list) + return d + def remote_get_plaintext_hash(self): + return self._eu.get_plaintext_hash() + def remote_close(self): + return self._eu.close() + + +class AssistedUploader: + + def __init__(self, helper): + self._helper = helper + self._log_number = log.msg("AssistedUploader starting") + self._storage_index = None + self._upload_status = s = UploadStatus() + s.set_helper(True) + s.set_active(True) + + def log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + return log.msg(*args, **kwargs) + + def start(self, uploadable): + self._started = time.time() + u = IUploadable(uploadable) + eu = EncryptAnUploadable(u, self._log_number) + eu.set_upload_status(self._upload_status) + self._encuploadable = eu + d = eu.get_size() + d.addCallback(self._got_size) + d.addCallback(lambda res: eu.get_all_encoding_parameters()) + d.addCallback(self._got_all_encoding_parameters) + # when we get the encryption key, that will also compute the storage + # index, so this only takes one pass. + # TODO: I'm not sure it's cool to switch back and forth between + # the Uploadable and the IEncryptedUploadable that wraps it. 
+ d.addCallback(lambda res: u.get_encryption_key()) + d.addCallback(self._got_encryption_key) + d.addCallback(lambda res: eu.get_storage_index()) + d.addCallback(self._got_storage_index) + d.addCallback(self._contact_helper) + d.addCallback(self._build_readcap) + def _done(res): + self._upload_status.set_active(False) + return res + d.addBoth(_done) + return d + + def _got_size(self, size): + self._size = size + self._upload_status.set_size(size) + + def _got_all_encoding_parameters(self, params): + k, happy, n, segment_size = params + # stash these for URI generation later + self._needed_shares = k + self._total_shares = n + self._segment_size = segment_size + + def _got_encryption_key(self, key): + self._key = key + + def _got_storage_index(self, storage_index): + self._storage_index = storage_index + + + def _contact_helper(self, res): + now = self._time_contacting_helper_start = time.time() + self._storage_index_elapsed = now - self._started + self.log(format="contacting helper for SI %(si)s..", + si=storage.si_b2a(self._storage_index)) + self._upload_status.set_status("Contacting Helper") + d = self._helper.callRemote("upload_chk", self._storage_index) + d.addCallback(self._contacted_helper) + return d + + def _contacted_helper(self, (upload_results, upload_helper)): + now = time.time() + elapsed = now - self._time_contacting_helper_start + self._elapsed_time_contacting_helper = elapsed + if upload_helper: + self.log("helper says we need to upload") + self._upload_status.set_status("Uploading Ciphertext") + # we need to upload the file + reu = RemoteEncryptedUploadable(self._encuploadable, + self._upload_status) + # let it pre-compute the size for progress purposes + d = reu.get_size() + d.addCallback(lambda ignored: + upload_helper.callRemote("upload", reu)) + # this Deferred will fire with the upload results + return d + self.log("helper says file is already uploaded") + self._upload_status.set_progress(1, 1.0) + 
self._upload_status.set_results(upload_results) + return upload_results + + def _build_readcap(self, upload_results): + self.log("upload finished, building readcap") + self._upload_status.set_status("Building Readcap") + r = upload_results + assert r.uri_extension_data["needed_shares"] == self._needed_shares + assert r.uri_extension_data["total_shares"] == self._total_shares + assert r.uri_extension_data["segment_size"] == self._segment_size + assert r.uri_extension_data["size"] == self._size + u = uri.CHKFileURI(key=self._key, + uri_extension_hash=r.uri_extension_hash, + needed_shares=self._needed_shares, + total_shares=self._total_shares, + size=self._size, + ) + r.uri = u.to_string() + now = time.time() + r.file_size = self._size + r.timings["storage_index"] = self._storage_index_elapsed + r.timings["contacting_helper"] = self._elapsed_time_contacting_helper + if "total" in r.timings: + r.timings["helper_total"] = r.timings["total"] + r.timings["total"] = now - self._started + self._upload_status.set_status("Done") + self._upload_status.set_results(r) + return r + + def get_upload_status(self): + return self._upload_status + +class BaseUploadable: + default_max_segment_size = 128*KiB # overridden by max_segment_size + default_encoding_param_k = 3 # overridden by encoding_parameters + default_encoding_param_happy = 7 + default_encoding_param_n = 10 + + max_segment_size = None + encoding_param_k = None + encoding_param_happy = None + encoding_param_n = None + + _all_encoding_parameters = None + _status = None + + def set_upload_status(self, upload_status): + self._status = IUploadStatus(upload_status) + + def set_default_encoding_parameters(self, default_params): + assert isinstance(default_params, dict) + for k,v in default_params.items(): + precondition(isinstance(k, str), k, v) + precondition(isinstance(v, int), k, v) + if "k" in default_params: + self.default_encoding_param_k = default_params["k"] + if "happy" in default_params: + 
self.default_encoding_param_happy = default_params["happy"] + if "n" in default_params: + self.default_encoding_param_n = default_params["n"] + if "max_segment_size" in default_params: + self.default_max_segment_size = default_params["max_segment_size"] + + def get_all_encoding_parameters(self): + if self._all_encoding_parameters: + return defer.succeed(self._all_encoding_parameters) + + max_segsize = self.max_segment_size or self.default_max_segment_size + k = self.encoding_param_k or self.default_encoding_param_k + happy = self.encoding_param_happy or self.default_encoding_param_happy + n = self.encoding_param_n or self.default_encoding_param_n + + d = self.get_size() + def _got_size(file_size): + # for small files, shrink the segment size to avoid wasting space + segsize = min(max_segsize, file_size) + # this must be a multiple of 'required_shares'==k + segsize = mathutil.next_multiple(segsize, k) + encoding_parameters = (k, happy, n, segsize) + self._all_encoding_parameters = encoding_parameters + return encoding_parameters + d.addCallback(_got_size) + return d + +class FileHandle(BaseUploadable): + implements(IUploadable) + + def __init__(self, filehandle, convergence): + """ + Upload the data from the filehandle. If convergence is None then a + random encryption key will be used, else the plaintext will be hashed, + then the hash will be hashed together with the string in the + "convergence" argument to form the encryption key. 
+ """ + assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) + self._filehandle = filehandle + self._key = None + self.convergence = convergence + self._size = None + + def _get_encryption_key_convergent(self): + if self._key is not None: + return defer.succeed(self._key) + + d = self.get_size() + # that sets self._size as a side-effect + d.addCallback(lambda size: self.get_all_encoding_parameters()) + def _got(params): + k, happy, n, segsize = params + f = self._filehandle + enckey_hasher = convergence_hasher(k, n, segsize, self.convergence) + f.seek(0) + BLOCKSIZE = 64*1024 + bytes_read = 0 + while True: + data = f.read(BLOCKSIZE) + if not data: + break + enckey_hasher.update(data) + # TODO: setting progress in a non-yielding loop is kind of + # pointless, but I'm anticipating (perhaps prematurely) the + # day when we use a slowjob or twisted's CooperatorService to + # make this yield time to other jobs. + bytes_read += len(data) + if self._status: + self._status.set_progress(0, float(bytes_read)/self._size) + f.seek(0) + self._key = enckey_hasher.digest() + if self._status: + self._status.set_progress(0, 1.0) + assert len(self._key) == 16 + return self._key + d.addCallback(_got) + return d + + def _get_encryption_key_random(self): + if self._key is None: + self._key = os.urandom(16) + return defer.succeed(self._key) + + def get_encryption_key(self): + if self.convergence is not None: + return self._get_encryption_key_convergent() + else: + return self._get_encryption_key_random() + + def get_size(self): + if self._size is not None: + return defer.succeed(self._size) + self._filehandle.seek(0,2) + size = self._filehandle.tell() + self._size = size + self._filehandle.seek(0) + return defer.succeed(size) + + def read(self, length): + return defer.succeed([self._filehandle.read(length)]) + + def close(self): + # the originator of the filehandle reserves the right to close it + pass + +class FileName(FileHandle): + def 
__init__(self, filename, convergence): + """ + Upload the data from the filename. If convergence is None then a + random encryption key will be used, else the plaintext will be hashed, + then the hash will be hashed together with the string in the + "convergence" argument to form the encryption key. + """ + assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) + FileHandle.__init__(self, open(filename, "rb"), convergence=convergence) + def close(self): + FileHandle.close(self) + self._filehandle.close() + +class Data(FileHandle): + def __init__(self, data, convergence): + """ + Upload the data from the data argument. If convergence is None then a + random encryption key will be used, else the plaintext will be hashed, + then the hash will be hashed together with the string in the + "convergence" argument to form the encryption key. + """ + assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) + FileHandle.__init__(self, StringIO(data), convergence=convergence) + +class Uploader(service.MultiService): + """I am a service that allows file uploading. I am a service-child of the + Client. 
+ """ + implements(IUploader) + name = "uploader" + uploader_class = CHKUploader + URI_LIT_SIZE_THRESHOLD = 55 + MAX_UPLOAD_STATUSES = 10 + + def __init__(self, helper_furl=None, stats_provider=None): + self._helper_furl = helper_furl + self.stats_provider = stats_provider + self._helper = None + self._all_uploads = weakref.WeakKeyDictionary() # for debugging + self._all_upload_statuses = weakref.WeakKeyDictionary() + self._recent_upload_statuses = [] + service.MultiService.__init__(self) + + def startService(self): + service.MultiService.startService(self) + if self._helper_furl: + self.parent.tub.connectTo(self._helper_furl, + self._got_helper) + + def _got_helper(self, helper): + self._helper = helper + helper.notifyOnDisconnect(self._lost_helper) + def _lost_helper(self): + self._helper = None + + def get_helper_info(self): + # return a tuple of (helper_furl_or_None, connected_bool) + return (self._helper_furl, bool(self._helper)) + + def upload(self, uploadable): + # this returns the URI + assert self.parent + assert self.running + + uploadable = IUploadable(uploadable) + d = uploadable.get_size() + def _got_size(size): + default_params = self.parent.get_encoding_parameters() + precondition(isinstance(default_params, dict), default_params) + precondition("max_segment_size" in default_params, default_params) + uploadable.set_default_encoding_parameters(default_params) + + if self.stats_provider: + self.stats_provider.count('uploader.files_uploaded', 1) + self.stats_provider.count('uploader.bytes_uploaded', size) + + if size <= self.URI_LIT_SIZE_THRESHOLD: + uploader = LiteralUploader(self.parent) + elif self._helper: + uploader = AssistedUploader(self._helper) + else: + uploader = self.uploader_class(self.parent) + self._add_upload(uploader) + return uploader.start(uploadable) + d.addCallback(_got_size) + def _done(res): + uploadable.close() + return res + d.addBoth(_done) + return d + + def _add_upload(self, uploader): + s = uploader.get_upload_status() + 
self._all_uploads[uploader] = None + self._all_upload_statuses[s] = None + self._recent_upload_statuses.append(s) + while len(self._recent_upload_statuses) > self.MAX_UPLOAD_STATUSES: + self._recent_upload_statuses.pop(0) + + def list_all_upload_statuses(self): + for us in self._all_upload_statuses: + yield us diff --git a/src/allmydata/mutable/node.py b/src/allmydata/mutable/node.py index 368b9962..cd9f9763 100644 --- a/src/allmydata/mutable/node.py +++ b/src/allmydata/mutable/node.py @@ -10,7 +10,7 @@ from allmydata.interfaces import IMutableFileNode, IMutableFileURI, ICheckable from allmydata.util import hashutil from allmydata.util.assertutil import precondition from allmydata.uri import WriteableSSKFileURI -from allmydata.encode import NotEnoughSharesError +from allmydata.immutable.encode import NotEnoughSharesError from pycryptopp.publickey import rsa from pycryptopp.cipher.aes import AES diff --git a/src/allmydata/mutable/retrieve.py b/src/allmydata/mutable/retrieve.py index 66a049a4..b844bcf0 100644 --- a/src/allmydata/mutable/retrieve.py +++ b/src/allmydata/mutable/retrieve.py @@ -8,7 +8,7 @@ from foolscap.eventual import eventually, fireEventually from allmydata.interfaces import IRetrieveStatus from allmydata.util import hashutil, idlib, log from allmydata import hashtree, codec, storage -from allmydata.encode import NotEnoughSharesError +from allmydata.immutable.encode import NotEnoughSharesError from pycryptopp.cipher.aes import AES from common import DictOfSets, CorruptShareError, UncoordinatedWriteError diff --git a/src/allmydata/offloaded.py b/src/allmydata/offloaded.py index d4c94a1a..06a5345b 100644 --- a/src/allmydata/offloaded.py +++ b/src/allmydata/offloaded.py @@ -5,7 +5,8 @@ from twisted.application import service from twisted.internet import defer from foolscap import Referenceable from foolscap.eventual import eventually -from allmydata import upload, interfaces, storage, uri +from allmydata import interfaces, storage, uri +from 
allmydata.immutable import upload from allmydata.util import idlib, log, observer, fileutil, hashutil diff --git a/src/allmydata/test/check_memory.py b/src/allmydata/test/check_memory.py index 908e21ec..ddfa4337 100644 --- a/src/allmydata/test/check_memory.py +++ b/src/allmydata/test/check_memory.py @@ -5,7 +5,8 @@ from cStringIO import StringIO from twisted.internet import defer, reactor, protocol, error from twisted.application import service, internet from twisted.web import client as tw_client -from allmydata import client, introducer, upload +from allmydata import client, introducer +from allmydata.immutable import upload from allmydata.scripts import create_node from allmydata.util import testutil, fileutil import foolscap diff --git a/src/allmydata/test/common.py b/src/allmydata/test/common.py index ce4569a3..09698883 100644 --- a/src/allmydata/test/common.py +++ b/src/allmydata/test/common.py @@ -4,10 +4,11 @@ from zope.interface import implements from twisted.internet import defer from twisted.python import failure from twisted.application import service -from allmydata import uri, dirnode, checker +from allmydata import uri, dirnode from allmydata.interfaces import IURI, IMutableFileNode, IFileNode, \ FileTooLargeError -from allmydata.encode import NotEnoughSharesError +from allmydata.immutable import checker +from allmydata.immutable.encode import NotEnoughSharesError from allmydata.util import log class FakeCHKFileNode: diff --git a/src/allmydata/test/test_dirnode.py b/src/allmydata/test/test_dirnode.py index 9ef0af5a..4aba00c4 100644 --- a/src/allmydata/test/test_dirnode.py +++ b/src/allmydata/test/test_dirnode.py @@ -2,7 +2,8 @@ import time from zope.interface import implements from twisted.trial import unittest -from allmydata import uri, dirnode, upload +from allmydata import uri, dirnode +from allmydata.immutable import upload from allmydata.interfaces import IURI, IClient, IMutableFileNode, \ INewDirectoryURI, IReadonlyNewDirectoryURI, IFileNode, 
ExistingChildError from allmydata.util import hashutil, testutil diff --git a/src/allmydata/test/test_encode.py b/src/allmydata/test/test_encode.py index 6ba0e6db..bb9ba969 100644 --- a/src/allmydata/test/test_encode.py +++ b/src/allmydata/test/test_encode.py @@ -5,7 +5,8 @@ from twisted.internet import defer, reactor from twisted.internet.interfaces import IConsumer from twisted.python.failure import Failure from foolscap import eventual -from allmydata import encode, upload, download, hashtree, uri +from allmydata import hashtree, uri +from allmydata.immutable import encode, upload, download from allmydata.util import hashutil, testutil from allmydata.util.assertutil import _assert from allmydata.interfaces import IStorageBucketWriter, IStorageBucketReader diff --git a/src/allmydata/test/test_filenode.py b/src/allmydata/test/test_filenode.py index 39f7abb5..a08e97bf 100644 --- a/src/allmydata/test/test_filenode.py +++ b/src/allmydata/test/test_filenode.py @@ -1,6 +1,7 @@ from twisted.trial import unittest -from allmydata import filenode, uri, download +from allmydata import uri +from allmydata.immutable import filenode, download from allmydata.mutable.node import MutableFileNode from allmydata.util import hashutil diff --git a/src/allmydata/test/test_helper.py b/src/allmydata/test/test_helper.py index d72b9514..2f61eab2 100644 --- a/src/allmydata/test/test_helper.py +++ b/src/allmydata/test/test_helper.py @@ -6,7 +6,8 @@ from twisted.application import service from foolscap import Tub, eventual from foolscap.logging import log -from allmydata import offloaded, storage, upload +from allmydata import offloaded, storage +from allmydata.immutable import upload from allmydata.util import hashutil, fileutil, mathutil from pycryptopp.cipher.aes import AES diff --git a/src/allmydata/test/test_mutable.py b/src/allmydata/test/test_mutable.py index eb80b181..3ce59502 100644 --- a/src/allmydata/test/test_mutable.py +++ b/src/allmydata/test/test_mutable.py @@ -4,12 +4,13 @@ 
from cStringIO import StringIO from twisted.trial import unittest from twisted.internet import defer, reactor from twisted.python import failure -from allmydata import uri, download, storage +from allmydata import uri, storage +from allmydata.immutable import download +from allmydata.immutable.encode import NotEnoughSharesError from allmydata.util import base32, testutil, idlib from allmydata.util.idlib import shortnodeid_b2a from allmydata.util.hashutil import tagged_hash from allmydata.util.fileutil import make_dirs -from allmydata.encode import NotEnoughSharesError from allmydata.interfaces import IURI, IMutableFileURI, IUploadable, \ FileTooLargeError from foolscap.eventual import eventually, fireEventually diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py index 2b3e7a1a..237b8920 100644 --- a/src/allmydata/test/test_system.py +++ b/src/allmydata/test/test_system.py @@ -8,8 +8,8 @@ from twisted.internet import threads # CLI tests use deferToThread from twisted.internet.error import ConnectionDone, ConnectionLost from twisted.application import service import allmydata -from allmydata import client, uri, download, upload, storage, offloaded, \ - filenode +from allmydata import client, uri, storage, offloaded +from allmydata.immutable import download, upload, filenode from allmydata.introducer.server import IntroducerNode from allmydata.util import fileutil, idlib, mathutil, testutil from allmydata.util import log, base32 diff --git a/src/allmydata/test/test_upload.py b/src/allmydata/test/test_upload.py index 7dd7e008..221c985b 100644 --- a/src/allmydata/test/test_upload.py +++ b/src/allmydata/test/test_upload.py @@ -6,7 +6,8 @@ from twisted.python import log from twisted.internet import defer from cStringIO import StringIO -from allmydata import upload, encode, uri +from allmydata import uri +from allmydata.immutable import upload, encode from allmydata.interfaces import IFileURI, FileTooLargeError from allmydata.util.assertutil 
import precondition from allmydata.util.deferredutil import DeferredListShouldSucceed diff --git a/src/allmydata/test/test_web.py b/src/allmydata/test/test_web.py index 326689a3..6ef18678 100644 --- a/src/allmydata/test/test_web.py +++ b/src/allmydata/test/test_web.py @@ -5,7 +5,8 @@ from twisted.trial import unittest from twisted.internet import defer, reactor from twisted.web import client, error, http from twisted.python import failure, log -from allmydata import interfaces, provisioning, uri, webish, upload, download +from allmydata import interfaces, provisioning, uri, webish +from allmydata.immutable import upload, download from allmydata.web import status, common from allmydata.util import fileutil from allmydata.test.common import FakeDirectoryNode, FakeCHKFileNode, \ diff --git a/src/allmydata/upload.py b/src/allmydata/upload.py deleted file mode 100644 index 714e7a53..00000000 --- a/src/allmydata/upload.py +++ /dev/null @@ -1,1266 +0,0 @@ - -import os, time, weakref, itertools -from zope.interface import implements -from twisted.python import failure -from twisted.internet import defer -from twisted.application import service -from foolscap import Referenceable, Copyable, RemoteCopy -from foolscap import eventual -from foolscap.logging import log - -from allmydata.util.hashutil import file_renewal_secret_hash, \ - file_cancel_secret_hash, bucket_renewal_secret_hash, \ - bucket_cancel_secret_hash, plaintext_hasher, \ - storage_index_hash, plaintext_segment_hasher, convergence_hasher -from allmydata import encode, storage, hashtree, uri -from allmydata.util import base32, idlib, mathutil -from allmydata.util.assertutil import precondition -from allmydata.interfaces import IUploadable, IUploader, IUploadResults, \ - IEncryptedUploadable, RIEncryptedUploadable, IUploadStatus -from pycryptopp.cipher.aes import AES - -from cStringIO import StringIO - - -KiB=1024 -MiB=1024*KiB -GiB=1024*MiB -TiB=1024*GiB -PiB=1024*TiB - -class HaveAllPeersError(Exception): - # 
we use this to jump out of the loop - pass - -# this wants to live in storage, not here -class TooFullError(Exception): - pass - -class UploadResults(Copyable, RemoteCopy): - implements(IUploadResults) - typeToCopy = "allmydata.upload.UploadResults.tahoe.allmydata.com" - copytype = typeToCopy - - def __init__(self): - self.timings = {} # dict of name to number of seconds - self.sharemap = {} # dict of shnum to placement string - self.servermap = {} # dict of peerid to set(shnums) - self.file_size = None - self.ciphertext_fetched = None # how much the helper fetched - self.uri = None - self.preexisting_shares = None # count of shares already present - self.pushed_shares = None # count of shares we pushed - - -# our current uri_extension is 846 bytes for small files, a few bytes -# more for larger ones (since the filesize is encoded in decimal in a -# few places). Ask for a little bit more just in case we need it. If -# the extension changes size, we can change EXTENSION_SIZE to -# allocate a more accurate amount of space. -EXTENSION_SIZE = 1000 -# TODO: actual extensions are closer to 419 bytes, so we can probably lower -# this. 
- -class PeerTracker: - def __init__(self, peerid, storage_server, - sharesize, blocksize, num_segments, num_share_hashes, - storage_index, - bucket_renewal_secret, bucket_cancel_secret): - precondition(isinstance(peerid, str), peerid) - precondition(len(peerid) == 20, peerid) - self.peerid = peerid - self._storageserver = storage_server # to an RIStorageServer - self.buckets = {} # k: shareid, v: IRemoteBucketWriter - self.sharesize = sharesize - as = storage.allocated_size(sharesize, - num_segments, - num_share_hashes, - EXTENSION_SIZE) - self.allocated_size = as - - self.blocksize = blocksize - self.num_segments = num_segments - self.num_share_hashes = num_share_hashes - self.storage_index = storage_index - - self.renew_secret = bucket_renewal_secret - self.cancel_secret = bucket_cancel_secret - - def __repr__(self): - return ("" - % (idlib.shortnodeid_b2a(self.peerid), - storage.si_b2a(self.storage_index)[:5])) - - def query(self, sharenums): - d = self._storageserver.callRemote("allocate_buckets", - self.storage_index, - self.renew_secret, - self.cancel_secret, - sharenums, - self.allocated_size, - canary=Referenceable()) - d.addCallback(self._got_reply) - return d - - def _got_reply(self, (alreadygot, buckets)): - #log.msg("%s._got_reply(%s)" % (self, (alreadygot, buckets))) - b = {} - for sharenum, rref in buckets.iteritems(): - bp = storage.WriteBucketProxy(rref, self.sharesize, - self.blocksize, - self.num_segments, - self.num_share_hashes, - EXTENSION_SIZE, - self.peerid) - b[sharenum] = bp - self.buckets.update(b) - return (alreadygot, set(b.keys())) - -class Tahoe2PeerSelector: - - def __init__(self, upload_id, logparent=None, upload_status=None): - self.upload_id = upload_id - self.query_count, self.good_query_count, self.bad_query_count = 0,0,0 - self.error_count = 0 - self.num_peers_contacted = 0 - self.last_failure_msg = None - self._status = IUploadStatus(upload_status) - self._log_parent = log.msg("%s starting" % self, parent=logparent) - - def 
__repr__(self): - return "" % self.upload_id - - def get_shareholders(self, client, - storage_index, share_size, block_size, - num_segments, total_shares, shares_of_happiness): - """ - @return: (used_peers, already_peers), where used_peers is a set of - PeerTracker instances that have agreed to hold some shares - for us (the shnum is stashed inside the PeerTracker), - and already_peers is a dict mapping shnum to a peer - which claims to already have the share. - """ - - if self._status: - self._status.set_status("Contacting Peers..") - - self.total_shares = total_shares - self.shares_of_happiness = shares_of_happiness - - self.homeless_shares = range(total_shares) - # self.uncontacted_peers = list() # peers we haven't asked yet - self.contacted_peers = [] # peers worth asking again - self.contacted_peers2 = [] # peers that we have asked again - self._started_second_pass = False - self.use_peers = set() # PeerTrackers that have shares assigned to them - self.preexisting_shares = {} # sharenum -> peerid holding the share - - peers = client.get_permuted_peers("storage", storage_index) - if not peers: - raise encode.NotEnoughSharesError("client gave us zero peers") - - # figure out how much space to ask for - - # this needed_hashes computation should mirror - # Encoder.send_all_share_hash_trees. We use an IncompleteHashTree - # (instead of a HashTree) because we don't require actual hashing - # just to count the levels. - ht = hashtree.IncompleteHashTree(total_shares) - num_share_hashes = len(ht.needed_hashes(0, include_leaf=True)) - - # decide upon the renewal/cancel secrets, to include them in the - # allocat_buckets query. 
- client_renewal_secret = client.get_renewal_secret() - client_cancel_secret = client.get_cancel_secret() - - file_renewal_secret = file_renewal_secret_hash(client_renewal_secret, - storage_index) - file_cancel_secret = file_cancel_secret_hash(client_cancel_secret, - storage_index) - - trackers = [ PeerTracker(peerid, conn, - share_size, block_size, - num_segments, num_share_hashes, - storage_index, - bucket_renewal_secret_hash(file_renewal_secret, - peerid), - bucket_cancel_secret_hash(file_cancel_secret, - peerid), - ) - for (peerid, conn) in peers ] - self.uncontacted_peers = trackers - - d = defer.maybeDeferred(self._loop) - return d - - def _loop(self): - if not self.homeless_shares: - # all done - msg = ("placed all %d shares, " - "sent %d queries to %d peers, " - "%d queries placed some shares, %d placed none, " - "got %d errors" % - (self.total_shares, - self.query_count, self.num_peers_contacted, - self.good_query_count, self.bad_query_count, - self.error_count)) - log.msg("peer selection successful for %s: %s" % (self, msg), - parent=self._log_parent) - return (self.use_peers, self.preexisting_shares) - - if self.uncontacted_peers: - peer = self.uncontacted_peers.pop(0) - # TODO: don't pre-convert all peerids to PeerTrackers - assert isinstance(peer, PeerTracker) - - shares_to_ask = set([self.homeless_shares.pop(0)]) - self.query_count += 1 - self.num_peers_contacted += 1 - if self._status: - self._status.set_status("Contacting Peers [%s] (first query)," - " %d shares left.." - % (idlib.shortnodeid_b2a(peer.peerid), - len(self.homeless_shares))) - d = peer.query(shares_to_ask) - d.addBoth(self._got_response, peer, shares_to_ask, - self.contacted_peers) - return d - elif self.contacted_peers: - # ask a peer that we've already asked. 
- if not self._started_second_pass: - log.msg("starting second pass", parent=self._log_parent, - level=log.NOISY) - self._started_second_pass = True - num_shares = mathutil.div_ceil(len(self.homeless_shares), - len(self.contacted_peers)) - peer = self.contacted_peers.pop(0) - shares_to_ask = set(self.homeless_shares[:num_shares]) - self.homeless_shares[:num_shares] = [] - self.query_count += 1 - if self._status: - self._status.set_status("Contacting Peers [%s] (second query)," - " %d shares left.." - % (idlib.shortnodeid_b2a(peer.peerid), - len(self.homeless_shares))) - d = peer.query(shares_to_ask) - d.addBoth(self._got_response, peer, shares_to_ask, - self.contacted_peers2) - return d - elif self.contacted_peers2: - # we've finished the second-or-later pass. Move all the remaining - # peers back into self.contacted_peers for the next pass. - self.contacted_peers.extend(self.contacted_peers2) - self.contacted_peers[:] = [] - return self._loop() - else: - # no more peers. If we haven't placed enough shares, we fail. 
- placed_shares = self.total_shares - len(self.homeless_shares) - if placed_shares < self.shares_of_happiness: - msg = ("placed %d shares out of %d total (%d homeless), " - "sent %d queries to %d peers, " - "%d queries placed some shares, %d placed none, " - "got %d errors" % - (self.total_shares - len(self.homeless_shares), - self.total_shares, len(self.homeless_shares), - self.query_count, self.num_peers_contacted, - self.good_query_count, self.bad_query_count, - self.error_count)) - msg = "peer selection failed for %s: %s" % (self, msg) - if self.last_failure_msg: - msg += " (%s)" % (self.last_failure_msg,) - log.msg(msg, level=log.UNUSUAL, parent=self._log_parent) - raise encode.NotEnoughSharesError(msg) - else: - # we placed enough to be happy, so we're done - if self._status: - self._status.set_status("Placed all shares") - return self.use_peers - - def _got_response(self, res, peer, shares_to_ask, put_peer_here): - if isinstance(res, failure.Failure): - # This is unusual, and probably indicates a bug or a network - # problem. - log.msg("%s got error during peer selection: %s" % (peer, res), - level=log.UNUSUAL, parent=self._log_parent) - self.error_count += 1 - self.homeless_shares = list(shares_to_ask) + self.homeless_shares - if (self.uncontacted_peers - or self.contacted_peers - or self.contacted_peers2): - # there is still hope, so just loop - pass - else: - # No more peers, so this upload might fail (it depends upon - # whether we've hit shares_of_happiness or not). 
Log the last - # failure we got: if a coding error causes all peers to fail - # in the same way, this allows the common failure to be seen - # by the uploader and should help with debugging - msg = ("last failure (from %s) was: %s" % (peer, res)) - self.last_failure_msg = msg - else: - (alreadygot, allocated) = res - log.msg("response from peer %s: alreadygot=%s, allocated=%s" - % (idlib.shortnodeid_b2a(peer.peerid), - tuple(sorted(alreadygot)), tuple(sorted(allocated))), - level=log.NOISY, parent=self._log_parent) - progress = False - for s in alreadygot: - self.preexisting_shares[s] = peer.peerid - if s in self.homeless_shares: - self.homeless_shares.remove(s) - progress = True - - # the PeerTracker will remember which shares were allocated on - # that peer. We just have to remember to use them. - if allocated: - self.use_peers.add(peer) - progress = True - - not_yet_present = set(shares_to_ask) - set(alreadygot) - still_homeless = not_yet_present - set(allocated) - - if progress: - # they accepted or already had at least one share, so - # progress has been made - self.good_query_count += 1 - else: - self.bad_query_count += 1 - - if still_homeless: - # In networks with lots of space, this is very unusual and - # probably indicates an error. In networks with peers that - # are full, it is merely unusual. In networks that are very - # full, it is common, and many uploads will fail. In most - # cases, this is obviously not fatal, and we'll just use some - # other peers. - - # some shares are still homeless, keep trying to find them a - # home. The ones that were rejected get first priority. - self.homeless_shares = (list(still_homeless) - + self.homeless_shares) - # Since they were unable to accept all of our requests, so it - # is safe to assume that asking them again won't help. - else: - # if they *were* able to accept everything, they might be - # willing to accept even more. 
- put_peer_here.append(peer) - - # now loop - return self._loop() - - -class EncryptAnUploadable: - """This is a wrapper that takes an IUploadable and provides - IEncryptedUploadable.""" - implements(IEncryptedUploadable) - CHUNKSIZE = 50*1024 - - def __init__(self, original, log_parent=None): - self.original = IUploadable(original) - self._log_number = log_parent - self._encryptor = None - self._plaintext_hasher = plaintext_hasher() - self._plaintext_segment_hasher = None - self._plaintext_segment_hashes = [] - self._encoding_parameters = None - self._file_size = None - self._ciphertext_bytes_read = 0 - self._status = None - - def set_upload_status(self, upload_status): - self._status = IUploadStatus(upload_status) - self.original.set_upload_status(upload_status) - - def log(self, *args, **kwargs): - if "facility" not in kwargs: - kwargs["facility"] = "upload.encryption" - if "parent" not in kwargs: - kwargs["parent"] = self._log_number - return log.msg(*args, **kwargs) - - def get_size(self): - if self._file_size is not None: - return defer.succeed(self._file_size) - d = self.original.get_size() - def _got_size(size): - self._file_size = size - if self._status: - self._status.set_size(size) - return size - d.addCallback(_got_size) - return d - - def get_all_encoding_parameters(self): - if self._encoding_parameters is not None: - return defer.succeed(self._encoding_parameters) - d = self.original.get_all_encoding_parameters() - def _got(encoding_parameters): - (k, happy, n, segsize) = encoding_parameters - self._segment_size = segsize # used by segment hashers - self._encoding_parameters = encoding_parameters - self.log("my encoding parameters: %s" % (encoding_parameters,), - level=log.NOISY) - return encoding_parameters - d.addCallback(_got) - return d - - def _get_encryptor(self): - if self._encryptor: - return defer.succeed(self._encryptor) - - d = self.original.get_encryption_key() - def _got(key): - e = AES(key) - self._encryptor = e - - storage_index = 
storage_index_hash(key) - assert isinstance(storage_index, str) - # There's no point to having the SI be longer than the key, so we - # specify that it is truncated to the same 128 bits as the AES key. - assert len(storage_index) == 16 # SHA-256 truncated to 128b - self._storage_index = storage_index - if self._status: - self._status.set_storage_index(storage_index) - return e - d.addCallback(_got) - return d - - def get_storage_index(self): - d = self._get_encryptor() - d.addCallback(lambda res: self._storage_index) - return d - - def _get_segment_hasher(self): - p = self._plaintext_segment_hasher - if p: - left = self._segment_size - self._plaintext_segment_hashed_bytes - return p, left - p = plaintext_segment_hasher() - self._plaintext_segment_hasher = p - self._plaintext_segment_hashed_bytes = 0 - return p, self._segment_size - - def _update_segment_hash(self, chunk): - offset = 0 - while offset < len(chunk): - p, segment_left = self._get_segment_hasher() - chunk_left = len(chunk) - offset - this_segment = min(chunk_left, segment_left) - p.update(chunk[offset:offset+this_segment]) - self._plaintext_segment_hashed_bytes += this_segment - - if self._plaintext_segment_hashed_bytes == self._segment_size: - # we've filled this segment - self._plaintext_segment_hashes.append(p.digest()) - self._plaintext_segment_hasher = None - self.log("closed hash [%d]: %dB" % - (len(self._plaintext_segment_hashes)-1, - self._plaintext_segment_hashed_bytes), - level=log.NOISY) - self.log(format="plaintext leaf hash [%(segnum)d] is %(hash)s", - segnum=len(self._plaintext_segment_hashes)-1, - hash=base32.b2a(p.digest()), - level=log.NOISY) - - offset += this_segment - - - def read_encrypted(self, length, hash_only): - # make sure our parameters have been set up first - d = self.get_all_encoding_parameters() - # and size - d.addCallback(lambda ignored: self.get_size()) - d.addCallback(lambda ignored: self._get_encryptor()) - # then fetch and encrypt the plaintext. 
The unusual structure here - # (passing a Deferred *into* a function) is needed to avoid - # overflowing the stack: Deferreds don't optimize out tail recursion. - # We also pass in a list, to which _read_encrypted will append - # ciphertext. - ciphertext = [] - d2 = defer.Deferred() - d.addCallback(lambda ignored: - self._read_encrypted(length, ciphertext, hash_only, d2)) - d.addCallback(lambda ignored: d2) - return d - - def _read_encrypted(self, remaining, ciphertext, hash_only, fire_when_done): - if not remaining: - fire_when_done.callback(ciphertext) - return None - # tolerate large length= values without consuming a lot of RAM by - # reading just a chunk (say 50kB) at a time. This only really matters - # when hash_only==True (i.e. resuming an interrupted upload), since - # that's the case where we will be skipping over a lot of data. - size = min(remaining, self.CHUNKSIZE) - remaining = remaining - size - # read a chunk of plaintext.. - d = defer.maybeDeferred(self.original.read, size) - # N.B.: if read() is synchronous, then since everything else is - # actually synchronous too, we'd blow the stack unless we stall for a - # tick. Once you accept a Deferred from IUploadable.read(), you must - # be prepared to have it fire immediately too. - d.addCallback(eventual.fireEventually) - def _good(plaintext): - # and encrypt it.. - # o/' over the fields we go, hashing all the way, sHA! sHA! sHA! o/' - ct = self._hash_and_encrypt_plaintext(plaintext, hash_only) - ciphertext.extend(ct) - self._read_encrypted(remaining, ciphertext, hash_only, - fire_when_done) - def _err(why): - fire_when_done.errback(why) - d.addCallback(_good) - d.addErrback(_err) - return None - - def _hash_and_encrypt_plaintext(self, data, hash_only): - assert isinstance(data, (tuple, list)), type(data) - data = list(data) - cryptdata = [] - # we use data.pop(0) instead of 'for chunk in data' to save - # memory: each chunk is destroyed as soon as we're done with it. 
- bytes_processed = 0 - while data: - chunk = data.pop(0) - self.log(" read_encrypted handling %dB-sized chunk" % len(chunk), - level=log.NOISY) - bytes_processed += len(chunk) - self._plaintext_hasher.update(chunk) - self._update_segment_hash(chunk) - # TODO: we have to encrypt the data (even if hash_only==True) - # because pycryptopp's AES-CTR implementation doesn't offer a - # way to change the counter value. Once pycryptopp acquires - # this ability, change this to simply update the counter - # before each call to (hash_only==False) _encryptor.process() - ciphertext = self._encryptor.process(chunk) - if hash_only: - self.log(" skipping encryption", level=log.NOISY) - else: - cryptdata.append(ciphertext) - del ciphertext - del chunk - self._ciphertext_bytes_read += bytes_processed - if self._status: - progress = float(self._ciphertext_bytes_read) / self._file_size - self._status.set_progress(1, progress) - return cryptdata - - - def get_plaintext_hashtree_leaves(self, first, last, num_segments): - if len(self._plaintext_segment_hashes) < num_segments: - # close out the last one - assert len(self._plaintext_segment_hashes) == num_segments-1 - p, segment_left = self._get_segment_hasher() - self._plaintext_segment_hashes.append(p.digest()) - del self._plaintext_segment_hasher - self.log("closing plaintext leaf hasher, hashed %d bytes" % - self._plaintext_segment_hashed_bytes, - level=log.NOISY) - self.log(format="plaintext leaf hash [%(segnum)d] is %(hash)s", - segnum=len(self._plaintext_segment_hashes)-1, - hash=base32.b2a(p.digest()), - level=log.NOISY) - assert len(self._plaintext_segment_hashes) == num_segments - return defer.succeed(tuple(self._plaintext_segment_hashes[first:last])) - - def get_plaintext_hash(self): - h = self._plaintext_hasher.digest() - return defer.succeed(h) - - def close(self): - return self.original.close() - -class UploadStatus: - implements(IUploadStatus) - statusid_counter = itertools.count(0) - - def __init__(self): - 
self.storage_index = None - self.size = None - self.helper = False - self.status = "Not started" - self.progress = [0.0, 0.0, 0.0] - self.active = True - self.results = None - self.counter = self.statusid_counter.next() - self.started = time.time() - - def get_started(self): - return self.started - def get_storage_index(self): - return self.storage_index - def get_size(self): - return self.size - def using_helper(self): - return self.helper - def get_status(self): - return self.status - def get_progress(self): - return tuple(self.progress) - def get_active(self): - return self.active - def get_results(self): - return self.results - def get_counter(self): - return self.counter - - def set_storage_index(self, si): - self.storage_index = si - def set_size(self, size): - self.size = size - def set_helper(self, helper): - self.helper = helper - def set_status(self, status): - self.status = status - def set_progress(self, which, value): - # [0]: chk, [1]: ciphertext, [2]: encode+push - self.progress[which] = value - def set_active(self, value): - self.active = value - def set_results(self, value): - self.results = value - -class CHKUploader: - peer_selector_class = Tahoe2PeerSelector - - def __init__(self, client): - self._client = client - self._log_number = self._client.log("CHKUploader starting") - self._encoder = None - self._results = UploadResults() - self._storage_index = None - self._upload_status = UploadStatus() - self._upload_status.set_helper(False) - self._upload_status.set_active(True) - self._upload_status.set_results(self._results) - - def log(self, *args, **kwargs): - if "parent" not in kwargs: - kwargs["parent"] = self._log_number - if "facility" not in kwargs: - kwargs["facility"] = "tahoe.upload" - return self._client.log(*args, **kwargs) - - def start(self, uploadable): - """Start uploading the file. 
- - This method returns a Deferred that will fire with the URI (a - string).""" - - self._started = time.time() - uploadable = IUploadable(uploadable) - self.log("starting upload of %s" % uploadable) - - eu = EncryptAnUploadable(uploadable, self._log_number) - eu.set_upload_status(self._upload_status) - d = self.start_encrypted(eu) - def _uploaded(res): - d1 = uploadable.get_encryption_key() - d1.addCallback(lambda key: self._compute_uri(res, key)) - return d1 - d.addCallback(_uploaded) - def _done(res): - self._upload_status.set_active(False) - return res - d.addBoth(_done) - return d - - def abort(self): - """Call this is the upload must be abandoned before it completes. - This will tell the shareholders to delete their partial shares. I - return a Deferred that fires when these messages have been acked.""" - if not self._encoder: - # how did you call abort() before calling start() ? - return defer.succeed(None) - return self._encoder.abort() - - def start_encrypted(self, encrypted): - eu = IEncryptedUploadable(encrypted) - - started = time.time() - self._encoder = e = encode.Encoder(self._log_number, - self._upload_status) - d = e.set_encrypted_uploadable(eu) - d.addCallback(self.locate_all_shareholders, started) - d.addCallback(self.set_shareholders, e) - d.addCallback(lambda res: e.start()) - d.addCallback(self._encrypted_done) - # this fires with the uri_extension_hash and other data - return d - - def locate_all_shareholders(self, encoder, started): - peer_selection_started = now = time.time() - self._storage_index_elapsed = now - started - storage_index = encoder.get_param("storage_index") - self._storage_index = storage_index - upload_id = storage.si_b2a(storage_index)[:5] - self.log("using storage index %s" % upload_id) - peer_selector = self.peer_selector_class(upload_id, self._log_number, - self._upload_status) - - share_size = encoder.get_param("share_size") - block_size = encoder.get_param("block_size") - num_segments = 
encoder.get_param("num_segments") - k,desired,n = encoder.get_param("share_counts") - - self._peer_selection_started = time.time() - d = peer_selector.get_shareholders(self._client, storage_index, - share_size, block_size, - num_segments, n, desired) - def _done(res): - self._peer_selection_elapsed = time.time() - peer_selection_started - return res - d.addCallback(_done) - return d - - def set_shareholders(self, (used_peers, already_peers), encoder): - """ - @param used_peers: a sequence of PeerTracker objects - @paran already_peers: a dict mapping sharenum to a peerid that - claims to already have this share - """ - self.log("_send_shares, used_peers is %s" % (used_peers,)) - # record already-present shares in self._results - for (shnum, peerid) in already_peers.items(): - peerid_s = idlib.shortnodeid_b2a(peerid) - self._results.sharemap[shnum] = "Found on [%s]" % peerid_s - if peerid not in self._results.servermap: - self._results.servermap[peerid] = set() - self._results.servermap[peerid].add(shnum) - self._results.preexisting_shares = len(already_peers) - - self._sharemap = {} - for peer in used_peers: - assert isinstance(peer, PeerTracker) - buckets = {} - for peer in used_peers: - buckets.update(peer.buckets) - for shnum in peer.buckets: - self._sharemap[shnum] = peer - assert len(buckets) == sum([len(peer.buckets) for peer in used_peers]) - encoder.set_shareholders(buckets) - - def _encrypted_done(self, res): - r = self._results - for shnum in self._encoder.get_shares_placed(): - peer_tracker = self._sharemap[shnum] - peerid = peer_tracker.peerid - peerid_s = idlib.shortnodeid_b2a(peerid) - r.sharemap[shnum] = "Placed on [%s]" % peerid_s - if peerid not in r.servermap: - r.servermap[peerid] = set() - r.servermap[peerid].add(shnum) - r.pushed_shares = len(self._encoder.get_shares_placed()) - now = time.time() - r.file_size = self._encoder.file_size - r.timings["total"] = now - self._started - r.timings["storage_index"] = self._storage_index_elapsed - 
r.timings["peer_selection"] = self._peer_selection_elapsed - r.timings.update(self._encoder.get_times()) - r.uri_extension_data = self._encoder.get_uri_extension_data() - return res - - def _compute_uri(self, (uri_extension_hash, - needed_shares, total_shares, size), - key): - u = uri.CHKFileURI(key=key, - uri_extension_hash=uri_extension_hash, - needed_shares=needed_shares, - total_shares=total_shares, - size=size, - ) - r = self._results - r.uri = u.to_string() - return r - - def get_upload_status(self): - return self._upload_status - -def read_this_many_bytes(uploadable, size, prepend_data=[]): - if size == 0: - return defer.succeed([]) - d = uploadable.read(size) - def _got(data): - assert isinstance(data, list) - bytes = sum([len(piece) for piece in data]) - assert bytes > 0 - assert bytes <= size - remaining = size - bytes - if remaining: - return read_this_many_bytes(uploadable, remaining, - prepend_data + data) - return prepend_data + data - d.addCallback(_got) - return d - -class LiteralUploader: - - def __init__(self, client): - self._client = client - self._results = UploadResults() - self._status = s = UploadStatus() - s.set_storage_index(None) - s.set_helper(False) - s.set_progress(0, 1.0) - s.set_active(False) - s.set_results(self._results) - - def start(self, uploadable): - uploadable = IUploadable(uploadable) - d = uploadable.get_size() - def _got_size(size): - self._size = size - self._status.set_size(size) - self._results.file_size = size - return read_this_many_bytes(uploadable, size) - d.addCallback(_got_size) - d.addCallback(lambda data: uri.LiteralFileURI("".join(data))) - d.addCallback(lambda u: u.to_string()) - d.addCallback(self._build_results) - return d - - def _build_results(self, uri): - self._results.uri = uri - self._status.set_status("Done") - self._status.set_progress(1, 1.0) - self._status.set_progress(2, 1.0) - return self._results - - def close(self): - pass - - def get_upload_status(self): - return self._status - -class 
RemoteEncryptedUploadable(Referenceable): - implements(RIEncryptedUploadable) - - def __init__(self, encrypted_uploadable, upload_status): - self._eu = IEncryptedUploadable(encrypted_uploadable) - self._offset = 0 - self._bytes_sent = 0 - self._status = IUploadStatus(upload_status) - # we are responsible for updating the status string while we run, and - # for setting the ciphertext-fetch progress. - self._size = None - - def get_size(self): - if self._size is not None: - return defer.succeed(self._size) - d = self._eu.get_size() - def _got_size(size): - self._size = size - return size - d.addCallback(_got_size) - return d - - def remote_get_size(self): - return self.get_size() - def remote_get_all_encoding_parameters(self): - return self._eu.get_all_encoding_parameters() - - def _read_encrypted(self, length, hash_only): - d = self._eu.read_encrypted(length, hash_only) - def _read(strings): - if hash_only: - self._offset += length - else: - size = sum([len(data) for data in strings]) - self._offset += size - return strings - d.addCallback(_read) - return d - - def remote_read_encrypted(self, offset, length): - # we don't support seek backwards, but we allow skipping forwards - precondition(offset >= 0, offset) - precondition(length >= 0, length) - lp = log.msg("remote_read_encrypted(%d-%d)" % (offset, offset+length), - level=log.NOISY) - precondition(offset >= self._offset, offset, self._offset) - if offset > self._offset: - # read the data from disk anyways, to build up the hash tree - skip = offset - self._offset - log.msg("remote_read_encrypted skipping ahead from %d to %d, skip=%d" % - (self._offset, offset, skip), level=log.UNUSUAL, parent=lp) - d = self._read_encrypted(skip, hash_only=True) - else: - d = defer.succeed(None) - - def _at_correct_offset(res): - assert offset == self._offset, "%d != %d" % (offset, self._offset) - return self._read_encrypted(length, hash_only=False) - d.addCallback(_at_correct_offset) - - def _read(strings): - size = 
sum([len(data) for data in strings]) - self._bytes_sent += size - return strings - d.addCallback(_read) - return d - - def remote_get_plaintext_hashtree_leaves(self, first, last, num_segments): - log.msg("remote_get_plaintext_hashtree_leaves: %d-%d of %d" % - (first, last-1, num_segments), - level=log.NOISY) - d = self._eu.get_plaintext_hashtree_leaves(first, last, num_segments) - d.addCallback(list) - return d - def remote_get_plaintext_hash(self): - return self._eu.get_plaintext_hash() - def remote_close(self): - return self._eu.close() - - -class AssistedUploader: - - def __init__(self, helper): - self._helper = helper - self._log_number = log.msg("AssistedUploader starting") - self._storage_index = None - self._upload_status = s = UploadStatus() - s.set_helper(True) - s.set_active(True) - - def log(self, *args, **kwargs): - if "parent" not in kwargs: - kwargs["parent"] = self._log_number - return log.msg(*args, **kwargs) - - def start(self, uploadable): - self._started = time.time() - u = IUploadable(uploadable) - eu = EncryptAnUploadable(u, self._log_number) - eu.set_upload_status(self._upload_status) - self._encuploadable = eu - d = eu.get_size() - d.addCallback(self._got_size) - d.addCallback(lambda res: eu.get_all_encoding_parameters()) - d.addCallback(self._got_all_encoding_parameters) - # when we get the encryption key, that will also compute the storage - # index, so this only takes one pass. - # TODO: I'm not sure it's cool to switch back and forth between - # the Uploadable and the IEncryptedUploadable that wraps it. 
- d.addCallback(lambda res: u.get_encryption_key()) - d.addCallback(self._got_encryption_key) - d.addCallback(lambda res: eu.get_storage_index()) - d.addCallback(self._got_storage_index) - d.addCallback(self._contact_helper) - d.addCallback(self._build_readcap) - def _done(res): - self._upload_status.set_active(False) - return res - d.addBoth(_done) - return d - - def _got_size(self, size): - self._size = size - self._upload_status.set_size(size) - - def _got_all_encoding_parameters(self, params): - k, happy, n, segment_size = params - # stash these for URI generation later - self._needed_shares = k - self._total_shares = n - self._segment_size = segment_size - - def _got_encryption_key(self, key): - self._key = key - - def _got_storage_index(self, storage_index): - self._storage_index = storage_index - - - def _contact_helper(self, res): - now = self._time_contacting_helper_start = time.time() - self._storage_index_elapsed = now - self._started - self.log(format="contacting helper for SI %(si)s..", - si=storage.si_b2a(self._storage_index)) - self._upload_status.set_status("Contacting Helper") - d = self._helper.callRemote("upload_chk", self._storage_index) - d.addCallback(self._contacted_helper) - return d - - def _contacted_helper(self, (upload_results, upload_helper)): - now = time.time() - elapsed = now - self._time_contacting_helper_start - self._elapsed_time_contacting_helper = elapsed - if upload_helper: - self.log("helper says we need to upload") - self._upload_status.set_status("Uploading Ciphertext") - # we need to upload the file - reu = RemoteEncryptedUploadable(self._encuploadable, - self._upload_status) - # let it pre-compute the size for progress purposes - d = reu.get_size() - d.addCallback(lambda ignored: - upload_helper.callRemote("upload", reu)) - # this Deferred will fire with the upload results - return d - self.log("helper says file is already uploaded") - self._upload_status.set_progress(1, 1.0) - 
self._upload_status.set_results(upload_results) - return upload_results - - def _build_readcap(self, upload_results): - self.log("upload finished, building readcap") - self._upload_status.set_status("Building Readcap") - r = upload_results - assert r.uri_extension_data["needed_shares"] == self._needed_shares - assert r.uri_extension_data["total_shares"] == self._total_shares - assert r.uri_extension_data["segment_size"] == self._segment_size - assert r.uri_extension_data["size"] == self._size - u = uri.CHKFileURI(key=self._key, - uri_extension_hash=r.uri_extension_hash, - needed_shares=self._needed_shares, - total_shares=self._total_shares, - size=self._size, - ) - r.uri = u.to_string() - now = time.time() - r.file_size = self._size - r.timings["storage_index"] = self._storage_index_elapsed - r.timings["contacting_helper"] = self._elapsed_time_contacting_helper - if "total" in r.timings: - r.timings["helper_total"] = r.timings["total"] - r.timings["total"] = now - self._started - self._upload_status.set_status("Done") - self._upload_status.set_results(r) - return r - - def get_upload_status(self): - return self._upload_status - -class BaseUploadable: - default_max_segment_size = 128*KiB # overridden by max_segment_size - default_encoding_param_k = 3 # overridden by encoding_parameters - default_encoding_param_happy = 7 - default_encoding_param_n = 10 - - max_segment_size = None - encoding_param_k = None - encoding_param_happy = None - encoding_param_n = None - - _all_encoding_parameters = None - _status = None - - def set_upload_status(self, upload_status): - self._status = IUploadStatus(upload_status) - - def set_default_encoding_parameters(self, default_params): - assert isinstance(default_params, dict) - for k,v in default_params.items(): - precondition(isinstance(k, str), k, v) - precondition(isinstance(v, int), k, v) - if "k" in default_params: - self.default_encoding_param_k = default_params["k"] - if "happy" in default_params: - 
self.default_encoding_param_happy = default_params["happy"] - if "n" in default_params: - self.default_encoding_param_n = default_params["n"] - if "max_segment_size" in default_params: - self.default_max_segment_size = default_params["max_segment_size"] - - def get_all_encoding_parameters(self): - if self._all_encoding_parameters: - return defer.succeed(self._all_encoding_parameters) - - max_segsize = self.max_segment_size or self.default_max_segment_size - k = self.encoding_param_k or self.default_encoding_param_k - happy = self.encoding_param_happy or self.default_encoding_param_happy - n = self.encoding_param_n or self.default_encoding_param_n - - d = self.get_size() - def _got_size(file_size): - # for small files, shrink the segment size to avoid wasting space - segsize = min(max_segsize, file_size) - # this must be a multiple of 'required_shares'==k - segsize = mathutil.next_multiple(segsize, k) - encoding_parameters = (k, happy, n, segsize) - self._all_encoding_parameters = encoding_parameters - return encoding_parameters - d.addCallback(_got_size) - return d - -class FileHandle(BaseUploadable): - implements(IUploadable) - - def __init__(self, filehandle, convergence): - """ - Upload the data from the filehandle. If convergence is None then a - random encryption key will be used, else the plaintext will be hashed, - then the hash will be hashed together with the string in the - "convergence" argument to form the encryption key. 
- """ - assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) - self._filehandle = filehandle - self._key = None - self.convergence = convergence - self._size = None - - def _get_encryption_key_convergent(self): - if self._key is not None: - return defer.succeed(self._key) - - d = self.get_size() - # that sets self._size as a side-effect - d.addCallback(lambda size: self.get_all_encoding_parameters()) - def _got(params): - k, happy, n, segsize = params - f = self._filehandle - enckey_hasher = convergence_hasher(k, n, segsize, self.convergence) - f.seek(0) - BLOCKSIZE = 64*1024 - bytes_read = 0 - while True: - data = f.read(BLOCKSIZE) - if not data: - break - enckey_hasher.update(data) - # TODO: setting progress in a non-yielding loop is kind of - # pointless, but I'm anticipating (perhaps prematurely) the - # day when we use a slowjob or twisted's CooperatorService to - # make this yield time to other jobs. - bytes_read += len(data) - if self._status: - self._status.set_progress(0, float(bytes_read)/self._size) - f.seek(0) - self._key = enckey_hasher.digest() - if self._status: - self._status.set_progress(0, 1.0) - assert len(self._key) == 16 - return self._key - d.addCallback(_got) - return d - - def _get_encryption_key_random(self): - if self._key is None: - self._key = os.urandom(16) - return defer.succeed(self._key) - - def get_encryption_key(self): - if self.convergence is not None: - return self._get_encryption_key_convergent() - else: - return self._get_encryption_key_random() - - def get_size(self): - if self._size is not None: - return defer.succeed(self._size) - self._filehandle.seek(0,2) - size = self._filehandle.tell() - self._size = size - self._filehandle.seek(0) - return defer.succeed(size) - - def read(self, length): - return defer.succeed([self._filehandle.read(length)]) - - def close(self): - # the originator of the filehandle reserves the right to close it - pass - -class FileName(FileHandle): - def 
__init__(self, filename, convergence): - """ - Upload the data from the filename. If convergence is None then a - random encryption key will be used, else the plaintext will be hashed, - then the hash will be hashed together with the string in the - "convergence" argument to form the encryption key. - """ - assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) - FileHandle.__init__(self, open(filename, "rb"), convergence=convergence) - def close(self): - FileHandle.close(self) - self._filehandle.close() - -class Data(FileHandle): - def __init__(self, data, convergence): - """ - Upload the data from the data argument. If convergence is None then a - random encryption key will be used, else the plaintext will be hashed, - then the hash will be hashed together with the string in the - "convergence" argument to form the encryption key. - """ - assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) - FileHandle.__init__(self, StringIO(data), convergence=convergence) - -class Uploader(service.MultiService): - """I am a service that allows file uploading. I am a service-child of the - Client. 
- """ - implements(IUploader) - name = "uploader" - uploader_class = CHKUploader - URI_LIT_SIZE_THRESHOLD = 55 - MAX_UPLOAD_STATUSES = 10 - - def __init__(self, helper_furl=None, stats_provider=None): - self._helper_furl = helper_furl - self.stats_provider = stats_provider - self._helper = None - self._all_uploads = weakref.WeakKeyDictionary() # for debugging - self._all_upload_statuses = weakref.WeakKeyDictionary() - self._recent_upload_statuses = [] - service.MultiService.__init__(self) - - def startService(self): - service.MultiService.startService(self) - if self._helper_furl: - self.parent.tub.connectTo(self._helper_furl, - self._got_helper) - - def _got_helper(self, helper): - self._helper = helper - helper.notifyOnDisconnect(self._lost_helper) - def _lost_helper(self): - self._helper = None - - def get_helper_info(self): - # return a tuple of (helper_furl_or_None, connected_bool) - return (self._helper_furl, bool(self._helper)) - - def upload(self, uploadable): - # this returns the URI - assert self.parent - assert self.running - - uploadable = IUploadable(uploadable) - d = uploadable.get_size() - def _got_size(size): - default_params = self.parent.get_encoding_parameters() - precondition(isinstance(default_params, dict), default_params) - precondition("max_segment_size" in default_params, default_params) - uploadable.set_default_encoding_parameters(default_params) - - if self.stats_provider: - self.stats_provider.count('uploader.files_uploaded', 1) - self.stats_provider.count('uploader.bytes_uploaded', size) - - if size <= self.URI_LIT_SIZE_THRESHOLD: - uploader = LiteralUploader(self.parent) - elif self._helper: - uploader = AssistedUploader(self._helper) - else: - uploader = self.uploader_class(self.parent) - self._add_upload(uploader) - return uploader.start(uploadable) - d.addCallback(_got_size) - def _done(res): - uploadable.close() - return res - d.addBoth(_done) - return d - - def _add_upload(self, uploader): - s = uploader.get_upload_status() - 
self._all_uploads[uploader] = None - self._all_upload_statuses[s] = None - self._recent_upload_statuses.append(s) - while len(self._recent_upload_statuses) > self.MAX_UPLOAD_STATUSES: - self._recent_upload_statuses.pop(0) - - def list_all_upload_statuses(self): - for us in self._all_upload_statuses: - yield us diff --git a/src/allmydata/web/filenode.py b/src/allmydata/web/filenode.py index 19b1b47e..9a9a28ef 100644 --- a/src/allmydata/web/filenode.py +++ b/src/allmydata/web/filenode.py @@ -8,9 +8,9 @@ from twisted.internet import defer from nevow import url, rend from nevow.inevow import IRequest -from allmydata.upload import FileHandle from allmydata.interfaces import IDownloadTarget, ExistingChildError from allmydata.mutable.common import MODE_READ +from allmydata.immutable.upload import FileHandle from allmydata.util import log from allmydata.web.common import text_plain, WebError, IClient, RenderMixin, \ diff --git a/src/allmydata/web/unlinked.py b/src/allmydata/web/unlinked.py index 076908e9..3b8d9538 100644 --- a/src/allmydata/web/unlinked.py +++ b/src/allmydata/web/unlinked.py @@ -4,7 +4,7 @@ from twisted.web import http from twisted.internet import defer from nevow import rend, url, tags as T from nevow.inevow import IRequest -from allmydata.upload import FileHandle +from allmydata.immutable.upload import FileHandle from allmydata.web.common import IClient, getxmlfile, get_arg, boolean_of_arg from allmydata.web import status