From: Brian Warner Date: Sat, 7 Apr 2007 03:37:38 +0000 (-0700) Subject: misc cleanup: remove old .tac files, move old stuff into misc/ X-Git-Url: https://git.rkrishnan.org/%5B/%5D%20/file/URI:LIT:krugkidfnzsc4/(%5B%5E?a=commitdiff_plain;h=3687b37c60729380ecd2b6da30285af1a6c8be74;p=tahoe-lafs%2Ftahoe-lafs.git misc cleanup: remove old .tac files, move old stuff into misc/ --- diff --git a/client.tac b/client.tac deleted file mode 100644 index 28dc0748..00000000 --- a/client.tac +++ /dev/null @@ -1,9 +0,0 @@ -# -*- python -*- - -from allmydata import client -from twisted.application import service - -c = client.Client() - -application = service.Application("allmydata_client") -c.setServiceParent(application) diff --git a/misc/simulator.py b/misc/simulator.py new file mode 100644 index 00000000..5a53fadb --- /dev/null +++ b/misc/simulator.py @@ -0,0 +1,292 @@ +#! /usr/bin/env python + +import sha as shamodule +import os, random + +from pkg_resources import require +require('PyRRD') +from pyrrd import graph +from pyrrd.rrd import DataSource, RRD, RRA + + +def sha(s): + return shamodule.new(s).digest() + +def randomid(): + return os.urandom(20) + +class Node: + def __init__(self, nid, queen, simulator): + self.nid = nid + self.queen = queen + self.simulator = simulator + self.shares = {} + self.capacity = random.randrange(1000) + self.utilization = 0 + self.files = [] + + def permute_peers(self, fileid): + permuted = [(sha(fileid+n.nid),n) + for n in self.queen.get_all_nodes()] + permuted.sort() + return permuted + + def publish_file(self, fileid, size, numshares=100): + sharesize = 4 * size / numshares + permuted = self.permute_peers(fileid) + last_givento = None + tried = 0 + givento = [] + while numshares and permuted: + pid,node = permuted.pop(0) + tried += 1 + last_givento = pid + if node.accept_share(fileid, sharesize): + givento.append((pid,node)) + numshares -= 1 + if numshares: + # couldn't push, should delete + for pid,node in givento: + node.delete_share(fileid) + return False + self.files.append((fileid, numshares)) + self.queen.please_preserve(fileid, size, tried, last_givento) + return (True, tried) + + def accept_share(self, fileid, sharesize): + accept = False + if self.utilization < self.capacity: + # we have room! yay! + self.shares[fileid] = sharesize + self.utilization += sharesize + return True + if self.decide(sharesize): + # we don't, but we'll make room + self.make_space(sharesize) + self.shares[fileid] = sharesize + self.utilization += sharesize + return True + else: + # we're full, try elsewhere + return False + + def decide(self, sharesize): + if sharesize > self.capacity: + return False + return False + return random.random() > 0.5 + + def make_space(self, sharesize): + assert sharesize <= self.capacity + while self.capacity - self.utilization < sharesize: + victim = random.choice(self.shares.keys()) + self.simulator.lost_data(self.shares[victim]) + self.delete_share(victim) + + def delete_share(self, fileid): + if fileid in self.shares: + self.utilization -= self.shares[fileid] + del self.shares[fileid] + return True + return False + + def retrieve_file(self): + if not self.files: + return + fileid,numshares = random.choice(self.files) + needed = numshares / 4 + peers = [] + for pid,node in self.permute_peers(fileid): + if random.random() > self.simulator.P_NODEAVAIL: + continue # node isn't available right now + if node.has_share(fileid): + peers.append(node) + if len(peers) >= needed: + return True + return False + + def delete_file(self): + if not self.files: + return False + which = random.choice(self.files) + self.files.remove(which) + fileid,numshares = which + self.queen.delete(fileid) + return True + +class Queen: + def __init__(self, simulator): + self.living_files = {} + self.utilization = 0 # total size of all active files + self.simulator = simulator + self.simulator.stamp_utilization(self.utilization) + + def get_all_nodes(self): + return self.all_nodes + + def please_preserve(self, fileid, size, tried, last_givento): + self.living_files[fileid] = (size, tried, last_givento) + self.utilization += size + self.simulator.stamp_utilization(self.utilization) + + def please_delete(self, fileid): + self.delete(fileid) + + def permute_peers(self, fileid): + permuted = [(sha(fileid+n.nid),n) + for n in self.get_all_nodes()] + permuted.sort() + return permuted + + def delete(self, fileid): + permuted = self.permute_peers(fileid) + size, tried, last_givento = self.living_files[fileid] + pid = "" + while tried and pid < last_givento: + pid,node = permuted.pop(0) + had_it = node.delete_share(fileid) + if had_it: + tried -= 1 + self.utilization -= size + self.simulator.stamp_utilization(self.utilization) + del self.living_files[fileid] + +class Simulator: + NUM_NODES = 1000 + EVENTS = ["ADDFILE", "DELFILE", "ADDNODE", "DELNODE"] + RATE_ADDFILE = 1.0 / 10 + RATE_DELFILE = 1.0 / 20 + RATE_ADDNODE = 1.0 / 3000 + RATE_DELNODE = 1.0 / 4000 + P_NODEAVAIL = 1.0 + + def __init__(self): + self.time = 1164783600 # small numbers of seconds since the epoch confuse rrdtool + self.prevstamptime = int(self.time) + + ds = DataSource(ds_name='utilizationds', ds_type='GAUGE', heartbeat=1) + rra = RRA(cf='AVERAGE', xff=0.1, steps=1, rows=1200) + self.rrd = RRD("/tmp/utilization.rrd", ds=[ds], rra=[rra], start=self.time) + self.rrd.create() + + self.queen = q = Queen(self) + self.all_nodes = [Node(randomid(), q, self) + for i in range(self.NUM_NODES)] + q.all_nodes = self.all_nodes + self.next = [] + self.schedule_events() + self.verbose = False + + self.added_files = 0 + self.added_data = 0 + self.deleted_files = 0 + self.published_files = [] + self.failed_files = 0 + self.lost_data_bytes = 0 # bytes deleted to make room for new shares + + def stamp_utilization(self, utilization): + if int(self.time) > (self.prevstamptime+1): + self.rrd.bufferValue(self.time, utilization) + self.prevstamptime = int(self.time) + + def write_graph(self): + self.rrd.update() + self.rrd = None + import gc + gc.collect() + + def1 = graph.DataDefinition(vname="a", rrdfile='/tmp/utilization.rrd', ds_name='utilizationds') + area1 = graph.Area(value="a", color="#990033", legend='utilizationlegend') + g = graph.Graph('/tmp/utilization.png', imgformat='PNG', width=540, height=100, vertical_label='utilizationverticallabel', title='utilizationtitle', lower_limit=0) + g.data.append(def1) + g.data.append(area1) + g.write() + + def add_file(self): + size = random.randrange(1000) + n = random.choice(self.all_nodes) + if self.verbose: + print "add_file(size=%d, from node %s)" % (size, n) + fileid = randomid() + able = n.publish_file(fileid, size) + if able: + able, tried = able + self.added_files += 1 + self.added_data += size + self.published_files.append(tried) + else: + self.failed_files += 1 + + def lost_data(self, size): + self.lost_data_bytes += size + + def delete_file(self): + all_nodes = self.all_nodes[:] + random.shuffle(all_nodes) + for n in all_nodes: + if n.delete_file(): + self.deleted_files += 1 + return + print "no files to delete" + + def _add_event(self, etype): + rate = getattr(self, "RATE_" + etype) + next = self.time + random.expovariate(rate) + self.next.append((next, etype)) + self.next.sort() + + def schedule_events(self): + types = set([e[1] for e in self.next]) + for etype in self.EVENTS: + if not etype in types: + self._add_event(etype) + + def do_event(self): + time, etype = self.next.pop(0) + assert time > self.time + current_time = self.time + self.time = time + self._add_event(etype) + if etype == "ADDFILE": + self.add_file() + elif etype == "DELFILE": + self.delete_file() + elif etype == "ADDNODE": + pass + #self.add_node() + elif etype == "DELNODE": + #self.del_node() + pass + # self.print_stats(current_time, etype) + + def print_stats_header(self): + print "time: added failed lost avg_tried" + + def print_stats(self, time, etype): + if not self.published_files: + avg_tried = "NONE" + else: + avg_tried = sum(self.published_files) / len(self.published_files) + print time, etype, self.added_data, self.failed_files, self.lost_data_bytes, avg_tried, len(self.queen.living_files), self.queen.utilization + +global s +s = None + +def main(): +# rrdtool.create("foo.rrd", +# "--step 10", +# "DS:files-added:DERIVE::0:1000", +# "RRA:AVERAGE:1:1:1200", +# ) + global s + s = Simulator() + # s.print_stats_header() + for i in range(1000): + s.do_event() + print "%d files added, %d files deleted" % (s.added_files, s.deleted_files) + return s + +if __name__ == '__main__': + main() + + diff --git a/misc/sizes.py b/misc/sizes.py new file mode 100644 index 00000000..5d7eec04 --- /dev/null +++ b/misc/sizes.py @@ -0,0 +1,213 @@ +#! /usr/bin/env python + +import random, math, os, re +from twisted.python import usage + +class Args(usage.Options): + optParameters = [ + ["mode", "m", "alpha", "validation scheme"], + ["arity", "k", 2, "k (airty) for hash tree"], + ] + def opt_arity(self, option): + self['arity'] = int(option) + def parseArgs(self, *args): + if len(args) > 0: + self['mode'] = args[0] + + +def charttest(): + import gdchart + sizes = [random.randrange(10, 20) for i in range(10)] + x = gdchart.Line() + x.width = 250 + x.height = 250 + x.xtitle = "sample" + x.ytitle = "size" + x.title = "Example Graph" + #x.ext_color = [ "white", "yellow", "red", "blue", "green"] + x.setData(sizes) + #x.setLabels(["Mon", "Tue", "Wed", "Thu", "Fri"]) + x.draw("simple.png") + +KiB=1024 +MiB=1024*KiB +GiB=1024*MiB +TiB=1024*GiB +PiB=1024*TiB + +class Sizes: + def __init__(self, mode, file_size, arity=2): + MAX_SEGSIZE = 2*MiB + self.mode = mode + self.file_size = file_size + self.seg_size = seg_size = 1.0 * min(MAX_SEGSIZE, file_size) + self.num_segs = num_segs = math.ceil(file_size / seg_size) + self.num_subblocks = num_subblocks = num_segs + + self.num_blocks = num_blocks = 100 + self.blocks_needed = blocks_needed = 25 + + self.subblock_size = subblock_size = seg_size / blocks_needed + self.block_size = block_size = subblock_size * num_subblocks + + # none of this includes the block-level hash chain yet, since that is + # only a function of the number of blocks. All overhead numbers + # assume that the block-level hash chain has already been sent, + # including the root of the subblock-level hash tree. + + if mode == "alpha": + # no hash tree at all + self.subblock_arity = 0 + self.subblock_tree_depth = 0 + self.subblock_overhead = 0 + self.bytes_until_some_data = 20 + block_size + self.block_storage_overhead = 0 + self.block_transmission_overhead = 0 + + elif mode == "beta": + # k=num_subblocks, d=1 + # each subblock has a 20-byte hash + self.subblock_arity = num_subblocks + self.subblock_tree_depth = 1 + self.subblock_overhead = 20 + # the block has a list of hashes, one for each subblock + self.block_storage_overhead = (self.subblock_overhead * + num_subblocks) + # we can get away with not sending the hash of the block that + # we're sending in full, once + self.block_transmission_overhead = self.block_storage_overhead - 20 + # we must get the whole list (so it can be validated) before + # any data can be validated + self.bytes_until_some_data = (self.block_transmission_overhead + + subblock_size) + + elif mode == "gamma": + self.subblock_arity = k = arity + d = math.ceil(math.log(num_subblocks, k)) + self.subblock_tree_depth = d + num_leaves = k ** d + # to make things easier, we make the pessimistic assumption that + # we have to store hashes for all the empty places in the tree + # (when the number of blocks is not an exact exponent of k) + self.subblock_overhead = 20 + # the subblock hashes are organized into a k-ary tree, which + # means storing (and eventually transmitting) more hashes. This + # count includes all the low-level block hashes and the root. + hash_nodes = (num_leaves*k - 1) / (k - 1) + #print "hash_depth", d + #print "num_leaves", num_leaves + #print "hash_nodes", hash_nodes + # the storage overhead is this + self.block_storage_overhead = 20 * (hash_nodes - 1) + # the transmission overhead is smaller: if we actually transmit + # every subblock, we don't have to transmit 1/k of the + # lowest-level subblock hashes, and we don't have to transmit the + # root because it was already sent with the block-level hash tree + self.block_transmission_overhead = 20 * (hash_nodes + - 1 # the root + - num_leaves / k) + # we must get a full sibling hash chain before we can validate + # any data + sibling_length = d * (k-1) + self.bytes_until_some_data = 20 * sibling_length + subblock_size + + + + else: + raise RuntimeError("unknown mode '%s" % mode) + + self.storage_overhead = self.block_storage_overhead * num_blocks + self.storage_overhead_percentage = 100.0 * self.storage_overhead / file_size + + def dump(self): + for k in ("mode", "file_size", "seg_size", + "num_segs", "num_subblocks", "num_blocks", "blocks_needed", + "subblock_size", "block_size", + "subblock_arity", "subblock_tree_depth", + "subblock_overhead", + "block_storage_overhead", "block_transmission_overhead", + "storage_overhead", "storage_overhead_percentage", + "bytes_until_some_data"): + print k, getattr(self, k) + +def fmt(num, trim=False): + if num < KiB: + #s = str(num) + "#" + s = "%.2f#" % num + elif num < MiB: + s = "%.2fk" % (num / KiB) + elif num < GiB: + s = "%.2fM" % (num / MiB) + elif num < TiB: + s = "%.2fG" % (num / GiB) + elif num < PiB: + s = "%.2fT" % (num / TiB) + else: + s = "big" + if trim: + s = re.sub(r'(\.0+)([kMGT#])', + lambda m: m.group(2), + s) + else: + s = re.sub(r'(\.0+)([kMGT#])', + lambda m: (" "*len(m.group(1))+m.group(2)), + s) + if s.endswith("#"): + s = s[:-1] + " " + return s + +def text(): + opts = Args() + opts.parseOptions() + mode = opts["mode"] + arity = opts["arity"] + # 0123456789012345678901234567890123456789012345678901234567890123456 + print "mode=%s" % mode, " arity=%d" % arity + print " storage storage" + print "Size blocksize overhead overhead k d alacrity" + print " (bytes) (%)" + print "------- ------- -------- -------- ---- -- --------" + #sizes = [2 ** i for i in range(7, 41)] + radix = math.sqrt(10); expstep = 2 + radix = 2; expstep = 2 + #radix = 10; expstep = 1 + maxexp = int(math.ceil(math.log(1e12, radix)))+2 + sizes = [radix ** i for i in range(2,maxexp,expstep)] + for file_size in sizes: + s = Sizes(mode, file_size, arity) + out = "" + out += "%7s " % fmt(file_size, trim=True) + out += "%7s " % fmt(s.block_size) + out += "%8s" % fmt(s.storage_overhead) + out += "%10.2f " % s.storage_overhead_percentage + out += " %4d" % int(s.subblock_arity) + out += " %2d" % int(s.subblock_tree_depth) + out += " %8s" % fmt(s.bytes_until_some_data) + print out + + +def graph(): + # doesn't work yet + import Gnuplot + opts = Args() + opts.parseOptions() + mode = opts["mode"] + arity = opts["arity"] + g = Gnuplot.Gnuplot(debug=1) + g.title("overhead / alacrity tradeoffs") + g.xlabel("file size") + g.ylabel("stuff") + sizes = [2 ** i for i in range(7, 32)] + series = {"overhead": {}, "alacrity": {}} + for file_size in sizes: + s = Sizes(mode, file_size, arity) + series["overhead"][file_size] = s.storage_overhead_percentage + series["alacrity"][file_size] = s.bytes_until_some_data + g.plot([ (fs, series["overhead"][fs]) + for fs in sizes ]) + raw_input("press return") + + +if __name__ == '__main__': + text() + #graph() diff --git a/queen.tac b/queen.tac deleted file mode 100644 index ac9b9b1a..00000000 --- a/queen.tac +++ /dev/null @@ -1,9 +0,0 @@ -# -*- python -*- - -from allmydata import queen -from twisted.application import service - -c = queen.Queen() - -application = service.Application("allmydata_queen") -c.setServiceParent(application) diff --git a/simulator.py b/simulator.py deleted file mode 100644 index 5a53fadb..00000000 --- a/simulator.py +++ /dev/null @@ -1,292 +0,0 @@ -#! /usr/bin/env python - -import sha as shamodule -import os, random - -from pkg_resources import require -require('PyRRD') -from pyrrd import graph -from pyrrd.rrd import DataSource, RRD, RRA - - -def sha(s): - return shamodule.new(s).digest() - -def randomid(): - return os.urandom(20) - -class Node: - def __init__(self, nid, queen, simulator): - self.nid = nid - self.queen = queen - self.simulator = simulator - self.shares = {} - self.capacity = random.randrange(1000) - self.utilization = 0 - self.files = [] - - def permute_peers(self, fileid): - permuted = [(sha(fileid+n.nid),n) - for n in self.queen.get_all_nodes()] - permuted.sort() - return permuted - - def publish_file(self, fileid, size, numshares=100): - sharesize = 4 * size / numshares - permuted = self.permute_peers(fileid) - last_givento = None - tried = 0 - givento = [] - while numshares and permuted: - pid,node = permuted.pop(0) - tried += 1 - last_givento = pid - if node.accept_share(fileid, sharesize): - givento.append((pid,node)) - numshares -= 1 - if numshares: - # couldn't push, should delete - for pid,node in givento: - node.delete_share(fileid) - return False - self.files.append((fileid, numshares)) - self.queen.please_preserve(fileid, size, tried, last_givento) - return (True, tried) - - def accept_share(self, fileid, sharesize): - accept = False - if self.utilization < self.capacity: - # we have room! yay! - self.shares[fileid] = sharesize - self.utilization += sharesize - return True - if self.decide(sharesize): - # we don't, but we'll make room - self.make_space(sharesize) - self.shares[fileid] = sharesize - self.utilization += sharesize - return True - else: - # we're full, try elsewhere - return False - - def decide(self, sharesize): - if sharesize > self.capacity: - return False - return False - return random.random() > 0.5 - - def make_space(self, sharesize): - assert sharesize <= self.capacity - while self.capacity - self.utilization < sharesize: - victim = random.choice(self.shares.keys()) - self.simulator.lost_data(self.shares[victim]) - self.delete_share(victim) - - def delete_share(self, fileid): - if fileid in self.shares: - self.utilization -= self.shares[fileid] - del self.shares[fileid] - return True - return False - - def retrieve_file(self): - if not self.files: - return - fileid,numshares = random.choice(self.files) - needed = numshares / 4 - peers = [] - for pid,node in self.permute_peers(fileid): - if random.random() > self.simulator.P_NODEAVAIL: - continue # node isn't available right now - if node.has_share(fileid): - peers.append(node) - if len(peers) >= needed: - return True - return False - - def delete_file(self): - if not self.files: - return False - which = random.choice(self.files) - self.files.remove(which) - fileid,numshares = which - self.queen.delete(fileid) - return True - -class Queen: - def __init__(self, simulator): - self.living_files = {} - self.utilization = 0 # total size of all active files - self.simulator = simulator - self.simulator.stamp_utilization(self.utilization) - - def get_all_nodes(self): - return self.all_nodes - - def please_preserve(self, fileid, size, tried, last_givento): - self.living_files[fileid] = (size, tried, last_givento) - self.utilization += size - self.simulator.stamp_utilization(self.utilization) - - def please_delete(self, fileid): - self.delete(fileid) - - def permute_peers(self, fileid): - permuted = [(sha(fileid+n.nid),n) - for n in self.get_all_nodes()] - permuted.sort() - return permuted - - def delete(self, fileid): - permuted = self.permute_peers(fileid) - size, tried, last_givento = self.living_files[fileid] - pid = "" - while tried and pid < last_givento: - pid,node = permuted.pop(0) - had_it = node.delete_share(fileid) - if had_it: - tried -= 1 - self.utilization -= size - self.simulator.stamp_utilization(self.utilization) - del self.living_files[fileid] - -class Simulator: - NUM_NODES = 1000 - EVENTS = ["ADDFILE", "DELFILE", "ADDNODE", "DELNODE"] - RATE_ADDFILE = 1.0 / 10 - RATE_DELFILE = 1.0 / 20 - RATE_ADDNODE = 1.0 / 3000 - RATE_DELNODE = 1.0 / 4000 - P_NODEAVAIL = 1.0 - - def __init__(self): - self.time = 1164783600 # small numbers of seconds since the epoch confuse rrdtool - self.prevstamptime = int(self.time) - - ds = DataSource(ds_name='utilizationds', ds_type='GAUGE', heartbeat=1) - rra = RRA(cf='AVERAGE', xff=0.1, steps=1, rows=1200) - self.rrd = RRD("/tmp/utilization.rrd", ds=[ds], rra=[rra], start=self.time) - self.rrd.create() - - self.queen = q = Queen(self) - self.all_nodes = [Node(randomid(), q, self) - for i in range(self.NUM_NODES)] - q.all_nodes = self.all_nodes - self.next = [] - self.schedule_events() - self.verbose = False - - self.added_files = 0 - self.added_data = 0 - self.deleted_files = 0 - self.published_files = [] - self.failed_files = 0 - self.lost_data_bytes = 0 # bytes deleted to make room for new shares - - def stamp_utilization(self, utilization): - if int(self.time) > (self.prevstamptime+1): - self.rrd.bufferValue(self.time, utilization) - self.prevstamptime = int(self.time) - - def write_graph(self): - self.rrd.update() - self.rrd = None - import gc - gc.collect() - - def1 = graph.DataDefinition(vname="a", rrdfile='/tmp/utilization.rrd', ds_name='utilizationds') - area1 = graph.Area(value="a", color="#990033", legend='utilizationlegend') - g = graph.Graph('/tmp/utilization.png', imgformat='PNG', width=540, height=100, vertical_label='utilizationverticallabel', title='utilizationtitle', lower_limit=0) - g.data.append(def1) - g.data.append(area1) - g.write() - - def add_file(self): - size = random.randrange(1000) - n = random.choice(self.all_nodes) - if self.verbose: - print "add_file(size=%d, from node %s)" % (size, n) - fileid = randomid() - able = n.publish_file(fileid, size) - if able: - able, tried = able - self.added_files += 1 - self.added_data += size - self.published_files.append(tried) - else: - self.failed_files += 1 - - def lost_data(self, size): - self.lost_data_bytes += size - - def delete_file(self): - all_nodes = self.all_nodes[:] - random.shuffle(all_nodes) - for n in all_nodes: - if n.delete_file(): - self.deleted_files += 1 - return - print "no files to delete" - - def _add_event(self, etype): - rate = getattr(self, "RATE_" + etype) - next = self.time + random.expovariate(rate) - self.next.append((next, etype)) - self.next.sort() - - def schedule_events(self): - types = set([e[1] for e in self.next]) - for etype in self.EVENTS: - if not etype in types: - self._add_event(etype) - - def do_event(self): - time, etype = self.next.pop(0) - assert time > self.time - current_time = self.time - self.time = time - self._add_event(etype) - if etype == "ADDFILE": - self.add_file() - elif etype == "DELFILE": - self.delete_file() - elif etype == "ADDNODE": - pass - #self.add_node() - elif etype == "DELNODE": - #self.del_node() - pass - # self.print_stats(current_time, etype) - - def print_stats_header(self): - print "time: added failed lost avg_tried" - - def print_stats(self, time, etype): - if not self.published_files: - avg_tried = "NONE" - else: - avg_tried = sum(self.published_files) / len(self.published_files) - print time, etype, self.added_data, self.failed_files, self.lost_data_bytes, avg_tried, len(self.queen.living_files), self.queen.utilization - -global s -s = None - -def main(): -# rrdtool.create("foo.rrd", -# "--step 10", -# "DS:files-added:DERIVE::0:1000", -# "RRA:AVERAGE:1:1:1200", -# ) - global s - s = Simulator() - # s.print_stats_header() - for i in range(1000): - s.do_event() - print "%d files added, %d files deleted" % (s.added_files, s.deleted_files) - return s - -if __name__ == '__main__': - main() - - diff --git a/sizes.py b/sizes.py deleted file mode 100644 index 5d7eec04..00000000 --- a/sizes.py +++ /dev/null @@ -1,213 +0,0 @@ -#! /usr/bin/env python - -import random, math, os, re -from twisted.python import usage - -class Args(usage.Options): - optParameters = [ - ["mode", "m", "alpha", "validation scheme"], - ["arity", "k", 2, "k (airty) for hash tree"], - ] - def opt_arity(self, option): - self['arity'] = int(option) - def parseArgs(self, *args): - if len(args) > 0: - self['mode'] = args[0] - - -def charttest(): - import gdchart - sizes = [random.randrange(10, 20) for i in range(10)] - x = gdchart.Line() - x.width = 250 - x.height = 250 - x.xtitle = "sample" - x.ytitle = "size" - x.title = "Example Graph" - #x.ext_color = [ "white", "yellow", "red", "blue", "green"] - x.setData(sizes) - #x.setLabels(["Mon", "Tue", "Wed", "Thu", "Fri"]) - x.draw("simple.png") - -KiB=1024 -MiB=1024*KiB -GiB=1024*MiB -TiB=1024*GiB -PiB=1024*TiB - -class Sizes: - def __init__(self, mode, file_size, arity=2): - MAX_SEGSIZE = 2*MiB - self.mode = mode - self.file_size = file_size - self.seg_size = seg_size = 1.0 * min(MAX_SEGSIZE, file_size) - self.num_segs = num_segs = math.ceil(file_size / seg_size) - self.num_subblocks = num_subblocks = num_segs - - self.num_blocks = num_blocks = 100 - self.blocks_needed = blocks_needed = 25 - - self.subblock_size = subblock_size = seg_size / blocks_needed - self.block_size = block_size = subblock_size * num_subblocks - - # none of this includes the block-level hash chain yet, since that is - # only a function of the number of blocks. All overhead numbers - # assume that the block-level hash chain has already been sent, - # including the root of the subblock-level hash tree. - - if mode == "alpha": - # no hash tree at all - self.subblock_arity = 0 - self.subblock_tree_depth = 0 - self.subblock_overhead = 0 - self.bytes_until_some_data = 20 + block_size - self.block_storage_overhead = 0 - self.block_transmission_overhead = 0 - - elif mode == "beta": - # k=num_subblocks, d=1 - # each subblock has a 20-byte hash - self.subblock_arity = num_subblocks - self.subblock_tree_depth = 1 - self.subblock_overhead = 20 - # the block has a list of hashes, one for each subblock - self.block_storage_overhead = (self.subblock_overhead * - num_subblocks) - # we can get away with not sending the hash of the block that - # we're sending in full, once - self.block_transmission_overhead = self.block_storage_overhead - 20 - # we must get the whole list (so it can be validated) before - # any data can be validated - self.bytes_until_some_data = (self.block_transmission_overhead + - subblock_size) - - elif mode == "gamma": - self.subblock_arity = k = arity - d = math.ceil(math.log(num_subblocks, k)) - self.subblock_tree_depth = d - num_leaves = k ** d - # to make things easier, we make the pessimistic assumption that - # we have to store hashes for all the empty places in the tree - # (when the number of blocks is not an exact exponent of k) - self.subblock_overhead = 20 - # the subblock hashes are organized into a k-ary tree, which - # means storing (and eventually transmitting) more hashes. This - # count includes all the low-level block hashes and the root. - hash_nodes = (num_leaves*k - 1) / (k - 1) - #print "hash_depth", d - #print "num_leaves", num_leaves - #print "hash_nodes", hash_nodes - # the storage overhead is this - self.block_storage_overhead = 20 * (hash_nodes - 1) - # the transmission overhead is smaller: if we actually transmit - # every subblock, we don't have to transmit 1/k of the - # lowest-level subblock hashes, and we don't have to transmit the - # root because it was already sent with the block-level hash tree - self.block_transmission_overhead = 20 * (hash_nodes - - 1 # the root - - num_leaves / k) - # we must get a full sibling hash chain before we can validate - # any data - sibling_length = d * (k-1) - self.bytes_until_some_data = 20 * sibling_length + subblock_size - - - - else: - raise RuntimeError("unknown mode '%s" % mode) - - self.storage_overhead = self.block_storage_overhead * num_blocks - self.storage_overhead_percentage = 100.0 * self.storage_overhead / file_size - - def dump(self): - for k in ("mode", "file_size", "seg_size", - "num_segs", "num_subblocks", "num_blocks", "blocks_needed", - "subblock_size", "block_size", - "subblock_arity", "subblock_tree_depth", - "subblock_overhead", - "block_storage_overhead", "block_transmission_overhead", - "storage_overhead", "storage_overhead_percentage", - "bytes_until_some_data"): - print k, getattr(self, k) - -def fmt(num, trim=False): - if num < KiB: - #s = str(num) + "#" - s = "%.2f#" % num - elif num < MiB: - s = "%.2fk" % (num / KiB) - elif num < GiB: - s = "%.2fM" % (num / MiB) - elif num < TiB: - s = "%.2fG" % (num / GiB) - elif num < PiB: - s = "%.2fT" % (num / TiB) - else: - s = "big" - if trim: - s = re.sub(r'(\.0+)([kMGT#])', - lambda m: m.group(2), - s) - else: - s = re.sub(r'(\.0+)([kMGT#])', - lambda m: (" "*len(m.group(1))+m.group(2)), - s) - if s.endswith("#"): - s = s[:-1] + " " - return s - -def text(): - opts = Args() - opts.parseOptions() - mode = opts["mode"] - arity = opts["arity"] - # 0123456789012345678901234567890123456789012345678901234567890123456 - print "mode=%s" % mode, " arity=%d" % arity - print " storage storage" - print "Size blocksize overhead overhead k d alacrity" - print " (bytes) (%)" - print "------- ------- -------- -------- ---- -- --------" - #sizes = [2 ** i for i in range(7, 41)] - radix = math.sqrt(10); expstep = 2 - radix = 2; expstep = 2 - #radix = 10; expstep = 1 - maxexp = int(math.ceil(math.log(1e12, radix)))+2 - sizes = [radix ** i for i in range(2,maxexp,expstep)] - for file_size in sizes: - s = Sizes(mode, file_size, arity) - out = "" - out += "%7s " % fmt(file_size, trim=True) - out += "%7s " % fmt(s.block_size) - out += "%8s" % fmt(s.storage_overhead) - out += "%10.2f " % s.storage_overhead_percentage - out += " %4d" % int(s.subblock_arity) - out += " %2d" % int(s.subblock_tree_depth) - out += " %8s" % fmt(s.bytes_until_some_data) - print out - - -def graph(): - # doesn't work yet - import Gnuplot - opts = Args() - opts.parseOptions() - mode = opts["mode"] - arity = opts["arity"] - g = Gnuplot.Gnuplot(debug=1) - g.title("overhead / alacrity tradeoffs") - g.xlabel("file size") - g.ylabel("stuff") - sizes = [2 ** i for i in range(7, 32)] - series = {"overhead": {}, "alacrity": {}} - for file_size in sizes: - s = Sizes(mode, file_size, arity) - series["overhead"][file_size] = s.storage_overhead_percentage - series["alacrity"][file_size] = s.bytes_until_some_data - g.plot([ (fs, series["overhead"][fs]) - for fs in sizes ]) - raw_input("press return") - - -if __name__ == '__main__': - text() - #graph()