From: Brian Warner Date: Sat, 26 Dec 2009 06:03:39 +0000 (-0500) Subject: misc/ringsim.py: tool to discuss #302 X-Git-Tag: trac-4200~65 X-Git-Url: https://git.rkrishnan.org/specifications/%5B/%5D%20/statistics?a=commitdiff_plain;h=aa102da8041e181bd7f0d45f86f7ff9512cde0ee;p=tahoe-lafs%2Ftahoe-lafs.git misc/ringsim.py: tool to discuss #302 --- diff --git a/misc/ringsim.py b/misc/ringsim.py new file mode 100755 index 00000000..b25c5629 --- /dev/null +++ b/misc/ringsim.py @@ -0,0 +1,202 @@ +#! /usr/bin/python + +# used to discuss ticket #302: "stop permuting peerlist?" + +import time +import random, math +from hashlib import sha1, md5, sha256 +sha1 = md5 +# md5: 1520 "uploads" per second +# sha1: 1350 ups +# sha256: 930 ups +from itertools import count +from twisted.python import usage + +def abbreviate_space(s, SI=True): + if s is None: + return "unknown" + if SI: + U = 1000.0 + isuffix = "B" + else: + U = 1024.0 + isuffix = "iB" + def r(count, suffix): + return "%.2f %s%s" % (count, suffix, isuffix) + + if s < 1024: # 1000-1023 get emitted as bytes, even in SI mode + return "%d B" % s + if s < U*U: + return r(s/U, "k") + if s < U*U*U: + return r(s/(U*U), "M") + if s < U*U*U*U: + return r(s/(U*U*U), "G") + if s < U*U*U*U*U: + return r(s/(U*U*U*U), "T") + return r(s/(U*U*U*U*U), "P") + +def make_up_a_file_size(max=2**31): + #return (2 ** random.randrange(8, 31)) # avg=?? + return random.randrange(max) # avg 1GB +sizes = [make_up_a_file_size() for i in range(10000)] +avg_filesize = sum(sizes)/len(sizes) +print "average file size:", abbreviate_space(avg_filesize) + +SERVER_CAPACITY = 10**12 * 1000 + +class Server: + def __init__(self, nodeid, capacity): + self.nodeid = nodeid + self.used = 0 + self.capacity = capacity + self.numshares = 0 + self.full_at_tick = None + + def upload(self, sharesize): + if self.used + sharesize < self.capacity: + self.used += sharesize + self.numshares += 1 + return True + return False + + def __repr__(self): + if self.full_at_tick is not None: + return "<%s %s full at %d>" % (self.__class__.__name__, self.nodeid, self.full_at_tick) + else: + return "<%s %s>" % (self.__class__.__name__, self.nodeid) + +class Ring: + def __init__(self, numservers, seed, permute): + self.servers = [] + for i in range(numservers): + nodeid = sha1(str(seed)+str(i)).hexdigest() + capacity = SERVER_CAPACITY + s = Server(nodeid, capacity) + self.servers.append(s) + self.servers.sort(key=lambda s: s.nodeid) + for i in range(len(self.servers)): + s = self.servers[i] + next_s = self.servers[(i+1)%len(self.servers)] + diff = "%032x" % (int(next_s.nodeid,16) - int(s.nodeid,16)) + s.next_diff = diff + prev_s = self.servers[(i-1)%len(self.servers)] + diff = "%032x" % (int(s.nodeid,16) - int(prev_s.nodeid,16)) + s.prev_diff = diff + print s, s.prev_diff + + print "sorted by delta" + for s in sorted(self.servers, key=lambda s:s.prev_diff): + print s, s.prev_diff + self.permute = permute + + def servers_for_si(self, si): + if self.permute: + def sortkey(s): + return sha1(s.nodeid+si).digest() + return sorted(self.servers, key=sortkey) + for i in range(len(self.servers)): + if self.servers[i].nodeid >= si: + return self.servers[i:] + self.servers[:i] + return list(self.servers) + + def show_servers(self, picked): + bits = [] + for s in self.servers: + if s in picked: + bits.append("1") + else: + bits.append("0") + #d = [s in picked and "1" or "0" for s in self.servers] + return "".join(bits) + + def dump_usage(self, numfiles, avg_space_per_file): + print "uploaded", numfiles + # avg_space_per_file measures expected grid-wide ciphertext per file + used = list(reversed(sorted([s.used for s in self.servers]))) + # used is actual per-server ciphertext + usedpf = [1.0*u/numfiles for u in used] + # usedpf is actual per-server-per-file ciphertext + #print "min/max usage: %s/%s" % (abbreviate_space(used[-1]), + # abbreviate_space(used[0])) + avg_usage_per_file = avg_space_per_file/len(self.servers) + # avg_usage_per_file is expected per-server-per-file ciphertext + spreadpf = usedpf[0] - usedpf[-1] + average_usagepf = sum(usedpf) / len(usedpf) + variance = sum([(u-average_usagepf)**2 for u in usedpf])/(len(usedpf)-1) + std_deviation = math.sqrt(variance) + sd_of_total = std_deviation / avg_usage_per_file + + print "min/max/(exp) usage-pf-ps %s/%s/(%s):" % ( + abbreviate_space(usedpf[-1]), + abbreviate_space(usedpf[0]), + abbreviate_space(avg_usage_per_file) ), + print "spread-pf: %s (%.2f%%)" % ( + abbreviate_space(spreadpf), 100.0*spreadpf/avg_usage_per_file), + #print "average_usage:", abbreviate_space(average_usagepf) + print "stddev: %s (%.2f%%)" % (abbreviate_space(std_deviation), + 100.0*sd_of_total) + s2 = sorted(self.servers, key=lambda s: s.used) + print "least:", s2[0].nodeid + print "most:", s2[-1].nodeid + + +class Options(usage.Options): + optParameters = [ + ("k", "k", 3, "required shares", int), + ("N", "N", 10, "total shares", int), + ("servers", None, 100, "number of servers", int), + ("seed", None, None, "seed to use for creating ring"), + ("permute", "p", 1, "1 to permute, 0 to use flat ring", int), + ] + def postOptions(self): + assert self["seed"] + + +def do_run(ring, opts): + avg_space_per_file = avg_filesize * opts["N"] / opts["k"] + start = time.time() + for filenum in count(0): + #used = list(reversed(sorted([s.used for s in ring.servers]))) + #used = [s.used for s in ring.servers] + #print used + filesize = make_up_a_file_size() + sharesize = filesize / opts["k"] + si = sha1(str(random.randrange(2**40))).hexdigest() + if filenum%4000==0 and filenum > 1: + ring.dump_usage(filenum, avg_space_per_file) + servers = ring.servers_for_si(si) + #print ring.show_servers(servers[:opts["N"]]) + remaining_shares = opts["N"] + index = 0 + while remaining_shares: + s = servers[index] + accepted = s.upload(sharesize) + if not accepted: + return filenum # number of files successfully uploaded + remaining_shares -= 1 + index += 1 + + +def do_ring(opts): + #seed = str(random.randrange(2**31)) + total_capacity = opts["servers"]*SERVER_CAPACITY + avg_space_per_file = avg_filesize * opts["N"] / opts["k"] + avg_files = total_capacity / avg_space_per_file + print "expected number of uploads:", avg_files + if opts["permute"]: + print " PERMUTED" + else: + print " LINEAR" + seed = opts["seed"] + + ring = Ring(opts["servers"], seed, opts["permute"]) + num_files = do_run(ring, opts) + +def run(opts): + do_ring(opts) + +if __name__ == "__main__": + opts = Options() + opts.parseOptions() + run(opts)