misc/ringsim.py: tool to discuss #302
authorBrian Warner <warner@lothar.com>
Sat, 26 Dec 2009 06:03:39 +0000 (01:03 -0500)
committerBrian Warner <warner@lothar.com>
Sat, 26 Dec 2009 06:03:39 +0000 (01:03 -0500)
misc/ringsim.py [new file with mode: 0755]

diff --git a/misc/ringsim.py b/misc/ringsim.py
new file mode 100755 (executable)
index 0000000..b25c562
--- /dev/null
@@ -0,0 +1,202 @@
+#! /usr/bin/python
+
+# used to discuss ticket #302: "stop permuting peerlist?"
+
+import time
+import random, math
+from hashlib import sha1, md5, sha256
+sha1 = md5
+# md5: 1520 "uploads" per second
+# sha1: 1350 ups
+# sha256: 930 ups
+from itertools import count
+from twisted.python import usage
+
+def abbreviate_space(s, SI=True):
+    if s is None:
+        return "unknown"
+    if SI:
+        U = 1000.0
+        isuffix = "B"
+    else:
+        U = 1024.0
+        isuffix = "iB"
+    def r(count, suffix):
+        return "%.2f %s%s" % (count, suffix, isuffix)
+
+    if s < 1024: # 1000-1023 get emitted as bytes, even in SI mode
+        return "%d B" % s
+    if s < U*U:
+        return r(s/U, "k")
+    if s < U*U*U:
+        return r(s/(U*U), "M")
+    if s < U*U*U*U:
+        return r(s/(U*U*U), "G")
+    if s < U*U*U*U*U:
+        return r(s/(U*U*U*U), "T")
+    return r(s/(U*U*U*U*U), "P")
+
+def make_up_a_file_size(max=2**31):
+    #return (2 ** random.randrange(8, 31)) # avg=??
+    return random.randrange(max) # avg 1GB
+sizes = [make_up_a_file_size() for i in range(10000)]
+avg_filesize = sum(sizes)/len(sizes)
+print "average file size:", abbreviate_space(avg_filesize)
+
+SERVER_CAPACITY = 10**12 * 1000
+
+class Server:
+    def __init__(self, nodeid, capacity):
+        self.nodeid = nodeid
+        self.used = 0
+        self.capacity = capacity
+        self.numshares = 0
+        self.full_at_tick = None
+
+    def upload(self, sharesize):
+        if self.used + sharesize < self.capacity:
+            self.used += sharesize
+            self.numshares += 1
+            return True
+        return False
+
+    def __repr__(self):
+        if self.full_at_tick is not None:
+            return "<%s %s full at %d>" % (self.__class__.__name__, self.nodeid, self.full_at_tick)
+        else:
+            return "<%s %s>" % (self.__class__.__name__, self.nodeid)
+
+class Ring:
+    def __init__(self, numservers, seed, permute):
+        self.servers = []
+        for i in range(numservers):
+            nodeid = sha1(str(seed)+str(i)).hexdigest()
+            capacity = SERVER_CAPACITY
+            s = Server(nodeid, capacity)
+            self.servers.append(s)
+        self.servers.sort(key=lambda s: s.nodeid)
+        for i in range(len(self.servers)):
+            s = self.servers[i]
+            next_s = self.servers[(i+1)%len(self.servers)]
+            diff = "%032x" % (int(next_s.nodeid,16) - int(s.nodeid,16))
+            s.next_diff = diff
+            prev_s = self.servers[(i-1)%len(self.servers)]
+            diff = "%032x" % (int(s.nodeid,16) - int(prev_s.nodeid,16))
+            s.prev_diff = diff
+            print s, s.prev_diff
+
+        print "sorted by delta"
+        for s in sorted(self.servers, key=lambda s:s.prev_diff):
+            print s, s.prev_diff
+        self.permute = permute
+
+    def servers_for_si(self, si):
+        if self.permute:
+            def sortkey(s):
+                return sha1(s.nodeid+si).digest()
+            return sorted(self.servers, key=sortkey)
+        for i in range(len(self.servers)):
+            if self.servers[i].nodeid >= si:
+                return self.servers[i:] + self.servers[:i]
+        return list(self.servers)
+
+    def show_servers(self, picked):
+        bits = []
+        for s in self.servers:
+            if s in picked:
+                bits.append("1")
+            else:
+                bits.append("0")
+        #d = [s in picked and "1" or "0" for s in self.servers]
+        return "".join(bits)
+
+    def dump_usage(self, numfiles, avg_space_per_file):
+        print "uploaded", numfiles
+        # avg_space_per_file measures expected grid-wide ciphertext per file
+        used = list(reversed(sorted([s.used for s in self.servers])))
+        # used is actual per-server ciphertext
+        usedpf = [1.0*u/numfiles for u in used]
+        # usedpf is actual per-server-per-file ciphertext
+        #print "min/max usage: %s/%s" % (abbreviate_space(used[-1]),
+        #                                abbreviate_space(used[0]))
+        avg_usage_per_file = avg_space_per_file/len(self.servers)
+        # avg_usage_per_file is expected per-server-per-file ciphertext
+        spreadpf = usedpf[0] - usedpf[-1]
+        average_usagepf = sum(usedpf) / len(usedpf)
+        variance = sum([(u-average_usagepf)**2 for u in usedpf])/(len(usedpf)-1)
+        std_deviation = math.sqrt(variance)
+        sd_of_total = std_deviation / avg_usage_per_file
+
+        print "min/max/(exp) usage-pf-ps %s/%s/(%s):" % (
+            abbreviate_space(usedpf[-1]),
+            abbreviate_space(usedpf[0]),
+            abbreviate_space(avg_usage_per_file) ),
+        print "spread-pf: %s (%.2f%%)" % (
+            abbreviate_space(spreadpf), 100.0*spreadpf/avg_usage_per_file),
+        #print "average_usage:", abbreviate_space(average_usagepf)
+        print "stddev: %s (%.2f%%)" % (abbreviate_space(std_deviation),
+                                       100.0*sd_of_total)
+        s2 = sorted(self.servers, key=lambda s: s.used)
+        print "least:", s2[0].nodeid
+        print "most:", s2[-1].nodeid
+
+
+class Options(usage.Options):
+    optParameters = [
+        ("k", "k", 3, "required shares", int),
+        ("N", "N", 10, "total shares", int),
+        ("servers", None, 100, "number of servers", int),
+        ("seed", None, None, "seed to use for creating ring"),
+        ("permute", "p", 1, "1 to permute, 0 to use flat ring", int),
+        ]
+    def postOptions(self):
+        assert self["seed"]
+
+
+def do_run(ring, opts):
+    avg_space_per_file = avg_filesize * opts["N"] / opts["k"]
+    start = time.time()
+    for filenum in count(0):
+        #used = list(reversed(sorted([s.used for s in ring.servers])))
+        #used = [s.used for s in ring.servers]
+        #print used
+        filesize = make_up_a_file_size()
+        sharesize = filesize / opts["k"]
+        si = sha1(str(random.randrange(2**40))).hexdigest()
+        if filenum%4000==0 and filenum > 1:
+            ring.dump_usage(filenum, avg_space_per_file)
+        servers = ring.servers_for_si(si)
+        #print ring.show_servers(servers[:opts["N"]])
+        remaining_shares = opts["N"]
+        index = 0
+        while remaining_shares:
+            s = servers[index]
+            accepted = s.upload(sharesize)
+            if not accepted:
+                return filenum # number of files successfully uploaded
+            remaining_shares -= 1
+            index += 1
+
+
+def do_ring(opts):
+    #seed = str(random.randrange(2**31))
+    total_capacity = opts["servers"]*SERVER_CAPACITY
+    avg_space_per_file = avg_filesize * opts["N"] / opts["k"]
+    avg_files = total_capacity / avg_space_per_file
+    print "expected number of uploads:", avg_files
+    if opts["permute"]:
+        print " PERMUTED"
+    else:
+        print " LINEAR"
+    seed = opts["seed"]
+
+    ring = Ring(opts["servers"], seed, opts["permute"])
+    num_files = do_run(ring, opts)
+
+def run(opts):
+    do_ring(opts)
+
+if __name__ == "__main__":
+    opts = Options()
+    opts.parseOptions()
+    run(opts)