From: Brian Warner <warner@lothar.com>
Date: Thu, 19 Feb 2009 04:46:33 +0000 (-0700)
Subject: #633: first version of a rate-limited interruptable share-crawler
X-Git-Tag: allmydata-tahoe-1.4.0~190
X-Git-Url: https://git.rkrishnan.org/frontends/module-simplejson._speedups.html?a=commitdiff_plain;h=193889f793ce8bc2541b91395af1f32138ffe7c8;p=tahoe-lafs%2Ftahoe-lafs.git

#633: first version of a rate-limited interruptable share-crawler
---

diff --git a/src/allmydata/storage/crawler.py b/src/allmydata/storage/crawler.py
new file mode 100644
index 00000000..071f2060
--- /dev/null
+++ b/src/allmydata/storage/crawler.py
@@ -0,0 +1,183 @@
+
+import os, time, struct, pickle
+from twisted.internet import reactor
+from twisted.application import service
+from allmydata.storage.server import si_b2a
+
+class TimeSliceExceeded(Exception):
+    pass
+
+class ShareCrawler(service.MultiService):
+    """A ShareCrawler subclass is attached to a StorageServer, and
+    periodically walks all of its shares, processing each one in some
+    fashion. This crawl is rate-limited, to reduce the IO burden on the host,
+    since large servers will have several million shares, which can take
+    hours or days to read.
+
+    We assume that the normal upload/download/get_buckets traffic of a tahoe
+    grid will cause the prefixdir contents to be mostly cached, or that the
+    number of buckets in each prefixdir will be small enough to load quickly.
+    A 1TB allmydata.com server was measured to have 2.56M buckets, spread
+    into the 1040 prefixdirs, with about 2460 buckets per prefix. On this
+    server, each prefixdir took 130ms-200ms to list the first time, and 17ms
+    to list the second time.
+
+    To use this, create a subclass which implements the process_bucket()
+    method. It will be called with a prefixdir and a base32 storage index
+    string. process_bucket() should run synchronously.
+
+    Then create an instance, with a reference to a StorageServer and a
+    filename where it can store persistent state. The statefile is used to
+    keep track of how far around the ring the process has travelled, as well
+    as timing history to allow the pace to be predicted and controlled. The
+    statefile will be updated and written to disk after every bucket is
+    processed.
+
+    The crawler instance must be started with startService() before it will
+    do any work. To make it stop doing work, call stopService() and wait for
+    the Deferred that it returns.
+    """
+
+    # use up to 10% of the CPU, on average. This can be changed at any time.
+    allowed_cpu_percentage = .10
+    # use up to 1.0 seconds before yielding. This can be changed at any time.
+    cpu_slice = 1.0
+    # don't run a cycle faster than this
+    minimum_cycle_time = 300
+
+    def __init__(self, server, statefile):
+        service.MultiService.__init__(self)
+        self.server = server
+        self.sharedir = server.sharedir
+        self.statefile = statefile
+        self.prefixes = [si_b2a(struct.pack(">H", i << (16-10)))[:2]
+                         for i in range(2**10)]
+        self.prefixes.sort()
+        self.timer = None
+        self.bucket_cache = (None, [])
+        self.first_cycle_finished = False
+
+    def load_state(self):
+        try:
+            f = open(self.statefile, "rb")
+            state = pickle.load(f)
+            lcp = state["last-complete-prefix"]
+            if lcp == None:
+                self.last_complete_prefix_index = -1
+            else:
+                self.last_complete_prefix_index = self.prefixes.index(lcp)
+            self.last_complete_bucket = state["last-complete-bucket"]
+            self.first_cycle_finished = state["first-cycle-finished"]
+            f.close()
+        except EnvironmentError:
+            self.last_complete_prefix_index = -1
+            self.last_complete_bucket = None
+            self.first_cycle_finished = False
+
+    def save_state(self):
+        lcpi = self.last_complete_prefix_index
+        if lcpi == -1:
+            last_complete_prefix = None
+        else:
+            last_complete_prefix = self.prefixes[lcpi]
+        state = {"version": 1,
+                 "last-complete-prefix": last_complete_prefix,
+                 "last-complete-bucket": self.last_complete_bucket,
+                 "first-cycle-finished": self.first_cycle_finished,
+                 }
+        tmpfile = self.statefile + ".tmp"
+        f = open(tmpfile, "wb")
+        pickle.dump(state, f)
+        f.close()
+        os.rename(tmpfile, self.statefile)
+
+    def startService(self):
+        self.load_state()
+        self.timer = reactor.callLater(0, self.start_slice)
+        service.MultiService.startService(self)
+
+    def stopService(self):
+        if self.timer:
+            self.timer.cancel()
+            self.timer = None
+        return service.MultiService.stopService(self)
+
+    def start_slice(self):
+        self.timer = None
+        start_slice = time.time()
+        try:
+            self.start_current_prefix(start_slice)
+            finished_cycle = True
+        except TimeSliceExceeded:
+            finished_cycle = False
+        # either we finished a whole cycle, or we ran out of time
+        this_slice = time.time() - start_slice
+        # this_slice/(this_slice+sleep_time) = percentage
+        # this_slice/percentage = this_slice+sleep_time
+        # sleep_time = (this_slice/percentage) - this_slice
+        sleep_time = (this_slice / self.allowed_cpu_percentage) - this_slice
+        # if the math gets weird, or a timequake happens, don't sleep forever
+        sleep_time = max(0.0, min(sleep_time, 299))
+        if finished_cycle:
+            # how long should we sleep between cycles? Don't run faster than
+            # allowed_cpu_percentage says, but also run faster than
+            # minimum_cycle_time
+            self.sleeping_between_cycles = True
+            sleep_time = max(sleep_time, self.minimum_cycle_time)
+        else:
+            self.sleeping_between_cycles = False
+        self.current_sleep_time = sleep_time # for status page
+        self.yielding(sleep_time)
+        self.timer = reactor.callLater(sleep_time, self.start_slice)
+
+    def start_current_prefix(self, start_slice):
+        for i in range(self.last_complete_prefix_index+1, len(self.prefixes)):
+            if time.time() > start_slice + self.cpu_slice:
+                raise TimeSliceExceeded()
+            # if we want to yield earlier, just raise TimeSliceExceeded()
+            prefix = self.prefixes[i]
+            prefixdir = os.path.join(self.sharedir, prefix)
+            if i == self.bucket_cache[0]:
+                buckets = self.bucket_cache[1]
+            else:
+                try:
+                    buckets = os.listdir(prefixdir)
+                    buckets.sort()
+                except EnvironmentError:
+                    buckets = []
+                self.bucket_cache = (i, buckets)
+            self.process_prefixdir(prefixdir, buckets, start_slice)
+            self.last_complete_prefix_index = i
+            self.save_state()
+        # yay! we finished the whole cycle
+        self.last_complete_prefix_index = -1
+        self.last_complete_bucket = None
+        self.first_cycle_finished = True
+        self.save_state()
+        self.finished_cycle()
+
+    def process_prefixdir(self, prefixdir, buckets, start_slice):
+        """This gets a list of bucket names (i.e. storage index strings,
+        base32-encoded) in sorted order.
+
+        Override this if your crawler doesn't care about the actual shares,
+        for example a crawler which merely keeps track of how many buckets
+        are being managed by this server.
+        """
+        for bucket in buckets:
+            if bucket <= self.last_complete_bucket:
+                continue
+            if time.time() > start_slice + self.cpu_slice:
+                raise TimeSliceExceeded()
+            self.process_bucket(prefixdir, bucket)
+            self.last_complete_bucket = bucket
+            self.save_state()
+
+    def process_bucket(self, prefixdir, storage_index_b32):
+        pass
+
+    def finished_cycle(self):
+        pass
+
+    def yielding(self, sleep_time):
+        pass
diff --git a/src/allmydata/test/test_crawler.py b/src/allmydata/test/test_crawler.py
new file mode 100644
index 00000000..dfee79a3
--- /dev/null
+++ b/src/allmydata/test/test_crawler.py
@@ -0,0 +1,320 @@
+
+import time
+import os.path
+from twisted.trial import unittest
+from twisted.application import service
+from twisted.internet import defer
+from foolscap.eventual import eventually
+
+from allmydata.util import fileutil, hashutil, pollmixin
+from allmydata.storage.server import StorageServer, si_b2a
+from allmydata.storage.crawler import ShareCrawler, TimeSliceExceeded
+
+from test_storage import FakeCanary
+from common_util import StallMixin
+
+class BucketEnumeratingCrawler(ShareCrawler):
+    cpu_slice = 500 # make sure it can complete in a single slice
+    def __init__(self, server, statefile):
+        ShareCrawler.__init__(self, server, statefile)
+        self.all_buckets = []
+        self.finished_d = defer.Deferred()
+    def process_bucket(self, prefixdir, storage_index_b32):
+        self.all_buckets.append(storage_index_b32)
+    def finished_cycle(self):
+        eventually(self.finished_d.callback, None)
+
+class PacedCrawler(ShareCrawler):
+    cpu_slice = 500 # make sure it can complete in a single slice
+    def __init__(self, server, statefile):
+        ShareCrawler.__init__(self, server, statefile)
+        self.countdown = 6
+        self.all_buckets = []
+        self.finished_d = defer.Deferred()
+    def process_bucket(self, prefixdir, storage_index_b32):
+        self.all_buckets.append(storage_index_b32)
+        self.countdown -= 1
+        if self.countdown == 0:
+            # force a timeout. We restore it in yielding()
+            self.cpu_slice = -1.0
+    def yielding(self, sleep_time):
+        self.cpu_slice = 500
+    def finished_cycle(self):
+        eventually(self.finished_d.callback, None)
+
+class ConsumingCrawler(ShareCrawler):
+    cpu_slice = 0.5
+    allowed_cpu_percentage = 0.5
+    minimum_cycle_time = 0
+
+    def __init__(self, server, statefile):
+        ShareCrawler.__init__(self, server, statefile)
+        self.accumulated = 0.0
+        self.cycles = 0
+        self.last_yield = 0.0
+    def process_bucket(self, prefixdir, storage_index_b32):
+        start = time.time()
+        time.sleep(0.05)
+        elapsed = time.time() - start
+        self.accumulated += elapsed
+        self.last_yield += elapsed
+    def finished_cycle(self):
+        self.cycles += 1
+    def yielding(self, sleep_time):
+        self.last_yield = 0.0
+
+class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
+    def setUp(self):
+        self.s = service.MultiService()
+        self.s.startService()
+
+    def tearDown(self):
+        return self.s.stopService()
+
+    def si(self, i):
+        return hashutil.storage_index_hash(str(i))
+    def rs(self, i, serverid):
+        return hashutil.bucket_renewal_secret_hash(str(i), serverid)
+    def cs(self, i, serverid):
+        return hashutil.bucket_cancel_secret_hash(str(i), serverid)
+
+    def write(self, i, ss, serverid, tail=0):
+        si = self.si(i)
+        si = si[:-1] + chr(tail)
+        had,made = ss.remote_allocate_buckets(si,
+                                              self.rs(i, serverid),
+                                              self.cs(i, serverid),
+                                              set([0]), 99, FakeCanary())
+        made[0].remote_write(0, "data")
+        made[0].remote_close()
+        return si_b2a(si)
+
+    def test_immediate(self):
+        self.basedir = "crawler/Basic/immediate"
+        fileutil.make_dirs(self.basedir)
+        serverid = "\x00" * 20
+        ss = StorageServer(self.basedir, serverid)
+        ss.setServiceParent(self.s)
+
+        sis = [self.write(i, ss, serverid) for i in range(10)]
+        statefile = os.path.join(self.basedir, "statefile")
+
+        c = BucketEnumeratingCrawler(ss, statefile)
+        c.load_state()
+
+        c.start_current_prefix(time.time())
+        self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+
+        # make sure the statefile has been returned to the starting point
+        c.finished_d = defer.Deferred()
+        c.all_buckets = []
+        c.start_current_prefix(time.time())
+        self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+
+        # check that a new crawler picks up on the state file properly
+        c2 = BucketEnumeratingCrawler(ss, statefile)
+        c2.load_state()
+
+        c2.start_current_prefix(time.time())
+        self.failUnlessEqual(sorted(sis), sorted(c2.all_buckets))
+
+    def test_service(self):
+        self.basedir = "crawler/Basic/service"
+        fileutil.make_dirs(self.basedir)
+        serverid = "\x00" * 20
+        ss = StorageServer(self.basedir, serverid)
+        ss.setServiceParent(self.s)
+
+        sis = [self.write(i, ss, serverid) for i in range(10)]
+
+        statefile = os.path.join(self.basedir, "statefile")
+        c = BucketEnumeratingCrawler(ss, statefile)
+        c.setServiceParent(self.s)
+
+        d = c.finished_d
+        def _check(ignored):
+            self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+        d.addCallback(_check)
+        return d
+
+    def test_paced(self):
+        self.basedir = "crawler/Basic/paced"
+        fileutil.make_dirs(self.basedir)
+        serverid = "\x00" * 20
+        ss = StorageServer(self.basedir, serverid)
+        ss.setServiceParent(self.s)
+
+        # put four buckets in each prefixdir
+        sis = []
+        for i in range(10):
+            for tail in range(4):
+                sis.append(self.write(i, ss, serverid, tail))
+
+        statefile = os.path.join(self.basedir, "statefile")
+
+        c = PacedCrawler(ss, statefile)
+        c.load_state()
+        try:
+            c.start_current_prefix(time.time())
+        except TimeSliceExceeded:
+            pass
+        # that should stop in the middle of one of the buckets.
+        c.cpu_slice = PacedCrawler.cpu_slice
+        self.failUnlessEqual(len(c.all_buckets), 6)
+        c.start_current_prefix(time.time()) # finish it
+        self.failUnlessEqual(len(sis), len(c.all_buckets))
+        self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+
+        # make sure the statefile has been returned to the starting point
+        c.finished_d = defer.Deferred()
+        c.all_buckets = []
+        c.start_current_prefix(time.time())
+        self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+        del c
+
+        # start a new crawler, it should start from the beginning
+        c = PacedCrawler(ss, statefile)
+        c.load_state()
+        try:
+            c.start_current_prefix(time.time())
+        except TimeSliceExceeded:
+            pass
+        # that should stop in the middle of one of the buckets
+        c.cpu_slice = PacedCrawler.cpu_slice
+
+        # a third crawler should pick up from where it left off
+        c2 = PacedCrawler(ss, statefile)
+        c2.all_buckets = c.all_buckets[:]
+        c2.load_state()
+        c2.countdown = -1
+        c2.start_current_prefix(time.time())
+        self.failUnlessEqual(len(sis), len(c2.all_buckets))
+        self.failUnlessEqual(sorted(sis), sorted(c2.all_buckets))
+        del c, c2
+
+        # now stop it at the end of a bucket (countdown=4), to exercise a
+        # different place that checks the time
+        c = PacedCrawler(ss, statefile)
+        c.load_state()
+        c.countdown = 4
+        try:
+            c.start_current_prefix(time.time())
+        except TimeSliceExceeded:
+            pass
+        # that should stop at the end of one of the buckets.
+        c.cpu_slice = PacedCrawler.cpu_slice
+        self.failUnlessEqual(len(c.all_buckets), 4)
+        c.start_current_prefix(time.time()) # finish it
+        self.failUnlessEqual(len(sis), len(c.all_buckets))
+        self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+        del c
+
+        # stop it again at the end of the bucket, check that a new checker
+        # picks up correctly
+        c = PacedCrawler(ss, statefile)
+        c.load_state()
+        c.countdown = 4
+        try:
+            c.start_current_prefix(time.time())
+        except TimeSliceExceeded:
+            pass
+        # that should stop at the end of one of the buckets.
+
+        c2 = PacedCrawler(ss, statefile)
+        c2.all_buckets = c.all_buckets[:]
+        c2.load_state()
+        c2.countdown = -1
+        c2.start_current_prefix(time.time())
+        self.failUnlessEqual(len(sis), len(c2.all_buckets))
+        self.failUnlessEqual(sorted(sis), sorted(c2.all_buckets))
+        del c, c2
+
+    def test_paced_service(self):
+        self.basedir = "crawler/Basic/paced_service"
+        fileutil.make_dirs(self.basedir)
+        serverid = "\x00" * 20
+        ss = StorageServer(self.basedir, serverid)
+        ss.setServiceParent(self.s)
+
+        sis = [self.write(i, ss, serverid) for i in range(10)]
+
+        statefile = os.path.join(self.basedir, "statefile")
+        c = PacedCrawler(ss, statefile)
+        c.setServiceParent(self.s)
+        # that should get through 6 buckets, pause for a little while, then
+        # resume
+
+        d = c.finished_d
+        def _check(ignored):
+            self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
+            # at this point, the crawler should be sitting in the inter-cycle
+            # timer, which should be pegged at the minumum cycle time
+            self.failUnless(c.timer)
+            self.failUnless(c.sleeping_between_cycles)
+            self.failUnlessEqual(c.current_sleep_time, c.minimum_cycle_time)
+        d.addCallback(_check)
+        return d
+
+    def test_cpu_usage(self):
+        self.basedir = "crawler/Basic/cpu_usage"
+        fileutil.make_dirs(self.basedir)
+        serverid = "\x00" * 20
+        ss = StorageServer(self.basedir, serverid)
+        ss.setServiceParent(self.s)
+
+        sis = [self.write(i, ss, serverid) for i in range(10)]
+
+        statefile = os.path.join(self.basedir, "statefile")
+        c = ConsumingCrawler(ss, statefile)
+        c.setServiceParent(self.s)
+
+        # this will run as fast as it can, consuming about 50ms per call to
+        # process_bucket(), limited by the Crawler to about 50% cpu. We let
+        # it run for a few seconds, then compare how much time
+        # process_bucket() got vs wallclock time. It should get between 10%
+        # and 70% CPU. This is dicey, there's about 100ms of overhead per
+        # 300ms slice (saving the state file takes about 150-200us, but we do
+        # it 1024 times per cycle, one for each [empty] prefixdir), leaving
+        # 200ms for actual processing, which is enough to get through 4
+        # buckets each slice, then the crawler sleeps for 300ms/0.5 = 600ms,
+        # giving us 900ms wallclock per slice. In 4.0 seconds we can do 4.4
+        # slices, giving us about 17 shares, so we merely assert that we've
+        # finished at least one cycle in that time.
+
+        # with a short cpu_slice (so we can keep this test down to 4
+        # seconds), the overhead is enough to make a nominal 50% usage more
+        # like 30%. Forcing sleep_time to 0 only gets us 67% usage.
+
+        # who knows what will happen on our slower buildslaves. I'll ditch
+        # the cycles>1 test first.
+
+        start = time.time()
+        d = self.stall(delay=4.0)
+        def _done(res):
+            elapsed = time.time() - start
+            percent = 100.0 * c.accumulated / elapsed
+            self.failUnless(20 < percent < 70, "crawler got %d%%" % percent)
+            self.failUnless(c.cycles >= 1, c.cycles)
+        d.addCallback(_done)
+        return d
+
+    def test_empty_subclass(self):
+        self.basedir = "crawler/Basic/empty_subclass"
+        fileutil.make_dirs(self.basedir)
+        serverid = "\x00" * 20
+        ss = StorageServer(self.basedir, serverid)
+        ss.setServiceParent(self.s)
+
+        sis = [self.write(i, ss, serverid) for i in range(10)]
+
+        statefile = os.path.join(self.basedir, "statefile")
+        c = ShareCrawler(ss, statefile)
+        c.setServiceParent(self.s)
+
+        # we just let it run for a while, to get figleaf coverage of the
+        # empty methods in the base class
+
+        def _check():
+            return c.first_cycle_finished
+        d = self.poll(_check)
+        return d