From 73e05bf967e56de40d4f56e7f0d0583b7929d39c Mon Sep 17 00:00:00 2001 From: Brian Warner <warner@lothar.com> Date: Fri, 20 Feb 2009 18:27:43 -0700 Subject: [PATCH] crawler: add get_progress, clean up get_state --- src/allmydata/storage/crawler.py | 41 +++++++++++++++++++++++----- src/allmydata/test/test_crawler.py | 43 ++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/allmydata/storage/crawler.py b/src/allmydata/storage/crawler.py index 4ead36ce..abdb0db4 100644 --- a/src/allmydata/storage/crawler.py +++ b/src/allmydata/storage/crawler.py @@ -75,12 +75,43 @@ class ShareCrawler(service.MultiService): self.current_sleep_time = None self.next_wake_time = None + def get_progress(self): + """I return information about how much progress the crawler is + making. My return value is a dictionary. The primary key is + 'cycle-in-progress': True if the crawler is currently traversing the + shares, False if it is idle between cycles. + + If cycle-in-progress is True, the following keys will be present:: + + cycle-complete-percentage': float, from 0.0 to 100.0, indicating how + far the crawler has progressed through + the current cycle + remaining-sleep-time: float, seconds from now when we do more work + + + If cycle-in-progress is False, the following keys are available:: + + next-crawl-time: float, seconds-since-epoch when next crawl starts + + remaining-wait-time: float, seconds from now when next crawl starts + """ + + d = {} + if self.state["current-cycle"] is None: + assert self.sleeping_between_cycles + d["cycle-in-progress"] = False + d["next-crawl-time"] = self.next_wake_time + d["remaining-wait-time"] = self.next_wake_time - time.time() + else: + d["cycle-in-progress"] = True + pct = 100.0 * self.last_complete_prefix_index / len(self.prefixes) + d["cycle-complete-percentage"] = pct + d["remaining-sleep-time"] = self.next_wake_time - time.time() + return d + def get_state(self): """I return the current state of the crawler. This is a copy of my - state dictionary, plus the following keys:: - - current-sleep-time: float, duration of our current sleep - next-wake-time: float, seconds-since-epoch of when we will next wake + state dictionary. If we are not currently sleeping (i.e. get_state() was called from inside the process_prefixdir, process_bucket, or finished_cycle() @@ -88,8 +119,6 @@ class ShareCrawler(service.MultiService): these two keys will be None. """ state = self.state.copy() # it isn't a deepcopy, so don't go crazy - state["current-sleep-time"] = self.current_sleep_time - state["next-wake-time"] = self.next_wake_time return state def load_state(self): diff --git a/src/allmydata/test/test_crawler.py b/src/allmydata/test/test_crawler.py index 49b60f7d..113a94e3 100644 --- a/src/allmydata/test/test_crawler.py +++ b/src/allmydata/test/test_crawler.py @@ -31,6 +31,7 @@ class PacedCrawler(ShareCrawler): self.countdown = 6 self.all_buckets = [] self.finished_d = defer.Deferred() + self.yield_cb = None def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32): self.all_buckets.append(storage_index_b32) self.countdown -= 1 @@ -39,6 +40,8 @@ class PacedCrawler(ShareCrawler): self.cpu_slice = -1.0 def yielding(self, sleep_time): self.cpu_slice = 500 + if self.yield_cb: + self.yield_cb() def finished_cycle(self, cycle): eventual.eventually(self.finished_d.callback, None) @@ -173,6 +176,7 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin): # that should stop in the middle of one of the buckets. c.cpu_slice = PacedCrawler.cpu_slice self.failUnlessEqual(len(c.all_buckets), 6) + c.start_current_prefix(time.time()) # finish it self.failUnlessEqual(len(sis), len(c.all_buckets)) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) @@ -252,18 +256,53 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin): statefile = os.path.join(self.basedir, "statefile") c = PacedCrawler(ss, statefile) + + did_check_progress = [False] + def check_progress(): + c.yield_cb = None + try: + p = c.get_progress() + self.failUnlessEqual(p["cycle-in-progress"], True) + pct = p["cycle-complete-percentage"] + # after 6 buckets, we happen to be at 76.17% complete. As + # long as we create shares in deterministic order, this will + # continue to be true. + self.failUnlessEqual(int(pct), 76) + left = p["remaining-sleep-time"] + self.failUnless(isinstance(left, float), left) + self.failUnless(left > 0.0, left) + except Exception, e: + did_check_progress[0] = e + else: + did_check_progress[0] = True + c.yield_cb = check_progress + c.setServiceParent(self.s) - # that should get through 6 buckets, pause for a little while, then - # resume + # that should get through 6 buckets, pause for a little while (and + # run check_progress()), then resume d = c.finished_d def _check(ignored): + if did_check_progress[0] is not True: + raise did_check_progress[0] + self.failUnless(did_check_progress[0]) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) # at this point, the crawler should be sitting in the inter-cycle # timer, which should be pegged at the minumum cycle time self.failUnless(c.timer) self.failUnless(c.sleeping_between_cycles) self.failUnlessEqual(c.current_sleep_time, c.minimum_cycle_time) + + p = c.get_progress() + self.failUnlessEqual(p["cycle-in-progress"], False) + naptime = p["remaining-wait-time"] + self.failUnless(isinstance(naptime, float), naptime) + # min-cycle-time is 300, so this is basically testing that it took + # less than 290s to crawl + self.failUnless(naptime > 10.0, naptime) + soon = p["next-crawl-time"] - time.time() + self.failUnless(soon > 10.0, soon) + d.addCallback(_check) return d -- 2.45.2