From 73e05bf967e56de40d4f56e7f0d0583b7929d39c Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Fri, 20 Feb 2009 18:27:43 -0700
Subject: [PATCH] crawler: add get_progress, clean up get_state

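get_state() now returns a plain copy of the crawler's persistent state
dictionary: the "current-sleep-time" and "next-wake-time" keys are gone.
Timing and progress information is available from the new get_progress()
method instead, which reports either how far the crawler has progressed
through the current cycle, or how long it will wait before starting the
next one.

A status display might poll it like this (an illustrative sketch, not
code from this patch; 'crawler' stands for any started ShareCrawler):

    p = crawler.get_progress()
    if p["cycle-in-progress"]:
        print "crawl %.1f%% done, working again in %.1fs" % (
            p["cycle-complete-percentage"], p["remaining-sleep-time"])
    else:
        print "idle, next crawl in %.1fs" % p["remaining-wait-time"]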
---
 src/allmydata/storage/crawler.py   | 41 +++++++++++++++++++++++-----
 src/allmydata/test/test_crawler.py | 43 ++++++++++++++++++++++++++++--
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/src/allmydata/storage/crawler.py b/src/allmydata/storage/crawler.py
index 4ead36ce..abdb0db4 100644
--- a/src/allmydata/storage/crawler.py
+++ b/src/allmydata/storage/crawler.py
@@ -75,12 +75,43 @@ class ShareCrawler(service.MultiService):
         self.current_sleep_time = None
         self.next_wake_time = None
 
+    def get_progress(self):
+        """I return information about how much progress the crawler is
+        making. My return value is a dictionary. The primary key is
+        'cycle-in-progress': True if the crawler is currently traversing the
+        shares, False if it is idle between cycles.
+
+        If cycle-in-progress is True, the following keys will be present::
+
+         cycle-complete-percentage: float, from 0.0 to 100.0, indicating how
+                                    far the crawler has progressed through
+                                    the current cycle
+         remaining-sleep-time: float, seconds from now until we resume work
+
+        If cycle-in-progress is False, the following keys are available::
+
+         next-crawl-time: float, seconds-since-epoch when the next crawl
+                          will start
+         remaining-wait-time: float, seconds from now until the next crawl
+                              starts
+        """
+
+        d = {}
+        if self.state["current-cycle"] is None:
+            assert self.sleeping_between_cycles
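+            # idle between cycles: next_wake_time is the scheduled start of
+            # the next cycle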
+            d["cycle-in-progress"] = False
+            d["next-crawl-time"] = self.next_wake_time
+            d["remaining-wait-time"] = self.next_wake_time - time.time()
+        else:
+            d["cycle-in-progress"] = True
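+            # progress within a cycle is measured in completed prefixdirs,
+            # as a fraction of the whole set of storage-index prefixes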
+            pct = 100.0 * self.last_complete_prefix_index / len(self.prefixes)
+            d["cycle-complete-percentage"] = pct
+            d["remaining-sleep-time"] = self.next_wake_time - time.time()
+        return d
+
     def get_state(self):
         """I return the current state of the crawler. This is a copy of my
-        state dictionary, plus the following keys::
-
-         current-sleep-time: float, duration of our current sleep
-         next-wake-time: float, seconds-since-epoch of when we will next wake
+        state dictionary.
 
-        If we are not currently sleeping (i.e. get_state() was called from
-        inside the process_prefixdir, process_bucket, or finished_cycle()
@@ -88,8 +119,6 @@ class ShareCrawler(service.MultiService):
-        methods, or if startService has not yet been called on this crawler),
-        these two keys will be None.
         """
         state = self.state.copy() # it isn't a deepcopy, so don't go crazy
-        state["current-sleep-time"] = self.current_sleep_time
-        state["next-wake-time"] = self.next_wake_time
         return state
 
     def load_state(self):
diff --git a/src/allmydata/test/test_crawler.py b/src/allmydata/test/test_crawler.py
index 49b60f7d..113a94e3 100644
--- a/src/allmydata/test/test_crawler.py
+++ b/src/allmydata/test/test_crawler.py
@@ -31,6 +31,7 @@ class PacedCrawler(ShareCrawler):
         self.countdown = 6
         self.all_buckets = []
         self.finished_d = defer.Deferred()
+        self.yield_cb = None
     def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
         self.all_buckets.append(storage_index_b32)
         self.countdown -= 1
@@ -39,6 +40,8 @@ class PacedCrawler(ShareCrawler):
             self.cpu_slice = -1.0
     def yielding(self, sleep_time):
         self.cpu_slice = 500
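+        # yield_cb gives a test a hook that fires each time the crawler
+        # yields the reactor (used by check_progress() below)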
+        if self.yield_cb:
+            self.yield_cb()
     def finished_cycle(self, cycle):
         eventual.eventually(self.finished_d.callback, None)
 
@@ -173,6 +176,7 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
         # that should stop in the middle of one of the buckets.
         c.cpu_slice = PacedCrawler.cpu_slice
         self.failUnlessEqual(len(c.all_buckets), 6)
+
         c.start_current_prefix(time.time()) # finish it
         self.failUnlessEqual(len(sis), len(c.all_buckets))
         self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
@@ -252,18 +256,53 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
 
         statefile = os.path.join(self.basedir, "statefile")
         c = PacedCrawler(ss, statefile)
+
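+        # check_progress() runs from inside the crawler's yielding() hook;
+        # it records True (or the exception it hit) in this mutable cell so
+        # the Deferred callback _check() below can report the result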
+        did_check_progress = [False]
+        def check_progress():
+            c.yield_cb = None
+            try:
+                p = c.get_progress()
+                self.failUnlessEqual(p["cycle-in-progress"], True)
+                pct = p["cycle-complete-percentage"]
+                # after 6 buckets, we happen to be at 76.17% complete. As
+                # long as we create shares in deterministic order, this will
+                # continue to be true.
+                self.failUnlessEqual(int(pct), 76)
+                left = p["remaining-sleep-time"]
+                self.failUnless(isinstance(left, float), left)
+                self.failUnless(left > 0.0, left)
+            except Exception, e:
+                did_check_progress[0] = e
+            else:
+                did_check_progress[0] = True
+        c.yield_cb = check_progress
+
         c.setServiceParent(self.s)
-        # that should get through 6 buckets, pause for a little while, then
-        # resume
+        # that should get through 6 buckets, pause for a little while (and
+        # run check_progress()), then resume
 
         d = c.finished_d
         def _check(ignored):
+            self.failUnless(did_check_progress[0],
+                            "check_progress() was never run")
+            if did_check_progress[0] is not True:
+                raise did_check_progress[0]
             self.failUnlessEqual(sorted(sis), sorted(c.all_buckets))
             # at this point, the crawler should be sitting in the inter-cycle
             # timer, which should be pegged at the minimum cycle time
             self.failUnless(c.timer)
             self.failUnless(c.sleeping_between_cycles)
             self.failUnlessEqual(c.current_sleep_time, c.minimum_cycle_time)
+
+            p = c.get_progress()
+            self.failUnlessEqual(p["cycle-in-progress"], False)
+            naptime = p["remaining-wait-time"]
+            self.failUnless(isinstance(naptime, float), naptime)
+            # min-cycle-time is 300, so this is basically testing that it took
+            # less than 290s to crawl
+            self.failUnless(naptime > 10.0, naptime)
+            soon = p["next-crawl-time"] - time.time()
+            self.failUnless(soon > 10.0, soon)
+
         d.addCallback(_check)
         return d
 
-- 
2.45.2