Unicode fixes.
[tahoe-lafs/tahoe-lafs.git] / src/allmydata/scripts/consolidate.py
index 5088b66665eea58629c385e61671fdbae97cfef4..da0252fa4c6d5240c2dd62ae2e7d019079b1aeeb 100644
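The substance of this commit is replacing bare str() casts and raw %s interpolation with the Unicode-aware helpers imported from allmydata.util.stringutils. A minimal usage sketch, with illustrative values and behavior assumed from how the helpers are called in this patch:

    from allmydata.util.stringutils import to_str, quote_output, quote_path

    childname = u"r\u00e9sum\u00e9s"            # unicode name from a t=json listing
    cap = to_str(u"URI:DIR2:...")               # caps are ASCII; to_str yields a plain str
    print quote_output(childname)               # quoted so it is safe for stdout/stderr
    print quote_path((u"Backups", childname))   # path components joined and quoted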
@@ -4,12 +4,61 @@ import sqlite3 as sqlite
 
 import urllib
 import simplejson
-from allmydata.scripts.common_http import do_http
-from allmydata.scripts.tahoe_backup import parse_old_timestamp, readonly, \
-     raiseHTTPError, HTTPError
-from allmydata.util import hashutil, base32
+from allmydata.scripts.common_http import do_http, HTTPError
+from allmydata.util import hashutil, base32, time_format
+from allmydata.util.stringutils import to_str, quote_output, quote_path
 from allmydata.util.netstring import netstring
 from allmydata.scripts.common import get_alias, DEFAULT_ALIAS
+from allmydata import uri
+
+
+def readonly(writedircap):
+    return uri.from_string_dirnode(writedircap).get_readonly().to_string()
+
+def parse_old_timestamp(s, options):
+    try:
+        if not s.endswith("Z"):
+            raise ValueError
+        # This returns seconds-since-epoch for an ISO-8601-ish-formatted UTC
+        # time string. This might raise ValueError if the string is not in the
+        # right format.
+        when = time_format.iso_utc_time_to_seconds(s[:-1])
+        return when
+    except ValueError:
+        pass
+
+    try:
+        # "2008-11-16 10.34 PM" (localtime)
+        if s[-3:] in (" AM", " PM"):
+            # this might raise ValueError
+            when = time.mktime(time.strptime(s[:-3], "%Y-%m-%d %I.%M"))
+            if s[-3:] == " PM":
+                when += 12*60*60
+            return when
+    except ValueError:
+        pass
+
+    try:
+        # "2008-11-16 10.34.56 PM" (localtime)
+        if s[-3:] in (" AM", " PM"):
+            # this might raise ValueError
+            when = time.mktime(time.strptime(s[:-3], "%Y-%m-%d %I.%M.%S"))
+            if s[-3:] == " PM":
+                when += 12*60*60
+            return when
+    except ValueError:
+        pass
+
+    try:
+        # "2008-12-31 18.21.43"
+        when = time.mktime(time.strptime(s, "%Y-%m-%d %H.%M.%S"))
+        return when
+    except ValueError:
+        pass
+
+    print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
+    return None
+
 
 TAG = "consolidator_dirhash_v1"
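The parse_old_timestamp() helper added above accepts the historical archive-name formats that older backups used, returning seconds-since-epoch or None. A short sketch of the inputs it handles (formats taken from the branches and comments above; the exact separator accepted by iso_utc_time_to_seconds is assumed, and options only needs a .stderr attribute here):

    parse_old_timestamp("2009-03-17_01:02:03Z", options)    # ISO-8601-ish UTC
    parse_old_timestamp("2008-11-16 10.34 PM", options)     # localtime, 12-hour
    parse_old_timestamp("2008-11-16 10.34.56 PM", options)  # localtime, with seconds
    parse_old_timestamp("2008-12-31 18.21.43", options)     # localtime, 24-hour
    parse_old_timestamp("not-a-timestamp", options)         # warns on stderr, returns None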
 
@@ -23,6 +72,7 @@ class Consolidator:
         self.rootcap, path = get_alias(options.aliases, options.where,
                                        DEFAULT_ALIAS)
         assert path == ""
+        # TODO: allow dbfile and backupfile to be Unicode
         self.dbfile = options["dbfile"]
         assert self.dbfile, "--dbfile is required"
         self.backupfile = options["backupfile"]
@@ -47,7 +97,7 @@ class Consolidator:
         url = self.nodeurl + "uri/%s?t=json" % urllib.quote(dircap)
         resp = do_http("GET", url)
         if resp.status != 200:
-            raiseHTTPError("Error during directory GET", resp)
+            raise HTTPError("Error during directory GET", resp)
         jd = simplejson.load(resp)
         ntype, ndata = jd
         if ntype != "dirnode":
@@ -72,18 +122,25 @@ class Consolidator:
         for (childname, (childtype, childdata)) in kids.items():
             if childtype != "dirnode":
                 continue
-            potential_systems[childname] = str(childdata["rw_uri"])
+            if "rw_uri" not in childdata:
+                self.msg("%s: not writeable" % quote_output(childname))
+                continue
+            potential_systems[childname] = to_str(childdata["rw_uri"])
         backup_data = {"Backups": data, "systems": {}, "archives": {}}
         systems = {}
         for name, sdircap in potential_systems.items():
             sdata = self.read_directory_json(sdircap)
             kids = sdata["children"]
             if not u"Archives" in kids and not u"Latest Backup" in kids:
-                self.msg("%s: not a backupdir, no 'Archives' and 'Latest'" % name)
+                self.msg("%s: not a backupdir, no 'Archives' and 'Latest'" % quote_output(name))
+                continue
+            archives_capdata = kids[u"Archives"][1]
+            if "rw_uri" not in archives_capdata:
+                self.msg("%s: /Archives is not writeable" % quote_output(name))
                 continue
-            self.msg("%s is a system" % name)
+            self.msg("%s is a system" % quote_output(name))
             backup_data["systems"][name] = sdata
-            archives_dircap = kids[u"Archives"][1]["rw_uri"]
+            archives_dircap = to_str(archives_capdata["rw_uri"])
             archives_data = self.read_directory_json(archives_dircap)
             backup_data["archives"][name] = archives_data
             systems[name] = archives_dircap
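read_directory_json() unpacks the webapi's ?t=json response into (ntype, ndata) and the code above then walks ndata["children"]. The rough shape it expects looks like this (caps abbreviated; field names inferred from the accesses in this patch):

    jd = ["dirnode",
          {"rw_uri": "URI:DIR2:...",
           "ro_uri": "URI:DIR2-RO:...",
           "children": {
             u"Archives":      ["dirnode", {"rw_uri": "URI:DIR2:...",
                                            "ro_uri": "URI:DIR2-RO:..."}],
             u"Latest Backup": ["dirnode", {"ro_uri": "URI:DIR2-RO:..."}],
           }}]

A child that is read-only to us carries only "ro_uri", which is what the "rw_uri" membership checks above detect.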
@@ -136,7 +193,7 @@ class Consolidator:
         #  [$NAME, writecap, $NAME-readonly, readcap] : processed, not replaced
         #  [None, None, $NAME, readcap] : processed and replaced
 
-        self.msg("consolidating system %s" % system_name)
+        self.msg("consolidating system %s" % quote_output(system_name))
         self.directories_reused = 0
         self.directories_used_as_is = 0
         self.directories_created = 0
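The per-timestamp bookkeeping described in the comments above uses a four-slot list per snapshot. Illustratively (names and caps made up), the three states look like:

    snapshots = {
      1229033100.0: ["2008-12-11 10.05 PM", "URI:DIR2:...",      # unprocessed
                     None, None],
      1229119500.0: ["2008-12-12 10.05 PM", "URI:DIR2:...",      # processed, not yet replaced
                     "2008-12-12 10.05 PM-readonly", "URI:DIR2-RO:..."],
      1229205900.0: [None, None,                                 # processed and replaced
                     "2008-12-13 10.05 PM", "URI:DIR2-RO:..."],
    }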
@@ -149,11 +206,11 @@ class Consolidator:
         children = sorted(data["children"].items())
         for i, (childname, (childtype, childdata)) in enumerate(children):
             if childtype != "dirnode":
-                self.msg("non-dirnode %s in Archives/" % childname)
+                self.msg("non-dirnode %s in Archives/" % quote_output(childname))
                 continue
-            timename = childname
-            if childname.endswith("-readonly"):
-                timename = childname[:-len("-readonly")]
+            timename = to_str(childname)
+            if timename.endswith("-readonly"):
+                timename = timename[:-len("-readonly")]
             timestamp = parse_old_timestamp(timename, self.options)
             assert timestamp is not None, timename
             snapshots.setdefault(timestamp, [None, None, None, None])
@@ -161,15 +218,15 @@ class Consolidator:
             # need to re-scan it
             is_readonly = not childdata.has_key("rw_uri")
             if is_readonly:
-                readcap = str(childdata["ro_uri"])
+                readcap = to_str(childdata["ro_uri"])
                 if self.must_rescan_readonly_snapshots:
                     self.msg(" scanning old %s (%d/%d)" %
-                             (childname, i+1, len(children)))
-                    self.scan_old_directory(str(childdata["ro_uri"]))
+                             (quote_output(childname), i+1, len(children)))
+                    self.scan_old_directory(to_str(childdata["ro_uri"]))
                 snapshots[timestamp][2] = childname
                 snapshots[timestamp][3] = readcap
             else:
-                writecap = str(childdata["rw_uri"])
+                writecap = to_str(childdata["rw_uri"])
                 snapshots[timestamp][0] = childname
                 snapshots[timestamp][1] = writecap
         snapshots = [ [timestamp] + values
@@ -197,7 +254,7 @@ class Consolidator:
                 assert roname
                 assert not rwname
                 first_snapshot = False
-                self.msg(" %s already readonly" % roname)
+                self.msg(" %s already readonly" % quote_output(roname))
                 continue
             if readcap and writecap:
                 # we've processed it, creating a -readonly version, but we
@@ -205,9 +262,9 @@ class Consolidator:
                 assert roname
                 assert rwname
                 first_snapshot = False
-                self.msg(" %s processed but not yet replaced" % roname)
+                self.msg(" %s processed but not yet replaced" % quote_output(roname))
                 if self.options["really"]:
-                    self.msg("  replacing %s with %s" % (rwname, roname))
+                    self.msg("  replacing %s with %s" % (quote_output(rwname), quote_output(roname)))
                     self.put_child(archives_dircap, rwname, readcap)
                     self.delete_child(archives_dircap, roname)
                 continue
@@ -221,29 +278,29 @@ class Consolidator:
                 first_snapshot = False
                 readcap = readonly(writecap)
                 self.directories_used_as_is += 1
-                self.msg(" %s: oldest snapshot, using as-is" % rwname)
+                self.msg(" %s: oldest snapshot, using as-is" % quote_output(rwname))
                 self.scan_old_directory(readcap)
             else:
                 # for the others, we must scan their contents and build up a new
                 # readonly directory (which shares common subdirs with previous
                 # backups)
-                self.msg(" %s: processing (%d/%d)" % (rwname, i+1, len(snapshots)))
+                self.msg(" %s: processing (%d/%d)" % (quote_output(rwname), i+1, len(snapshots)))
                 started = time.time()
                 readcap = self.process_directory(readonly(writecap), (rwname,))
                 elapsed = time.time() - started
                 eta = "%ds" % (elapsed * (len(snapshots) - i-1))
             if self.options["really"]:
-                self.msg("  replaced %s" % rwname)
+                self.msg("  replaced %s" % quote_output(rwname))
                 self.put_child(archives_dircap, rwname, readcap)
             else:
-                self.msg("  created %s" % roname)
+                self.msg("  created %s" % quote_output(roname))
                 self.put_child(archives_dircap, roname, readcap)
 
             snapshot_created = self.directories_created - start_created
             snapshot_used_as_is = self.directories_used_as_is - start_used_as_is
             snapshot_reused = self.directories_reused - start_reused
             self.msg("  %s: done: %d dirs created, %d used as-is, %d reused, eta %s"
-                     % (rwname,
+                     % (quote_output(rwname),
                         snapshot_created, snapshot_used_as_is, snapshot_reused,
                         eta))
         # done!
@@ -259,7 +316,7 @@ class Consolidator:
         # for my contents. In all cases I return a directory readcap that
         # points to my contents.
 
-        assert isinstance(readcap, str)
+        readcap = to_str(readcap)
         self.directories_seen.add(readcap)
 
         # build up contents to pass to mkdir() (which uses t=set_children)
@@ -271,13 +328,13 @@ class Consolidator:
         for (childname, (childtype, childdata)) in sorted(data["children"].items()):
             if childtype == "dirnode":
                 childpath = path + (childname,)
-                old_childcap = str(childdata["ro_uri"])
+                old_childcap = to_str(childdata["ro_uri"])
                 childcap = self.process_directory(old_childcap, childpath)
                 if childcap != old_childcap:
                     children_modified = True
                 contents[childname] = ("dirnode", childcap, None)
             else:
-                childcap = str(childdata["ro_uri"])
+                childcap = to_str(childdata["ro_uri"])
                 contents[childname] = (childtype, childcap, None)
             hashkids.append( (childname, childcap) )
 
@@ -285,7 +342,7 @@ class Consolidator:
         old_dircap = self.get_old_dirhash(dirhash)
         if old_dircap:
             if self.options["verbose"]:
-                self.msg("   %r: reused" % "/".join(path))
+                self.msg("   %s: reused" % quote_path(path))
             assert isinstance(old_dircap, str)
             self.directories_reused += 1
             self.directories_used.add(old_dircap)
@@ -293,7 +350,7 @@ class Consolidator:
         if not children_modified:
             # we're allowed to use this directory as-is
             if self.options["verbose"]:
-                self.msg("   %r: used as-is" % "/".join(path))
+                self.msg("   %s: used as-is" % quote_path(path))
             new_dircap = readonly(readcap)
             assert isinstance(new_dircap, str)
             self.store_dirhash(dirhash, new_dircap)
@@ -302,7 +359,7 @@ class Consolidator:
             return new_dircap
         # otherwise, we need to create a new directory
         if self.options["verbose"]:
-            self.msg("   %r: created" % "/".join(path))
+            self.msg("   %s: created" % quote_path(path))
         new_dircap = readonly(self.mkdir(contents))
         assert isinstance(new_dircap, str)
         self.store_dirhash(dirhash, new_dircap)
@@ -315,21 +372,21 @@ class Consolidator:
                                                   urllib.quote(childname))
         resp = do_http("PUT", url, childcap)
         if resp.status not in (200, 201):
-            raiseHTTPError("error during put_child", resp)
+            raise HTTPError("Error during put_child", resp)
 
     def delete_child(self, dircap, childname):
         url = self.nodeurl + "uri/%s/%s" % (urllib.quote(dircap),
                                             urllib.quote(childname))
         resp = do_http("DELETE", url)
         if resp.status not in (200, 201):
-            raiseHTTPError("error during delete_child", resp)
+            raise HTTPError("Error during delete_child", resp)
 
     def mkdir(self, contents):
         url = self.nodeurl + "uri?t=mkdir"
         resp = do_http("POST", url)
         if resp.status < 200 or resp.status >= 300:
-            raiseHTTPError("error during mkdir", resp)
-        dircap = str(resp.read().strip())
+            raise HTTPError("Error during mkdir", resp)
+        dircap = to_str(resp.read().strip())
         url = self.nodeurl + "uri/%s?t=set_children" % urllib.quote(dircap)
         body = dict([ (childname, (contents[childname][0],
                                    {"ro_uri": contents[childname][1],
@@ -339,7 +396,7 @@ class Consolidator:
                       ])
         resp = do_http("POST", url, simplejson.dumps(body))
         if resp.status != 200:
-            raiseHTTPError("error during set_children", resp)
+            raise HTTPError("Error during set_children", resp)
         return dircap
 
     def scan_old_directory(self, dircap, ancestors=()):
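mkdir() above first POSTs t=mkdir to obtain a fresh dircap, then POSTs a t=set_children body in which each entry mirrors the (type, cap, metadata) triples stored in contents. Roughly (shape inferred from the construction above; caps abbreviated):

    body = {
      u"notes.txt": ["filenode", {"ro_uri": "URI:CHK:..."}],
      u"photos":    ["dirnode",  {"ro_uri": "URI:DIR2-RO:..."}],
    }
    resp = do_http("POST",
                   self.nodeurl + "uri/%s?t=set_children" % urllib.quote(dircap),
                   simplejson.dumps(body))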
@@ -358,7 +415,7 @@ class Consolidator:
         data = self.read_directory_json(dircap)
         kids = []
         for (childname, (childtype, childdata)) in data["children"].items():
-            childcap = str(childdata["ro_uri"])
+            childcap = to_str(childdata["ro_uri"])
             if childtype == "dirnode":
                 self.scan_old_directory(childcap, ancestors)
             kids.append( (childname, childcap) )
@@ -368,7 +425,7 @@ class Consolidator:
 
     def hash_directory_contents(self, kids):
         kids.sort()
-        s = "".join([netstring(childname.encode("utf-8"))+netstring(childcap)
+        s = "".join([netstring(to_str(childname))+netstring(childcap)
                      for (childname, childcap) in kids])
         return hashutil.tagged_hash(TAG, s)
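hash_directory_contents() gives every directory a content-derived key, which is what lets process_directory() reuse identical subtrees across snapshots. A sketch of the round trip, using the method names as they appear above (caps abbreviated; new_dircap stands for the readcap of a freshly created directory):

    kids = [(u"notes.txt", "URI:CHK:..."), (u"photos", "URI:DIR2-RO:...")]
    dirhash = self.hash_directory_contents(kids)
    old_dircap = self.get_old_dirhash(dirhash)    # previously recorded readcap, if any
    if not old_dircap:
        self.store_dirhash(dirhash, new_dircap)   # remember it for later snapshots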