From: Brian Warner <>
Date: Tue, 3 Feb 2009 04:09:02 +0000 (-0700)
Subject: #598: first cut of 'tahoe backup' command: no backupdb, but yes shared-unchanged... 
X-Git-Tag: allmydata-tahoe-1.3.0~103

#598: first cut of 'tahoe backup' command: no backupdb, but yes shared-unchanged-directories and Archives/TIMESTAMP and Latest/

diff --git a/src/allmydata/scripts/ b/src/allmydata/scripts/
index 06d2d2a6..aad9978b 100644
--- a/src/allmydata/scripts/
+++ b/src/allmydata/scripts/
@@ -190,6 +190,20 @@ class LnOptions(VDriveOptions):
     def getSynopsis(self):
         return "%s ln FROM TO" % (os.path.basename(sys.argv[0]),)
+class BackupOptions(VDriveOptions):
+    # Command-line options for 'tahoe backup FROM ALIAS:TO'. After parsing,
+    # self.from_dir holds the first positional argument (the local source
+    # directory) and self.to_dir holds the second (the tahoe-side target).
+    optFlags = [
+        ("verbose", "v", "Be noisy about what is happening."),
+        ]
+    def parseArgs(self, localdir, topath):
+        # both positional arguments are mandatory: neither has a default
+        self.from_dir = localdir
+        self.to_dir = topath
+    def getSynopsis(self):
+        # one-line usage string, prefixed with the name this program was
+        # invoked as
+        return "%s backup FROM ALIAS:TO" % os.path.basename(sys.argv[0])
+    longdesc = """Add a versioned backup of the local FROM directory to a timestamped subdir of the (tahoe) TO/Archives directory, sharing as many files and directories as possible with the previous backup. Creates TO/Latest as a reference to the latest backup. Behaves somewhat like 'rsync -a --link-dest=TO/Archives/(previous) FROM TO/Archives/(new); ln -sf TO/Archives/(new) TO/Latest'."""
 class WebopenOptions(VDriveOptions):
     def parseArgs(self, where=''):
         self.where = where
@@ -266,6 +280,7 @@ subCommands = [
     ["rm", None, RmOptions, "Unlink a file or directory in the virtual drive."],
     ["mv", None, MvOptions, "Move a file within the virtual drive."],
     ["ln", None, LnOptions, "Make an additional link to an existing file."],
+    ["backup", None, BackupOptions, "Make target dir look like local dir."],
     ["webopen", None, WebopenOptions, "Open a webbrowser to the root_dir"],
     ["manifest", None, ManifestOptions, "List all files/dirs in a subtree"],
     ["stats", None, StatsOptions, "Print statistics about all files/dirs in a subtree"],
@@ -337,6 +352,11 @@ def ln(options):
     rc =, mode="link")
     return rc
+def backup(options):
+    # Dispatch target for the "backup" subcommand: hand the parsed options
+    # to tahoe_backup.backup() and pass its return code back unchanged. The
+    # implementation module is only imported when the command is run.
+    from allmydata.scripts import tahoe_backup
+    return tahoe_backup.backup(options)
 def webopen(options, opener=None):
     from allmydata.scripts import tahoe_webopen
     rc = tahoe_webopen.webopen(options, opener=opener)
@@ -374,6 +394,7 @@ dispatch = {
     "rm": rm,
     "mv": mv,
     "ln": ln,
+    "backup": backup,
     "webopen": webopen,
     "manifest": manifest,
     "stats": stats,
diff --git a/src/allmydata/scripts/ b/src/allmydata/scripts/
new file mode 100644
index 00000000..f68cc7fb
--- /dev/null
+++ b/src/allmydata/scripts/
@@ -0,0 +1,264 @@
+import os.path
+import time
+import urllib
+import simplejson
+from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS
+from allmydata.scripts.common_http import do_http
+from allmydata import uri
+from allmydata.util import time_format
+def raiseHTTPError(msg, resp):
+    # Always raises RuntimeError. The exception text is 'msg' followed by
+    # ": STATUS REASON BODY", taken from resp.status, resp.reason and
+    # resp.read(), so the failing response is visible in the traceback.
+    msg = msg + ": %s %s %s" % (resp.status, resp.reason, resp.read())
+    raise RuntimeError(msg)
+def readonly(writedircap):
+    # Turn a directory cap string into the string form of its read-only
+    # counterpart, via the allmydata.uri directory-URI objects.
+    dirnode_uri = uri.from_string_dirnode(writedircap)
+    readonly_uri = dirnode_uri.get_readonly()
+    return readonly_uri.to_string()
+def parse_old_timestamp(s, options):
+    # Convert the name of an existing archive directory into
+    # seconds-since-epoch, so that callers can compare archives by age.
+    # Two name formats are recognized: a UTC string ending in "Z", and a
+    # local-time string ending in " AM"/" PM". For any other name a warning
+    # is printed to options.stderr and None is returned.
+    try:
+        if not s.endswith("Z"):
+            raise ValueError
+        # the "local" in this "localseconds" is superfluous and
+        # misleading. This returns seconds-since-epoch for an
+        # ISO-8601-ish-formatted UTC time string. This might raise
+        # ValueError if the string is not in the right format.
+        when = time_format.iso_utc_time_to_localseconds(s[:-1])
+        return when
+    except ValueError:
+        pass
+    try:
+        # "2008-11-16 10.34 PM" (localtime)
+        suffix = s[-3:]
+        if suffix in (" AM", " PM"):
+            # this might raise ValueError
+            parsed = time.strptime(s[:-3], "%Y-%m-%d %H.%M")
+            # 12-hour clock: "12.xx AM" means hour 0, "12.xx PM" means
+            # hour 12, and every other PM hour is shifted by 12.
+            hour = parsed[3] % 12
+            if suffix == " PM":
+                hour += 12
+            fields = list(parsed)
+            fields[3] = hour
+            # strptime() yields a struct_time, not a number: convert it to
+            # seconds-since-epoch (mktime() reads the fields as local time)
+            # so the result is comparable with the UTC branch above.
+            return time.mktime(tuple(fields))
+    except (ValueError, OverflowError):
+        # OverflowError: mktime() could not represent the date
+        pass
+    print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
+    return None
+def readdir(dircap, options):
+    # Ask the node (options['node-url']) for the JSON description of
+    # 'dircap'. Returns a dict of childname -> (type, readcap, metadata),
+    # or None when the cap turns out not to be a directory. Any response
+    # other than 200 is turned into an exception by raiseHTTPError().
+    nodeurl = options['node-url']
+    json_url = nodeurl + "uri/" + urllib.quote(dircap) + "?t=json"
+    response = do_http("GET", json_url)
+    if response.status != 200:
+        raiseHTTPError("Error during directory GET", response)
+    nodetype, nodedata = simplejson.load(response)
+    if nodetype != "dirnode":
+        return None
+    # keep each child's type, its read-only cap (as a plain str) and its
+    # metadata
+    return dict([ (name, (kind, str(info["ro_uri"]), info["metadata"]))
+                  for (name, (kind, info)) in nodedata["children"].items()
+                  ])
+def get_local_metadata(path):
+    # Build the metadata dict recorded for one local file or directory:
+    # always "ctime" and "mtime", plus whichever of the optional stat
+    # fields below this platform's stat result actually has.
+    # posix stat(2) metadata, depends on the platform
+    os.stat_float_times(True)
+    st = os.stat(path)
+    metadata = {"ctime": st.st_ctime, "mtime": st.st_mtime}
+    optional_fields = ("st_mode", "st_ino", "st_dev", "st_uid", "st_gid",
+                       # MacOS-specific fields
+                       "st_rsize", "st_creator", "st_type")
+    for name in optional_fields:
+        try:
+            metadata[name] = getattr(st, name)
+        except AttributeError:
+            # this platform does not provide the field: leave it out
+            continue
+    # TODO: extended attributes, like on OS-X's HFS+
+    return metadata
+def mkdir(contents, options):
+    # Create a new directory with POST uri?t=mkdir, fill it from 'contents'
+    # (childname -> (type, readcap, metadata)) with a single t=set_children
+    # request, and return the new directory's cap as a str. A failed
+    # request is turned into an exception by raiseHTTPError().
+    url = options['node-url'] + "uri?t=mkdir"
+    resp = do_http("POST", url)
+    if resp.status < 200 or resp.status >= 300:
+        raiseHTTPError("error during mkdir", resp)
+    # NOTE(review): this expression was truncated in the patch text and has
+    # been restored; it takes the mkdir response body to be the new
+    # directory's cap -- confirm against the webapi.
+    dircap = str(resp.read().strip())
+    url = options['node-url'] + "uri/%s?t=set_children" % urllib.quote(dircap)
+    # set_children body: childname -> (type, {"ro_uri": cap, "metadata": md})
+    body = dict([ (childname, (contents[childname][0],
+                               {"ro_uri": contents[childname][1],
+                                "metadata": contents[childname][2],
+                                }))
+                  for childname in contents
+                  ])
+    resp = do_http("POST", url, simplejson.dumps(body))
+    if resp.status != 200:
+        raiseHTTPError("error during set_children", resp)
+    return dircap
+def put_child(dirurl, childname, childcap):
+    # Attach 'childcap' under the (URL-quoted) name 'childname' inside the
+    # directory at 'dirurl', by PUTting the cap to DIRURL+NAME?t=uri.
+    # 'dirurl' must end with a slash. Any response other than 200/201 is
+    # turned into an exception by raiseHTTPError().
+    assert dirurl[-1] == "/"
+    child_url = "".join([dirurl, urllib.quote(childname), "?t=uri"])
+    response = do_http("PUT", child_url, childcap)
+    if response.status == 200 or response.status == 201:
+        return
+    raiseHTTPError("error during put_child", response)
+def directory_is_changed(a, b):
+    # Compare two directory listings, each a mapping from childname to
+    # (type, cap, metadata). Returns True as soon as a difference is found:
+    # a different set of names, or a child whose type, cap, "ctime" or
+    # "mtime" differs. Other metadata keys are preserved, but changes to
+    # them won't trigger a new backup. Returns False otherwise.
+    if set(a.keys()) != set(b.keys()):
+        return True
+    for name in a:
+        type_a, cap_a, meta_a = a[name]
+        type_b, cap_b, meta_b = b[name]
+        if (type_a, cap_a) != (type_b, cap_b):
+            return True
+        if meta_a.get("ctime") != meta_b.get("ctime"):
+            return True
+        if meta_a.get("mtime") != meta_b.get("mtime"):
+            return True
+    return False
+def backup(options):
+    # Entry point for 'tahoe backup'. Copies the local tree options.from_dir
+    # into a new snapshot directory, links the snapshot's read-only cap as
+    # TO/Archives/<timestamp>, and points TO/Latest at the same cap (TO
+    # being the alias/path named by options.to_dir). Returns 0 on success,
+    # or 1 if the Archives/ directory could not be created. Failures while
+    # processing the tree or linking the snapshot surface as RuntimeError
+    # from the helper functions.
+    nodeurl = options['node-url']
+    stdout = options.stdout
+    stderr = options.stderr
+    rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
+    to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
+    if path:
+        to_url += escape_path(path)
+    if not to_url.endswith("/"):
+        to_url += "/"
+    archives_url = to_url + "Archives/"
+    # first step: make sure the target directory exists, as well as the
+    # Archives/ subdirectory.
+    resp = do_http("GET", archives_url + "?t=json")
+    if resp.status == 404:
+        resp = do_http("POST", archives_url + "?t=mkdir")
+        if resp.status != 200:
+            print >>stderr, "Unable to create target directory: %s %s %s" % \
+                  (resp.status, resp.reason, resp.read())
+            return 1
+        # freshly created, so there are no previous backups to look at
+        archives_dir = {}
+    else:
+        jdata = simplejson.load(resp)
+        (otype, attrs) = jdata
+        archives_dir = attrs["children"]
+    # second step: locate the most recent backup in TODIR/Archives/*
+    latest_backup_time = 0
+    latest_backup_dircap = None
+    # we have various time formats. The windows backup tool
+    # appears to create things like "2008-11-16 10.34 PM". This script
+    # names its archives time_format.iso_utc(now, sep="_") + "Z" (see the
+    # fourth step below).
+    for archive_name in archives_dir.keys():
+        # only directories can be previous backups
+        if archives_dir[archive_name][0] != "dirnode":
+            continue
+        when = parse_old_timestamp(archive_name, options)
+        if when is not None:
+            if when > latest_backup_time:
+                latest_backup_time = when
+                latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
+    # third step: process the tree
+    new_backup_dircap = Node().process(options.from_dir,
+                                       latest_backup_dircap,
+                                       options)
+    print >>stdout, "new backup done"
+    # fourth: attach the new backup to the list
+    new_readonly_backup_dircap = readonly(new_backup_dircap)
+    now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
+    put_child(archives_url, now, new_readonly_backup_dircap)
+    put_child(to_url, "Latest", new_readonly_backup_dircap)
+    print >>stdout, "backup done"
+    # done!
+    return 0
+class Node:
+    # Backs up one local directory. process() is the entry point: it stores
+    # 'options' on the instance, which verboseprint() and upload() then
+    # read, so those two must not be called before process(). Each
+    # subdirectory is handled by a fresh instance (see recurse()).
+    def verboseprint(self, msg):
+        # print 'msg' to options.stdout, but only when --verbose was given
+        if self.options["verbose"]:
+            print >>self.options.stdout, msg
+    def process(self, localpath, olddircap, options):
+        # Back up the local directory 'localpath'. 'olddircap' is the cap of
+        # the same directory in the previous backup, or None if there is no
+        # such directory. Returns the cap of a directory matching the local
+        # contents: 'olddircap' itself when nothing changed, otherwise the
+        # read-only cap of a newly created directory. Raises RuntimeError
+        # for a child that is neither a directory nor a regular file.
+        # returns newdircap
+        self.options = options
+        self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap))
+        olddircontents = {}
+        if olddircap:
+            # readdir() returns None if olddircap is not a directory
+            olddircontents = readdir(olddircap, options)
+        newdircontents = {} # childname -> (type, rocap, metadata)
+        for child in os.listdir(localpath):
+            childpath = os.path.join(localpath, child)
+            if os.path.isdir(childpath):
+                metadata = get_local_metadata(childpath)
+                oldchildcap = None
+                if olddircontents is not None and child in olddircontents:
+                    oldchildcap = olddircontents[child][1]
+                newchilddircap = self.recurse(childpath, oldchildcap)
+                newdircontents[child] = ("dirnode", newchilddircap, metadata)
+            elif os.path.isfile(childpath):
+                newfilecap, metadata = self.upload(childpath)
+                newdircontents[child] = ("filenode", newfilecap, metadata)
+            else:
+                raise RuntimeError("how do I back this up?")
+        if (olddircap
+            and olddircontents is not None
+            and not directory_is_changed(newdircontents, olddircontents)
+            ):
+            self.verboseprint(" %s not changed, re-using old directory" % localpath)
+            # yay! they're identical!
+            return olddircap
+        else:
+            self.verboseprint(" %s changed, making new directory" % localpath)
+            # something changed, or there was no previous directory, so we
+            # must make a new directory
+            newdircap = mkdir(newdircontents, options)
+            return readonly(newdircap)
+    def recurse(self, localpath, olddircap):
+        # process a subdirectory with a fresh instance of the same class
+        n = self.__class__()
+        return n.process(localpath, olddircap, self.options)
+    def upload(self, childpath):
+        # Upload the local file 'childpath' with a PUT to NODEURL/uri and
+        # return (filecap, metadata), where metadata comes from
+        # get_local_metadata(). A response other than 200/201 is turned
+        # into an exception by raiseHTTPError().
+        self.verboseprint("uploading %s.." % childpath)
+        # we can use the backupdb here
+        #s = os.stat(childpath)
+        # ...
+        # if we go with the old file, we're obligated to use the old
+        # metadata, to make sure it matches the metadata for this child in
+        # the old parent directory
+        #  return oldcap, old_metadata
+        metadata = get_local_metadata(childpath)
+        infileobj = open(os.path.expanduser(childpath), "rb")
+        try:
+            url = self.options['node-url'] + "uri"
+            resp = do_http("PUT", url, infileobj)
+            if resp.status not in (200, 201):
+                raiseHTTPError("Error during file PUT", resp)
+            # NOTE(review): this expression was truncated in the patch text
+            # and has been restored; it takes the PUT response body to be
+            # the new file's cap -- confirm against the webapi.
+            filecap = resp.read()
+        finally:
+            # release the local file whether or not the upload succeeded
+            infileobj.close()
+        self.verboseprint(" %s -> %s" % (childpath, filecap))
+        self.verboseprint(" metadata: %s" % (metadata,))
+        return filecap, metadata
diff --git a/src/allmydata/test/ b/src/allmydata/test/
index 22856866..e3b9948d 100644
--- a/src/allmydata/test/
+++ b/src/allmydata/test/
@@ -4,6 +4,7 @@ import os.path
 from twisted.trial import unittest
 from cStringIO import StringIO
 import urllib
+import time
 from allmydata.util import fileutil, hashutil
 from allmydata import uri
@@ -617,3 +618,111 @@ class Cp(SystemTestMixin, CLITestMixin, unittest.TestCase):
         d.addCallback(lambda res: self.do_cli("cp", "--recursive",
                                               dn, "tahoe:"))
         return d
+class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
+    # System test for the 'tahoe backup' CLI command: backs up a small
+    # local tree three times and inspects the resulting Archives/ and
+    # Latest entries through 'tahoe ls' and 'tahoe get'.
+    def writeto(self, path, data):
+        # write 'data' to BASEDIR/home/PATH, creating parent directories
+        # as needed
+        d = os.path.dirname(os.path.join(self.basedir, "home", path))
+        fileutil.make_dirs(d)
+        f = open(os.path.join(self.basedir, "home", path), "w")
+        f.write(data)
+        f.close()
+    def test_backup(self):
+        self.basedir = os.path.dirname(self.mktemp())
+        # create a small local directory with a couple of files
+        source = os.path.join(self.basedir, "home")
+        fileutil.make_dirs(os.path.join(source, "empty"))
+        self.writeto("parent/subdir/foo.txt", "foo")
+        self.writeto("parent/subdir/bar.txt", "bar\n" * 1000)
+        self.writeto("parent/blah.txt", "blah")
+        d = self.set_up_nodes()
+        d.addCallback(lambda res: self.do_cli("create-alias", "tahoe"))
+        # first backup
+        d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
+        def _check0((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+        d.addCallback(_check0)
+        # the target directory must hold exactly Archives and Latest
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups"))
+        def _check1((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(sorted(out.split()), ["Archives", "Latest"])
+        d.addCallback(_check1)
+        # Latest must mirror the top level of the local tree
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Latest"))
+        def _check2((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(sorted(out.split()), ["empty", "parent"])
+        d.addCallback(_check2)
+        # the empty local directory must be backed up as an empty directory
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Latest/empty"))
+        def _check2a((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out.strip(), "")
+        d.addCallback(_check2a)
+        # file contents must round-trip
+        d.addCallback(lambda res: self.do_cli("get", "tahoe:backups/Latest/parent/subdir/foo.txt"))
+        def _check3((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out, "foo")
+        d.addCallback(_check3)
+        # one backup so far, so exactly one archive; remember its name
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
+        def _check4((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.old_archives = out.split()
+            self.failUnlessEqual(len(self.old_archives), 1)
+        d.addCallback(_check4)
+        # second backup, with no local changes: a second archive must
+        # appear, and the first one must sort before it
+        d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
+        def _check5((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.new_archives = out.split()
+            self.failUnlessEqual(len(self.new_archives), 2)
+            self.failUnlessEqual(sorted(self.new_archives)[0],
+                                 self.old_archives[0])
+        d.addCallback(_check5)
+        # third backup, after modifying the local tree
+        def _modify(res):
+            time.sleep(1) # get us to a new second
+            self.writeto("parent/subdir/foo.txt", "FOOF!")
+            # and turn a file into a directory
+            os.unlink(os.path.join(source, "parent/blah.txt"))
+            os.mkdir(os.path.join(source, "parent/blah.txt"))
+            self.writeto("parent/blah.txt/surprise file", "surprise")
+            self.writeto("parent/blah.txt/surprisedir/subfile", "surprise")
+            # turn a directory into a file
+            os.rmdir(os.path.join(source, "empty"))
+            self.writeto("empty", "imagine nothing being here")
+            return self.do_cli("backup", source, "tahoe:backups")
+        d.addCallback(_modify)
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
+        def _check6((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.new_archives = out.split()
+            self.failUnlessEqual(len(self.new_archives), 3)
+            self.failUnlessEqual(sorted(self.new_archives)[0],
+                                 self.old_archives[0])
+        d.addCallback(_check6)
+        # Latest must now show the modified file contents
+        d.addCallback(lambda res: self.do_cli("get", "tahoe:backups/Latest/parent/subdir/foo.txt"))
+        def _check7((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out, "FOOF!")
+            # the old snapshot should not be modified
+            return self.do_cli("get", "tahoe:backups/Archives/%s/parent/subdir/foo.txt" % self.old_archives[0])
+        d.addCallback(_check7)
+        # _check8 receives the result of the 'get' issued by _check7
+        def _check8((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out, "foo")
+        d.addCallback(_check8)
+        return d