From: Brian Warner
Date: Tue, 3 Feb 2009 04:09:02 +0000 (-0700)
Subject: #598: first cut of 'tahoe backup' command: no backupdb, but yes shared-unchanged...
X-Git-Tag: allmydata-tahoe-1.3.0~103
X-Git-Url: https://git.rkrishnan.org/pf/content/en/seg/biz?a=commitdiff_plain;h=cfce8b5eab431772015b4ce5fd38ef327dc4b376;p=tahoe-lafs%2Ftahoe-lafs.git

#598: first cut of 'tahoe backup' command: no backupdb, but yes shared-unchanged-directories and Archives/TIMESTAMP and Latest/
---

diff --git a/src/allmydata/scripts/cli.py b/src/allmydata/scripts/cli.py
index 06d2d2a6..aad9978b 100644
--- a/src/allmydata/scripts/cli.py
+++ b/src/allmydata/scripts/cli.py
@@ -190,6 +190,20 @@ class LnOptions(VDriveOptions):
     def getSynopsis(self):
         return "%s ln FROM TO" % (os.path.basename(sys.argv[0]),)
 
+class BackupOptions(VDriveOptions):
+    optFlags = [
+        ("verbose", "v", "Be noisy about what is happening."),
+        ]
+
+    def parseArgs(self, localdir, topath):
+        self.from_dir = localdir
+        self.to_dir = topath
+
+    def getSynopsis(self):
+        return "%s backup FROM ALIAS:TO" % os.path.basename(sys.argv[0])
+
+    longdesc = """Add a versioned backup of the local FROM directory to a timestamped subdir of the (tahoe) TO/Archives directory, sharing as many files and directories as possible with the previous backup. Creates TO/Latest as a reference to the latest backup. Behaves somewhat like 'rsync -a --link-dest=TO/Archives/(previous) FROM TO/Archives/(new); ln -sf TO/Archives/(new) TO/Latest'."""
+
 class WebopenOptions(VDriveOptions):
     def parseArgs(self, where=''):
         self.where = where
@@ -266,6 +280,7 @@ subCommands = [
     ["rm", None, RmOptions, "Unlink a file or directory in the virtual drive."],
     ["mv", None, MvOptions, "Move a file within the virtual drive."],
     ["ln", None, LnOptions, "Make an additional link to an existing file."],
+    ["backup", None, BackupOptions, "Make target dir look like local dir."],
     ["webopen", None, WebopenOptions, "Open a webbrowser to the root_dir"],
     ["manifest", None, ManifestOptions, "List all files/dirs in a subtree"],
     ["stats", None, StatsOptions, "Print statistics about all files/dirs in a subtree"],
@@ -337,6 +352,11 @@ def ln(options):
     rc = tahoe_mv.mv(options, mode="link")
     return rc
 
+def backup(options):
+    from allmydata.scripts import tahoe_backup
+    rc = tahoe_backup.backup(options)
+    return rc
+
 def webopen(options, opener=None):
     from allmydata.scripts import tahoe_webopen
     rc = tahoe_webopen.webopen(options, opener=opener)
@@ -374,6 +394,7 @@ dispatch = {
     "rm": rm,
     "mv": mv,
     "ln": ln,
+    "backup": backup,
     "webopen": webopen,
     "manifest": manifest,
     "stats": stats,
diff --git a/src/allmydata/scripts/tahoe_backup.py b/src/allmydata/scripts/tahoe_backup.py
new file mode 100644
index 00000000..f68cc7fb
--- /dev/null
+++ b/src/allmydata/scripts/tahoe_backup.py
@@ -0,0 +1,264 @@
+
+import os.path
+import time
+import urllib
+import simplejson
+from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS
+from allmydata.scripts.common_http import do_http
+from allmydata import uri
+from allmydata.util import time_format
+
+def raiseHTTPError(msg, resp):
+    msg = msg + ": %s %s %s" % (resp.status, resp.reason, resp.read())
+    raise RuntimeError(msg)
+
+def readonly(writedircap):
+    return uri.from_string_dirnode(writedircap).get_readonly().to_string()
+
+def parse_old_timestamp(s, options):
+    try:
+        if not s.endswith("Z"):
+            raise ValueError
+        # the "local" in this "localseconds" is superfluous and
+        # misleading. This returns seconds-since-epoch for an
+        # ISO-8601-ish-formatted UTC time string. This might raise
+        # ValueError if the string is not in the right format.
+        when = time_format.iso_utc_time_to_localseconds(s[:-1])
+        return when
+    except ValueError:
+        pass
+    try:
+        # "2008-11-16 10.34 PM" (localtime)
+        if s[-3:] in (" AM", " PM"):
+            # this might raise ValueError
+            when = time.mktime(time.strptime(s[:-3], "%Y-%m-%d %H.%M"))
+            if s[-3:] == " PM":
+                when += 12*60*60
+            return when
+    except ValueError:
+        pass
+    print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
+
+def readdir(dircap, options):
+    # returns a dict of (childname: (type, readcap, metadata)), or None if the
+    # dircap didn't point to a directory
+    url = options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
+    resp = do_http("GET", url)
+    if resp.status != 200:
+        raiseHTTPError("Error during directory GET", resp)
+    jd = simplejson.load(resp)
+    ntype, ndata = jd
+    if ntype != "dirnode":
+        return None
+    contents = {}
+    for (childname, (childtype, childdata)) in ndata["children"].items():
+        contents[childname] = (childtype,
+                               str(childdata["ro_uri"]),
+                               childdata["metadata"])
+    return contents
+
+def get_local_metadata(path):
+    metadata = {}
+
+    # posix stat(2) metadata, depends on the platform
+    os.stat_float_times(True)
+    s = os.stat(path)
+    metadata["ctime"] = s.st_ctime
+    metadata["mtime"] = s.st_mtime
+
+    misc_fields = ("st_mode", "st_ino", "st_dev", "st_uid", "st_gid")
+    macos_misc_fields = ("st_rsize", "st_creator", "st_type")
+    for field in misc_fields + macos_misc_fields:
+        if hasattr(s, field):
+            metadata[field] = getattr(s, field)
+
+    # TODO: extended attributes, like on OS-X's HFS+
+    return metadata
+
+def mkdir(contents, options):
+    url = options['node-url'] + "uri?t=mkdir"
+    resp = do_http("POST", url)
+    if resp.status < 200 or resp.status >= 300:
+        raiseHTTPError("error during mkdir", resp)
+    dircap = str(resp.read().strip())
+    url = options['node-url'] + "uri/%s?t=set_children" % urllib.quote(dircap)
+    body = dict([ (childname, (contents[childname][0],
+                               {"ro_uri": contents[childname][1],
+                                "metadata": contents[childname][2],
+                                }))
+                  for childname in contents
+                  ])
+    resp = do_http("POST", url, simplejson.dumps(body))
+    if resp.status != 200:
+        raiseHTTPError("error during set_children", resp)
+    return dircap
+
+def put_child(dirurl, childname, childcap):
+    assert dirurl[-1] == "/"
+    url = dirurl + urllib.quote(childname) + "?t=uri"
+    resp = do_http("PUT", url, childcap)
+    if resp.status not in (200, 201):
+        raiseHTTPError("error during put_child", resp)
+
+def directory_is_changed(a, b):
+    # each is a mapping from childname to (type, cap, metadata)
+    significant_metadata = ("ctime", "mtime")
+    # other metadata keys are preserved, but changes to them won't trigger a
+    # new backup
+
+    if set(a.keys()) != set(b.keys()):
+        return True
+    for childname in a:
+        a_type, a_cap, a_metadata = a[childname]
+        b_type, b_cap, b_metadata = b[childname]
+        if a_type != b_type:
+            return True
+        if a_cap != b_cap:
+            return True
+        for k in significant_metadata:
+            if a_metadata.get(k) != b_metadata.get(k):
+                return True
+    return False
+
+def backup(options):
+    nodeurl = options['node-url']
+    from_dir = options.from_dir
+    to_dir = options.to_dir
+    if options['quiet']:
+        verbosity = 0
+    else:
+        verbosity = 2
+    stdin = options.stdin
+    stdout = options.stdout
+    stderr = options.stderr
+
+    rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
+    to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
+    if path:
+        to_url += escape_path(path)
+    if not to_url.endswith("/"):
+        to_url += "/"
+
+    archives_url = to_url + "Archives/"
+    latest_url = to_url + "Latest"
+
+    # first step: make sure the target directory exists, as well as the
+    # Archives/ subdirectory.
+    resp = do_http("GET", archives_url + "?t=json")
+    if resp.status == 404:
+        resp = do_http("POST", archives_url + "?t=mkdir")
+        if resp.status != 200:
+            print >>stderr, "Unable to create target directory: %s %s %s" % \
+                  (resp.status, resp.reason, resp.read())
+            return 1
+        archives_dir = {}
+    else:
+        jdata = simplejson.load(resp)
+        (otype, attrs) = jdata
+        archives_dir = attrs["children"]
+
+    # second step: locate the most recent backup in TODIR/Archives/*
+    latest_backup_time = 0
+    latest_backup_name = None
+    latest_backup_dircap = None
+
+    # we have various time formats. The allmydata.com windows backup tool
+    # appears to create things like "2008-11-16 10.34 PM". This script
+    # creates things like "2009-11-16_17:34:56Z".
+    for archive_name in archives_dir.keys():
+        if archives_dir[archive_name][0] != "dirnode":
+            continue
+        when = parse_old_timestamp(archive_name, options)
+        if when is not None:
+            if when > latest_backup_time:
+                latest_backup_time = when
+                latest_backup_name = archive_name
+                latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
+
+    # third step: process the tree
+    new_backup_dircap = Node().process(options.from_dir,
+                                       latest_backup_dircap,
+                                       options)
+    print >>stdout, "new backup done"
+
+    # fourth: attach the new backup to the list
+    new_readonly_backup_dircap = readonly(new_backup_dircap)
+    now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
+
+    put_child(archives_url, now, new_readonly_backup_dircap)
+    put_child(to_url, "Latest", new_readonly_backup_dircap)
+
+    print >>stdout, "backup done"
+    # done!
+    return 0
+
+
+class Node:
+    def verboseprint(self, msg):
+        if self.options["verbose"]:
+            print >>self.options.stdout, msg
+
+    def process(self, localpath, olddircap, options):
+        # returns newdircap
+        self.options = options
+
+        self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap))
+        olddircontents = {}
+        if olddircap:
+            olddircontents = readdir(olddircap, options)
+
+        newdircontents = {} # childname -> (type, rocap, metadata)
+        for child in os.listdir(localpath):
+            childpath = os.path.join(localpath, child)
+            if os.path.isdir(childpath):
+                metadata = get_local_metadata(childpath)
+                oldchildcap = None
+                if olddircontents is not None and child in olddircontents:
+                    oldchildcap = olddircontents[child][1]
+                newchilddircap = self.recurse(childpath, oldchildcap)
+                newdircontents[child] = ("dirnode", newchilddircap, metadata)
+            elif os.path.isfile(childpath):
+                newfilecap, metadata = self.upload(childpath)
+                newdircontents[child] = ("filenode", newfilecap, metadata)
+            else:
+                raise RuntimeError("how do I back this up?")
+
+        if (olddircap
+            and olddircontents is not None
+            and not directory_is_changed(newdircontents, olddircontents)
+            ):
+            self.verboseprint(" %s not changed, re-using old directory" % localpath)
+            # yay! they're identical!
+            return olddircap
+        else:
+            self.verboseprint(" %s changed, making new directory" % localpath)
+            # something changed, or there was no previous directory, so we
+            # must make a new directory
+            newdircap = mkdir(newdircontents, options)
+            return readonly(newdircap)
+
+    def recurse(self, localpath, olddircap):
+        n = self.__class__()
+        return n.process(localpath, olddircap, self.options)
+
+    def upload(self, childpath):
+        self.verboseprint("uploading %s.." % childpath)
+        # we can use the backupdb here
+        #s = os.stat(childpath)
+        # ...
+        # if we go with the old file, we're obligated to use the old
+        # metadata, to make sure it matches the metadata for this child in
+        # the old parent directory
+        # return oldcap, old_metadata
+
+        metadata = get_local_metadata(childpath)
+        infileobj = open(os.path.expanduser(childpath), "rb")
+        url = self.options['node-url'] + "uri"
+        resp = do_http("PUT", url, infileobj)
+        if resp.status not in (200, 201):
+            raiseHTTPError("Error during file PUT", resp)
+        filecap = resp.read().strip()
+        self.verboseprint(" %s -> %s" % (childpath, filecap))
+        self.verboseprint(" metadata: %s" % (metadata,))
+        return filecap, metadata
+
diff --git a/src/allmydata/test/test_cli.py b/src/allmydata/test/test_cli.py
index 22856866..e3b9948d 100644
--- a/src/allmydata/test/test_cli.py
+++ b/src/allmydata/test/test_cli.py
@@ -4,6 +4,7 @@ import os.path
 from twisted.trial import unittest
 from cStringIO import StringIO
 import urllib
+import time
 
 from allmydata.util import fileutil, hashutil
 from allmydata import uri
@@ -617,3 +618,111 @@ class Cp(SystemTestMixin, CLITestMixin, unittest.TestCase):
         d.addCallback(lambda res: self.do_cli("cp", "--recursive",
                                               dn, "tahoe:"))
         return d
+
+class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
+    def writeto(self, path, data):
+        d = os.path.dirname(os.path.join(self.basedir, "home", path))
+        fileutil.make_dirs(d)
+        f = open(os.path.join(self.basedir, "home", path), "w")
+        f.write(data)
+        f.close()
+
+    def test_backup(self):
+        self.basedir = os.path.dirname(self.mktemp())
+
+        # create a small local directory with a couple of files
+        source = os.path.join(self.basedir, "home")
+        fileutil.make_dirs(os.path.join(source, "empty"))
+        self.writeto("parent/subdir/foo.txt", "foo")
+        self.writeto("parent/subdir/bar.txt", "bar\n" * 1000)
+        self.writeto("parent/blah.txt", "blah")
+
+        d = self.set_up_nodes()
+        d.addCallback(lambda res: self.do_cli("create-alias", "tahoe"))
+        d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
+        def _check0((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+        d.addCallback(_check0)
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups"))
+        def _check1((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(sorted(out.split()), ["Archives", "Latest"])
+        d.addCallback(_check1)
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Latest"))
+        def _check2((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(sorted(out.split()), ["empty", "parent"])
+        d.addCallback(_check2)
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Latest/empty"))
+        def _check2a((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out.strip(), "")
+        d.addCallback(_check2a)
+        d.addCallback(lambda res: self.do_cli("get", "tahoe:backups/Latest/parent/subdir/foo.txt"))
+        def _check3((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out, "foo")
+        d.addCallback(_check3)
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
+        def _check4((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.old_archives = out.split()
+            self.failUnlessEqual(len(self.old_archives), 1)
+        d.addCallback(_check4)
+
+
+        d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
+        def _check5((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.new_archives = out.split()
+            self.failUnlessEqual(len(self.new_archives), 2)
+            self.failUnlessEqual(sorted(self.new_archives)[0],
+                                 self.old_archives[0])
+        d.addCallback(_check5)
+
+        def _modify(res):
+            time.sleep(1) # get us to a new second
+            self.writeto("parent/subdir/foo.txt", "FOOF!")
+            # and turn a file into a directory
+            os.unlink(os.path.join(source, "parent/blah.txt"))
+            os.mkdir(os.path.join(source, "parent/blah.txt"))
+            self.writeto("parent/blah.txt/surprise file", "surprise")
+            self.writeto("parent/blah.txt/surprisedir/subfile", "surprise")
+            # turn a directory into a file
+            os.rmdir(os.path.join(source, "empty"))
+            self.writeto("empty", "imagine nothing being here")
+            return self.do_cli("backup", source, "tahoe:backups")
+        d.addCallback(_modify)
+        d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
+        def _check6((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.new_archives = out.split()
+            self.failUnlessEqual(len(self.new_archives), 3)
+            self.failUnlessEqual(sorted(self.new_archives)[0],
+                                 self.old_archives[0])
+        d.addCallback(_check6)
+        d.addCallback(lambda res: self.do_cli("get", "tahoe:backups/Latest/parent/subdir/foo.txt"))
+        def _check7((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out, "FOOF!")
+            # the old snapshot should not be modified
+            return self.do_cli("get", "tahoe:backups/Archives/%s/parent/subdir/foo.txt" % self.old_archives[0])
+        d.addCallback(_check7)
+        def _check8((rc, out, err)):
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            self.failUnlessEqual(out, "foo")
+        d.addCallback(_check8)
+
+        return d
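
A quick sketch (not part of the patch) of how the archive names written by backup() round-trip through the allmydata.util.time_format helpers the patch imports; the sample date strings below are illustrative only:

    import time
    from allmydata.util import time_format

    # backup() names each archive like "2009-02-03_04:09:02Z" (iso_utc plus
    # a trailing "Z"), stored under TO/Archives/
    new_name = time_format.iso_utc(int(time.time()), sep="_") + "Z"

    # parse_old_timestamp() strips the "Z" and recovers seconds-since-epoch
    # from the ISO-8601-ish remainder (despite the "local" in the name):
    when = time_format.iso_utc_time_to_localseconds(new_name[:-1])

    # names from the old allmydata.com windows backup tool, like
    # "2008-11-16 10.34 PM" (localtime), take the strptime/mktime branch:
    old = time.mktime(time.strptime("2008-11-16 10.34", "%Y-%m-%d %H.%M"))
    old += 12*60*60  # the " PM" suffix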
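
The "shared-unchanged-directories" behavior from the subject line rests on directory_is_changed() above: a subtree keeps the previous archive's dircap only when every child matches in type, read-cap, and ctime/mtime. A minimal sketch of that check, with made-up caps and times:

    from allmydata.scripts.tahoe_backup import directory_is_changed

    old = {"foo.txt": ("filenode", "URI:CHK:aaa", {"ctime": 1.0, "mtime": 1.0})}
    new = dict(old)
    assert not directory_is_changed(new, old)  # identical: old dircap is re-used

    # a touched file gets a new mtime (and usually a new cap), so its parent
    # directory -- and every ancestor up to the root -- gets a new dirnode:
    new["foo.txt"] = ("filenode", "URI:CHK:bbb", {"ctime": 1.0, "mtime": 2.0})
    assert directory_is_changed(new, old)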