From ca32db5b3992623cdf541c31e7adcead3e449ced Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Thu, 5 Feb 2009 22:07:01 -0700 Subject: [PATCH] #598: add cli+backupdb tests, improve user display, update docs, move docs out of proposed/ --- docs/backupdb.txt | 160 ++++++++++++++++ docs/proposed/backupdb.txt | 188 ------------------- src/allmydata/scripts/tahoe_backup.py | 252 ++++++++++++++------------ src/allmydata/test/test_cli.py | 108 ++++++++++- 4 files changed, 403 insertions(+), 305 deletions(-) create mode 100644 docs/backupdb.txt delete mode 100644 docs/proposed/backupdb.txt diff --git a/docs/backupdb.txt b/docs/backupdb.txt new file mode 100644 index 00000000..7c5905e8 --- /dev/null +++ b/docs/backupdb.txt @@ -0,0 +1,160 @@ += The Tahoe BackupDB = + +To speed up backup operations, Tahoe maintains a small database known as the +"backupdb". This is used to avoid re-uploading files which have already been +uploaded recently. + +This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite +single-file database. It is used by the "tahoe backup" command (unless the +--no-backupdb option is included). In the future, it will also be used by +"tahoe mirror", and by "tahoe cp" when the --use-backupdb option is included. + +The purpose of this database is specifically to manage the file-to-cap +translation (the "upload" step). It does not address directory updates. A +future version will include a directory cache. + +The overall goal of optimizing backup is to reduce the work required when the +source disk has not changed since the last backup. In the ideal case, running +"tahoe backup" twice in a row, with no intervening changes to the disk, will +not require any network traffic. + +This database is optional. If it is deleted, the worst effect is that a +subsequent backup operation may use more effort (network bandwidth, CPU +cycles, and disk IO) than it would have without the backupdb. 
+ +The database uses sqlite3, which is included as part of the standard python +library with python2.5 and later. For python2.4, please install the +"pysqlite2" package (which, despite the name, actually provides sqlite3 +rather than sqlite2). + +== Schema == + +The database contains the following tables: + +CREATE TABLE version +( + version integer -- contains one row, set to 1 +); + +CREATE TABLE local_files +( + path varchar(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn) + size integer, -- os.stat(fn)[stat.ST_SIZE] + mtime number, -- os.stat(fn)[stat.ST_MTIME] + ctime number, -- os.stat(fn)[stat.ST_CTIME] + fileid integer +); + +CREATE TABLE caps +( + fileid integer PRIMARY KEY AUTOINCREMENT, + filecap varchar(256) UNIQUE -- URI:CHK:... +); + +CREATE TABLE last_upload +( + fileid INTEGER PRIMARY KEY, + last_uploaded TIMESTAMP, + last_checked TIMESTAMP +); + +Notes: if we extend the backupdb to assist with directory maintenance (see +below), we may need paths in multiple places, so it would make sense to +create a table for them, and change the local_files table to refer to a +pathid instead of an absolute path: + +CREATE TABLE paths +( + path varchar(1024) UNIQUE, -- index + pathid integer PRIMARY KEY AUTOINCREMENT +); + +== Operation == + +The upload process starts with a pathname (like ~/.emacs) and wants to end up +with a file-cap (like URI:CHK:...). + +The first step is to convert the path to an absolute form +(/home/warner/.emacs) and do a lookup in the local_files table. If the path is +not present in this table, the file must be uploaded. The upload process is: + + 1. record the file's size, creation time, and modification time + 2. upload the file into the grid, obtaining an immutable file read-cap + 3. add an entry to the 'caps' table, with the read-cap, to get a fileid + 4. add an entry to the 'last_upload' table, with the current time + 5. 
add an entry to the 'local_files' table, with the fileid, the path, + and the local file's size/ctime/mtime + +If the path *is* present in 'local_files', the easy-to-compute identifying +information is compared: file size and ctime/mtime. If these differ, the file +must be uploaded. The row is removed from the local_files table, and the +upload process above is followed. + +If the path is present but ctime or mtime differs, the file may have changed. +If the size differs, then the file has certainly changed. At this point, a +future version of the "backup" command might hash the file and look for a +match in an as-yet-defined table, in the hopes that the file has simply been +moved from somewhere else on the disk. This enhancement requires changes to +the Tahoe upload API before it can be significantly more efficient than +simply handing the file to Tahoe and relying upon the normal convergence to +notice the similarity. + +If ctime, mtime, or size is different, the client will upload the file, as +above. + +If these identifiers are the same, the client will assume that the file is +unchanged (unless the --ignore-timestamps option is provided, in which case +the client always re-uploads the file), and it may be allowed to skip the +upload. For safety, however, we require that the client periodically perform a +filecheck on these probably-already-uploaded files, and re-upload anything +that doesn't look healthy. The client looks the fileid up in the +'last_upload' table, to see how long it has been since the file was last +checked. + +A "random early check" algorithm should be used, in which a check is +performed with a probability that increases with the age of the previous +results. E.g. files that were last checked within a month are not checked, +files that were checked 5 weeks ago are re-checked with 25% probability, 6 +weeks with 50%, more than 8 weeks are always checked. 
This reduces the +"thundering herd" of filechecks-on-everything that would otherwise result +when a backup operation is run one month after the original backup. If a +filecheck reveals the file is not healthy, it is re-uploaded. + +If the filecheck shows the file is healthy, or if the filecheck was skipped, +the client gets to skip the upload, and uses the previous filecap (from the +'caps' table) to add to the parent directory. + +If a new file is uploaded, a new entry is put in the 'caps' and 'last_upload' +tables, and an entry is made in the 'local_files' table to reflect the mapping +from local disk pathname to uploaded filecap. If an old file is re-uploaded, +the 'last_upload' entry is updated with the new timestamps. If an old file is +checked and found healthy, the 'last_upload' entry is updated. + +Relying upon timestamps is a compromise between efficiency and safety: a file +which is modified without changing the timestamp or size will be treated as +unmodified, and the "tahoe backup" command will not copy the new contents +into the grid. The --ignore-timestamps option can be used to disable this optimization, +forcing every byte of the file to be hashed and encoded. + +== Directory Caching == + +A future version of the backupdb will also record a secure hash of the most +recent contents of each tahoe directory that was used in the last backup run. +The directories created by the "tahoe backup" command are all read-only, so +it should be difficult to violate the assumption that these directories are +unmodified since the previous pass. In the future, Tahoe will provide truly +immutable directories, making this assumption even more solid. + +In the current implementation, when the backup algorithm is faced with the +decision to either create a new directory or share an old one, it must read +the contents of the old directory to compare it against the desired new +contents. 
This means that a "null backup" (performing a backup when nothing +has been changed) must still read every Tahoe directory from the previous +backup. + +With a directory-caching backupdb, these directory reads will be bypassed, +and the null backup will use minimal network bandwidth: one directory read +and two modifies. The Archives/ directory must be read to locate the latest +backup, and must be modified to add a new snapshot, and the Latest/ directory +will be updated to point to that same snapshot. + diff --git a/docs/proposed/backupdb.txt b/docs/proposed/backupdb.txt deleted file mode 100644 index c9618e6d..00000000 --- a/docs/proposed/backupdb.txt +++ /dev/null @@ -1,188 +0,0 @@ -= PRELIMINARY = - -This document is a description of a feature which is not yet implemented, -added here to solicit feedback and to describe future plans. This document is -subject to revision or withdrawal at any moment. Until this notice is -removed, consider this entire document to be a figment of your imagination. - -= The Tahoe BackupDB = - -To speed up backup operations, Tahoe maintains a small database known as the -"backupdb". This is used to avoid re-uploading files which have already been -uploaded recently. - -This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite -single-file database. It is used by the "tahoe backup" command, and by the -"tahoe cp" command when the --use-backupdb option is included. - -The purpose of this database is specifically to manage the file-to-cap -translation (the "upload" step). It does not address directory updates. - -The overall goal of optimizing backup is to reduce the work required when the -source disk has not changed since the last backup. In the ideal case, running -"tahoe backup" twice in a row, with no intervening changes to the disk, will -not require any network traffic. - -This database is optional. 
If it is deleted, the worst effect is that a -subsequent backup operation may use more effort (network bandwidth, CPU -cycles, and disk IO) than it would have without the backupdb. - -== Schema == - -The database contains the following tables: - -CREATE TABLE version -( - version integer # contains one row, set to 0 -); - -CREATE TABLE last_upload -( - path varchar(1024), # index, this is os.path.abspath(fn) - size integer, # os.stat(fn)[stat.ST_SIZE] - mtime number, # os.stat(fn)[stat.ST_MTIME] - fileid integer -); - -CREATE TABLE caps -( - fileid integer PRIMARY KEY AUTOINCREMENT, - filecap varchar(256), # URI:CHK:... - last_uploaded timestamp, - last_checked timestamp -); - -CREATE TABLE keys_to_files -( - readkey varchar(256) PRIMARY KEY, # index, AES key portion of filecap - fileid integer -); - -Notes: if we extend the backupdb to assist with directory maintenance (see -below), we may need paths in multiple places, so it would make sense to -create a table for them, and change the last_upload table to refer to a -pathid instead of an absolute path: - -CREATE TABLE paths -( - path varchar(1024), # index - pathid integer PRIMARY KEY AUTOINCREMENT -); - -== Operation == - -The upload process starts with a pathname (like ~/.emacs) and wants to end up -with a file-cap (like URI:CHK:...). - -The first step is to convert the path to an absolute form -(/home/warner/emacs) and do a lookup in the last_upload table. If the path is -not present in this table, the file must be uploaded. The upload process is: - - 1. record the file's size and modification time - 2. upload the file into the grid, obtaining an immutable file read-cap - 3. add an entry to the 'caps' table, with the read-cap, and the current time - 4. extract the read-key from the read-cap, add an entry to 'keys_to_files' - 5. add an entry to 'last_upload' - -If the path *is* present in 'last_upload', the easy-to-compute identifying -information is compared: file size and modification time. 
If these differ, -the file must be uploaded. The row is removed from the last_upload table, and -the upload process above is followed. - -If the path is present but the mtime differs, the file may have changed. If -the size differs, then the file has certainly changed. The client will -compute the CHK read-key for the file by hashing its contents, using exactly -the same algorithm as the node does when it uploads a file (including -~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to -see if this file has been uploaded before: perhaps the file was moved from -elsewhere on the disk. If no match is found, the file must be uploaded, so -the upload process above is follwed. - -If the read-key *is* found in the 'keys_to_files' table, then the file has -been uploaded before, but we should consider performing a file check / verify -operation to make sure we can skip a new upload. The fileid is used to -retrieve the entry from the 'caps' table, and the last_checked timestamp is -examined. If this timestamp is too old, a filecheck operation should be -performed, and the file repaired if the results are not satisfactory. A -"random early check" algorithm should be used, in which a check is performed -with a probability that increases with the age of the previous results. E.g. -files that were last checked within a month are not checked, files that were -checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%, -more than 8 weeks are always checked. This reduces the "thundering herd" of -filechecks-on-everything that would otherwise result when a backup operation -is run one month after the original backup. The readkey can be submitted to -the upload operation, to remove a duplicate hashing pass through the file and -reduce the disk IO. In a future version of the storage server protocol, this -could also improve the "streamingness" of the upload process. 
- -If the file's size and mtime match, the file is considered to be unmodified, -and the last_checked timestamp from the 'caps' table is examined as above -(possibly resulting in a filecheck or repair). The --no-timestamps option -disables this check: this removes the danger of false-positives (i.e. not -uploading a new file, because it appeared to be the same as a previously -uploaded one), but increases the amount of disk IO that must be performed -(every byte of every file must be hashed to compute the readkey). - -This algorithm is summarized in the following pseudocode: - -{{{ - def backup(path): - abspath = os.path.abspath(path) - result = check_for_upload(abspath) - now = time.time() - if result == MUST_UPLOAD: - filecap = upload(abspath, key=result.readkey) - fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)", - (filecap, now, now)) - db("INSERT INTO keys_to_files", (result.readkey, filecap)) - db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid)) - if result in (MOVED, ALREADY_UPLOADED): - age = now - result.last_checked - probability = (age - 1*MONTH) / 1*MONTH - probability = min(max(probability, 0.0), 1.0) - if random.random() < probability: - do_filecheck(result.filecap) - if result == MOVED: - db("INSERT INTO last_upload", - (abspath, current_size, current_mtime, result.fileid)) - - - def check_for_upload(abspath): - row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s" - % abspath) - if not row: - return check_moved(abspath) - current_size = os.stat(abspath)[stat.ST_SIZE] - current_mtime = os.stat(abspath)[stat.ST_MTIME] - (last_size,last_mtime,last_fileid) = row - if file_changed(current_size, last_size, current_mtime, last_mtime): - db("DELETE FROM last_upload WHERE fileid=%s" % fileid) - return check_moved(abspath) - (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" + - " WHERE fileid == %s" % last_fileid) - return ALREADY_UPLOADED(filecap=filecap, 
last_checked=last_checked) - - def file_changed(current_size, last_size, current_mtime, last_mtime): - if last_size != current_size: - return True - if NO_TIMESTAMPS: - return True - if last_mtime != current_mtime: - return True - return False - - def check_moved(abspath): - readkey = hash_with_convergence(abspath) - fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s"%readkey) - if not fileid: - return MUST_UPLOAD(readkey=readkey) - (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" + - " WHERE fileid == %s" % fileid) - return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked) - - def do_filecheck(filecap): - health = check(filecap) - if health < DESIRED: - repair(filecap) - -}}} diff --git a/src/allmydata/scripts/tahoe_backup.py b/src/allmydata/scripts/tahoe_backup.py index 205c25ff..6d44ee69 100644 --- a/src/allmydata/scripts/tahoe_backup.py +++ b/src/allmydata/scripts/tahoe_backup.py @@ -50,24 +50,6 @@ def parse_old_timestamp(s, options): print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s return None -def readdir(dircap, options): - # returns a dict of (childname: (type, readcap, metadata)), or None if the - # dircap didn't point to a directory - url = options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap) - resp = do_http("GET", url) - if resp.status != 200: - raiseHTTPError("Error during directory GET", resp) - jd = simplejson.load(resp) - ntype, ndata = jd - if ntype != "dirnode": - return None - contents = {} - for (childname, (childtype, childdata)) in ndata["children"].items(): - contents[childname] = (childtype, - str(childdata["ro_uri"]), - childdata["metadata"]) - return contents - def get_local_metadata(path): metadata = {} @@ -131,100 +113,120 @@ def directory_is_changed(a, b): return True return False -def backup(options): - nodeurl = options['node-url'] - from_dir = options.from_dir - to_dir = options.to_dir - if options['quiet']: - verbosity = 0 - else: - verbosity = 
2 - stdin = options.stdin - stdout = options.stdout - stderr = options.stderr - - use_backupdb = not options["no-backupdb"] - options.backupdb = None - if use_backupdb: - bdbfile = os.path.join(options["node-directory"], - "private", "backupdb.sqlite") - bdbfile = os.path.abspath(bdbfile) - options.backupdb = backupdb.get_backupdb(bdbfile) - - rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS) - to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap) - if path: - to_url += escape_path(path) - if not to_url.endswith("/"): - to_url += "/" - - archives_url = to_url + "Archives/" - latest_url = to_url + "Latest" - - # first step: make sure the target directory exists, as well as the - # Archives/ subdirectory. - resp = do_http("GET", archives_url + "?t=json") - if resp.status == 404: - resp = do_http("POST", archives_url + "?t=mkdir") - if resp.status != 200: - print >>stderr, "Unable to create target directory: %s %s %s" % \ - (resp.status, resp.reason, resp.read()) - return 1 - archives_dir = {} - else: - jdata = simplejson.load(resp) - (otype, attrs) = jdata - archives_dir = attrs["children"] - - # second step: locate the most recent backup in TODIR/Archives/* - latest_backup_time = 0 - latest_backup_name = None - latest_backup_dircap = None - - # we have various time formats. The allmydata.com windows backup tool - # appears to create things like "2008-11-16 10.34 PM". This script - # creates things like "2009-11-16--17.34Z". 
- for archive_name in archives_dir.keys(): - if archives_dir[archive_name][0] != "dirnode": - continue - when = parse_old_timestamp(archive_name, options) - if when is not None: - if when > latest_backup_time: - latest_backup_time = when - latest_backup_name = archive_name - latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"]) - - # third step: process the tree - new_backup_dircap = Node().process(options.from_dir, - latest_backup_dircap, - options) - print >>stdout, "new backup done" - - # fourth: attach the new backup to the list - new_readonly_backup_dircap = readonly(new_backup_dircap) - now = time_format.iso_utc(int(time.time()), sep="_") + "Z" - - put_child(archives_url, now, new_readonly_backup_dircap) - put_child(to_url, "Latest", new_readonly_backup_dircap) - - print >>stdout, "backup done" - # done! - return 0 - - -class Node: +class BackerUpper: + def __init__(self, options): + self.options = options + self.files_uploaded = 0 + self.files_reused = 0 + self.files_checked = 0 + self.directories_read = 0 + self.directories_created = 0 + self.directories_reused = 0 + self.directories_checked = 0 + + def run(self): + options = self.options + nodeurl = options['node-url'] + from_dir = options.from_dir + to_dir = options.to_dir + self.verbosity = 1 + if options['quiet']: + self.verbosity = 0 + if options['verbose']: + self.verbosity = 2 + stdin = options.stdin + stdout = options.stdout + stderr = options.stderr + + self.backupdb = None + use_backupdb = not options["no-backupdb"] + if use_backupdb: + bdbfile = os.path.join(options["node-directory"], + "private", "backupdb.sqlite") + bdbfile = os.path.abspath(bdbfile) + self.backupdb = backupdb.get_backupdb(bdbfile) + + rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS) + to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap) + if path: + to_url += escape_path(path) + if not to_url.endswith("/"): + to_url += "/" + + archives_url = to_url + "Archives/" + latest_url = to_url + 
"Latest" + + # first step: make sure the target directory exists, as well as the + # Archives/ subdirectory. + resp = do_http("GET", archives_url + "?t=json") + if resp.status == 404: + resp = do_http("POST", archives_url + "?t=mkdir") + if resp.status != 200: + print >>stderr, "Unable to create target directory: %s %s %s" % \ + (resp.status, resp.reason, resp.read()) + return 1 + archives_dir = {} + else: + jdata = simplejson.load(resp) + (otype, attrs) = jdata + archives_dir = attrs["children"] + + # second step: locate the most recent backup in TODIR/Archives/* + latest_backup_time = 0 + latest_backup_name = None + latest_backup_dircap = None + + # we have various time formats. The allmydata.com windows backup tool + # appears to create things like "2008-11-16 10.34 PM". This script + # creates things like "2009-11-16--17.34Z". + for archive_name in archives_dir.keys(): + if archives_dir[archive_name][0] != "dirnode": + continue + when = parse_old_timestamp(archive_name, options) + if when is not None: + if when > latest_backup_time: + latest_backup_time = when + latest_backup_name = archive_name + latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"]) + + # third step: process the tree + new_backup_dircap = self.process(options.from_dir, latest_backup_dircap) + + # fourth: attach the new backup to the list + new_readonly_backup_dircap = readonly(new_backup_dircap) + now = time_format.iso_utc(int(time.time()), sep="_") + "Z" + + put_child(archives_url, now, new_readonly_backup_dircap) + put_child(to_url, "Latest", new_readonly_backup_dircap) + + if self.verbosity >= 1: + print >>stdout, (" %d files uploaded (%d reused), " + "%d directories created (%d reused)" + % (self.files_uploaded, + self.files_reused, + self.directories_created, + self.directories_reused)) + if self.verbosity >= 2: + print >>stdout, (" %d files checked, %d directories checked, " + "%d directories read" + % (self.files_checked, + self.directories_checked, + 
self.directories_read)) + print >>stdout, " backup done" + # done! + return 0 + def verboseprint(self, msg): - if self.options["verbose"]: + if self.verbosity >= 2: print >>self.options.stdout, msg - def process(self, localpath, olddircap, options): + def process(self, localpath, olddircap): # returns newdircap - self.options = options self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap)) olddircontents = {} if olddircap: - olddircontents = readdir(olddircap, options) + olddircontents = self.readdir(olddircap) newdircontents = {} # childname -> (type, rocap, metadata) for child in os.listdir(localpath): @@ -234,7 +236,8 @@ class Node: oldchildcap = None if olddircontents is not None and child in olddircontents: oldchildcap = olddircontents[child][1] - newchilddircap = self.recurse(childpath, oldchildcap) + # recurse on the child directory + newchilddircap = self.process(childpath, oldchildcap) newdircontents[child] = ("dirnode", newchilddircap, metadata) elif os.path.isfile(childpath): newfilecap, metadata = self.upload(childpath) @@ -248,25 +251,21 @@ class Node: ): self.verboseprint(" %s not changed, re-using old directory" % localpath) # yay! they're identical! 
+ self.directories_reused += 1 return olddircap else: self.verboseprint(" %s changed, making new directory" % localpath) # something changed, or there was no previous directory, so we # must make a new directory - newdircap = mkdir(newdircontents, options) + newdircap = mkdir(newdircontents, self.options) + self.directories_created += 1 return readonly(newdircap) - def recurse(self, localpath, olddircap): - n = self.__class__() - return n.process(localpath, olddircap, self.options) - - def check_backupdb(self, childpath): - if not self.options.backupdb: + if not self.backupdb: return True, None use_timestamps = not self.options["ignore-timestamps"] - bdb = self.options.backupdb - r = bdb.check_file(childpath, use_timestamps) + r = self.backupdb.check_file(childpath, use_timestamps) if not r.was_uploaded(): return True, r @@ -281,6 +280,7 @@ class Node: self.verboseprint("checking %s" % filecap) nodeurl = self.options['node-url'] checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap) + self.files_checked += 1 resp = do_http("POST", checkurl) if resp.status != 200: # can't check, so we must assume it's bad @@ -295,6 +295,25 @@ class Node: r.did_check_healthy(cr) return False, r + def readdir(self, dircap): + # returns a dict of (childname: (type, readcap, metadata)), or None + # if the dircap didn't point to a directory + self.directories_read += 1 + url = self.options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap) + resp = do_http("GET", url) + if resp.status != 200: + raiseHTTPError("Error during directory GET", resp) + jd = simplejson.load(resp) + ntype, ndata = jd + if ntype != "dirnode": + return None + contents = {} + for (childname, (childtype, childdata)) in ndata["children"].items(): + contents[childname] = (childtype, + str(childdata["ro_uri"]), + childdata["metadata"]) + return contents + def upload(self, childpath): #self.verboseprint("uploading %s.." 
% childpath) metadata = get_local_metadata(childpath) @@ -316,9 +335,14 @@ class Node: if bdb_results: bdb_results.did_upload(filecap) + self.files_uploaded += 1 return filecap, metadata else: self.verboseprint("skipping %s.." % childpath) + self.files_reused += 1 return bdb_results.was_uploaded(), metadata +def backup(options): + bu = BackerUpper(options) + return bu.run() diff --git a/src/allmydata/test/test_cli.py b/src/allmydata/test/test_cli.py index e3b9948d..ebfd5ed2 100644 --- a/src/allmydata/test/test_cli.py +++ b/src/allmydata/test/test_cli.py @@ -5,6 +5,7 @@ from twisted.trial import unittest from cStringIO import StringIO import urllib import time +import re from allmydata.util import fileutil, hashutil from allmydata import uri @@ -16,7 +17,7 @@ _hush_pyflakes = [tahoe_ls, tahoe_get, tahoe_put, tahoe_rm, tahoe_cp] from allmydata.scripts.common import DEFAULT_ALIAS, get_aliases -from allmydata.scripts import cli, debug, runner +from allmydata.scripts import cli, debug, runner, backupdb from allmydata.test.common import SystemTestMixin from twisted.internet import threads # CLI tests use deferToThread @@ -627,9 +628,23 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase): f.write(data) f.close() + def count_output(self, out): + mo = re.search(r"(\d)+ files uploaded \((\d+) reused\), (\d+) directories created \((\d+) reused\)", out) + return [int(s) for s in mo.groups()] + + def count_output2(self, out): + mo = re.search(r"(\d)+ files checked, (\d+) directories checked, (\d+) directories read", out) + return [int(s) for s in mo.groups()] + def test_backup(self): self.basedir = os.path.dirname(self.mktemp()) + # is the backupdb available? If so, we test that a second backup does + # not create new directories. 
+ hush = StringIO() + have_bdb = backupdb.get_backupdb(os.path.join(self.basedir, "dbtest"), + hush) + # create a small local directory with a couple of files source = os.path.join(self.basedir, "home") fileutil.make_dirs(os.path.join(source, "empty")) @@ -643,7 +658,15 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase): def _check0((rc, out, err)): self.failUnlessEqual(err, "") self.failUnlessEqual(rc, 0) + fu, fr, dc, dr = self.count_output(out) + # foo.txt, bar.txt, blah.txt + self.failUnlessEqual(fu, 3) + self.failUnlessEqual(fr, 0) + # empty, home, home/parent, home/parent/subdir + self.failUnlessEqual(dc, 4) + self.failUnlessEqual(dr, 0) d.addCallback(_check0) + d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups")) def _check1((rc, out, err)): self.failUnlessEqual(err, "") @@ -678,12 +701,62 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase): d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups")) + def _check4a((rc, out, err)): + # second backup should reuse everything, if the backupdb is + # available + self.failUnlessEqual(err, "") + self.failUnlessEqual(rc, 0) + if have_bdb: + fu, fr, dc, dr = self.count_output(out) + # foo.txt, bar.txt, blah.txt + self.failUnlessEqual(fu, 0) + self.failUnlessEqual(fr, 3) + # empty, home, home/parent, home/parent/subdir + self.failUnlessEqual(dc, 0) + self.failUnlessEqual(dr, 4) + d.addCallback(_check4a) + + if have_bdb: + # sneak into the backupdb, crank back the "last checked" + # timestamp to force a check on all files + def _reset_last_checked(res): + dbfile = os.path.join(self.basedir, + "client0", "private", "backupdb.sqlite") + self.failUnless(os.path.exists(dbfile), dbfile) + bdb = backupdb.get_backupdb(dbfile) + bdb.cursor.execute("UPDATE last_upload SET last_checked=0") + bdb.connection.commit() + + d.addCallback(_reset_last_checked) + + d.addCallback(lambda res: + self.do_cli("backup", "--verbose", source, "tahoe:backups")) + def _check4b((rc, out, err)): 
+ # we should check all files, and re-use all of them. None of + # the directories should have been changed. + self.failUnlessEqual(err, "") + self.failUnlessEqual(rc, 0) + fu, fr, dc, dr = self.count_output(out) + fchecked, dchecked, dread = self.count_output2(out) + self.failUnlessEqual(fchecked, 3) + self.failUnlessEqual(fu, 0) + self.failUnlessEqual(fr, 3) + # TODO: backupdb doesn't do dirs yet; when it does, this will + # change to dchecked=4, and maybe dread=0 + self.failUnlessEqual(dchecked, 0) + self.failUnlessEqual(dread, 4) + self.failUnlessEqual(dc, 0) + self.failUnlessEqual(dr, 4) + d.addCallback(_check4b) + d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives")) def _check5((rc, out, err)): self.failUnlessEqual(err, "") self.failUnlessEqual(rc, 0) self.new_archives = out.split() - self.failUnlessEqual(len(self.new_archives), 2) + self.failUnlessEqual(len(self.new_archives), 3) + # the original backup should still be the oldest (i.e. sorts + # alphabetically towards the beginning) self.failUnlessEqual(sorted(self.new_archives)[0], self.old_archives[0]) d.addCallback(_check5) @@ -701,12 +774,27 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase): self.writeto("empty", "imagine nothing being here") return self.do_cli("backup", source, "tahoe:backups") d.addCallback(_modify) + def _check5a((rc, out, err)): + # second backup should reuse bar.txt (if backupdb is available), + # and upload the rest. None of the directories can be reused. 
+ self.failUnlessEqual(err, "") + self.failUnlessEqual(rc, 0) + if have_bdb: + fu, fr, dc, dr = self.count_output(out) + # new foo.txt, surprise file, subfile, empty + self.failUnlessEqual(fu, 4) + # old bar.txt + self.failUnlessEqual(fr, 1) + # home, parent, subdir, blah.txt, surprisedir + self.failUnlessEqual(dc, 5) + self.failUnlessEqual(dr, 0) + d.addCallback(_check5a) d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives")) def _check6((rc, out, err)): self.failUnlessEqual(err, "") self.failUnlessEqual(rc, 0) self.new_archives = out.split() - self.failUnlessEqual(len(self.new_archives), 3) + self.failUnlessEqual(len(self.new_archives), 4) self.failUnlessEqual(sorted(self.new_archives)[0], self.old_archives[0]) d.addCallback(_check6) @@ -724,5 +812,19 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase): self.failUnlessEqual(out, "foo") d.addCallback(_check8) + d.addCallback(lambda res: + self.do_cli("backup", "--no-backupdb", source, "tahoe:backups")) + def _check9((rc, out, err)): + # --no-backupdb means re-upload everything. We still get to + # re-use the directories, since nothing changed. + self.failUnlessEqual(err, "") + self.failUnlessEqual(rc, 0) + fu, fr, dc, dr = self.count_output(out) + self.failUnlessEqual(fu, 5) + self.failUnlessEqual(fr, 0) + self.failUnlessEqual(dc, 0) + self.failUnlessEqual(dr, 5) + d.addCallback(_check9) + return d -- 2.45.2