From 9901a08fd30efa875040dd8772e973cd232585a9 Mon Sep 17 00:00:00 2001 From: Brian Warner <warner@lothar.com> Date: Thu, 26 Nov 2009 15:42:57 -0800 Subject: [PATCH] Add dirnodes to backupdb and "tahoe backup", closes #606. * backups now share dirnodes with any previous backup, in any location, so renames and moves are handled very efficiently * "tahoe backup" no longer bothers reading the previous snapshot * if you switch grids, you should delete ~/.tahoe/private/backupdb.sqlite, to force new uploads of all files and directories --- docs/backupdb.txt | 77 +++++---- src/allmydata/scripts/backupdb.py | 174 ++++++++++++++++++--- src/allmydata/scripts/tahoe_backup.py | 217 ++++++++------------------ src/allmydata/test/test_backupdb.py | 81 +++++++++- src/allmydata/test/test_cli.py | 13 +- src/allmydata/util/hashutil.py | 4 + 6 files changed, 351 insertions(+), 215 deletions(-) diff --git a/docs/backupdb.txt b/docs/backupdb.txt index d162f818..140287f7 100644 --- a/docs/backupdb.txt +++ b/docs/backupdb.txt @@ -9,14 +9,15 @@ single-file database. It is used by the "tahoe backup" command. In the future, it will also be used by "tahoe mirror", and by "tahoe cp" when the --use-backupdb option is included. -The purpose of this database is specifically to manage the file-to-cap -translation (the "upload" step). It does not address directory updates. A -future version will include a directory cache. +The purpose of this database is twofold: to manage the file-to-cap +translation (the "upload" step) and the directory-to-cap translation (the +"mkdir-immutable" step). The overall goal of optimizing backup is to reduce the work required when the -source disk has not changed since the last backup. In the ideal case, running -"tahoe backup" twice in a row, with no intervening changes to the disk, will -not require any network traffic. +source disk has not changed (much) since the last backup. 
In the ideal case, +running "tahoe backup" twice in a row, with no intervening changes to the +disk, will not require any network traffic. Minimal changes to the source +disk should result in minimal traffic. This database is optional. If it is deleted, the worst effect is that a subsequent backup operation may use more effort (network bandwidth, CPU @@ -62,18 +63,15 @@ CREATE TABLE last_upload last_checked TIMESTAMP ); -Notes: if we extend the backupdb to assist with directory maintenance (see -below), we may need paths in multiple places, so it would make sense to -create a table for them, and change the last_upload table to refer to a -pathid instead of an absolute path: - -CREATE TABLE paths +CREATE TABLE directories ( - path varchar(1024) UNIQUE, -- index - pathid integer PRIMARY KEY AUTOINCREMENT + dirhash varchar(256) PRIMARY KEY, + dircap varchar(256), + last_uploaded TIMESTAMP, + last_checked TIMESTAMP ); -== Operation == +== Upload Operation == The upload process starts with a pathname (like ~/.emacs) and wants to end up with a file-cap (like URI:CHK:...). @@ -113,7 +111,7 @@ the client always re-uploads the file), and it may be allowed to skip the upload. For safety, however, we require the client periodically perform a filecheck on these probably-already-uploaded files, and re-upload anything that doesn't look healthy. The client looks the fileid up in the -'last_upload' table, to see how long it has been since the file was last +'last_upload' table (in its 'last_checked' column), to see how long it has been since the file was last checked. A "random early check" algorithm should be used, in which a check is @@ -141,25 +139,36 @@ unmodified, and the "tahoe backup" command will not copy the new contents into the grid. The --no-timestamps can be used to disable this optimization, forcing every byte of the file to be hashed and encoded. 
-== DIRECTORY CACHING == - -A future version of the backupdb will also record a secure hash of the most -recent contents of each tahoe directory that was used in the last backup run. -The directories created by the "tahoe backup" command are all read-only, so -it should be difficult to violate the assumption that these directories are -unmodified since the previous pass. In the future, Tahoe will provide truly -immutable directories, making this assumption even more solid. +== Directory Operations == -In the current implementation, when the backup algorithm is faced with the -decision to either create a new directory or share an old one, it must read -the contents of the old directory to compare it against the desired new -contents. This means that a "null backup" (performing a backup when nothing -has been changed) must still read every Tahoe directory from the previous +Once the contents of a directory are known (a filecap for each file, and a +dircap for each directory), the backup process must find or create a tahoe +directory node with the same contents. The contents are hashed, and the hash +is queried in the 'directories' table. If found, the last-checked timestamp +is used to perform the same random-early-check algorithm described for files +above, but no new upload is performed. Since "tahoe backup" creates immutable +directories, it is perfectly safe to re-use a directory from a previous backup. -With a directory-caching backupdb, these directory reads will be bypassed, -and the null backup will use minimal network bandwidth: one directory read -and two modifies. The Archives/ directory must be read to locate the latest -backup, and must be modified to add a new snapshot, and the Latest/ directory -will be updated to point to that same snapshot. +If not found, the webapi "mkdir-immutable" operation is used to create a new +directory, and an entry is stored in the table. 
+ +The comparison operation ignores timestamps and metadata, and pays attention +solely to the file names and contents. + +By using a directory-contents hash, the "tahoe backup" command is able to +re-use directories from other places in the backed up data, or from old +backups. This means that renaming a directory and moving a subdirectory to a +new parent both count as "minor changes" and will result in minimal Tahoe +operations and subsequent network traffic (new directories will be created +for the modified directory and all of its ancestors). It also means that you +can perform a backup ("#1"), delete a file or directory, perform a backup +("#2"), restore it, and then the next backup ("#3") will re-use the +directories from backup #1. + +The best case is a null backup, in which nothing has changed. This will +result in minimal network bandwidth: one directory read and two modifies. The +Archives/ directory must be read to locate the latest backup, and must be +modified to add a new snapshot, and the Latest/ directory will be updated to +point to that same snapshot. diff --git a/src/allmydata/scripts/backupdb.py b/src/allmydata/scripts/backupdb.py index add7c02c..f5e3e073 100644 --- a/src/allmydata/scripts/backupdb.py +++ b/src/allmydata/scripts/backupdb.py @@ -6,17 +6,20 @@ # "package")) must be installed. 
On debian, install python-pysqlite2 import os.path, sys, time, random, stat +from allmydata.util.netstring import netstring +from allmydata.util.hashutil import backupdb_dirhash +from allmydata.util import base32 DAY = 24*60*60 MONTH = 30*DAY SCHEMA_v1 = """ -CREATE TABLE version +CREATE TABLE version -- added in v1 ( - version INTEGER -- contains one row, set to 1 + version INTEGER -- contains one row, set to 2 ); -CREATE TABLE local_files +CREATE TABLE local_files -- added in v1 ( path VARCHAR(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn) size INTEGER, -- os.stat(fn)[stat.ST_SIZE] @@ -25,13 +28,13 @@ CREATE TABLE local_files fileid INTEGER ); -CREATE TABLE caps +CREATE TABLE caps -- added in v1 ( fileid INTEGER PRIMARY KEY AUTOINCREMENT, filecap VARCHAR(256) UNIQUE -- URI:CHK:... ); -CREATE TABLE last_upload +CREATE TABLE last_upload -- added in v1 ( fileid INTEGER PRIMARY KEY, last_uploaded TIMESTAMP, @@ -40,7 +43,27 @@ CREATE TABLE last_upload """ -def get_backupdb(dbfile, stderr=sys.stderr): +TABLE_DIRECTORY = """ + +CREATE TABLE directories -- added in v2 +( + dirhash varchar(256) PRIMARY KEY, -- base32(dirhash) + dircap varchar(256), -- URI:DIR2-CHK:... + last_uploaded TIMESTAMP, + last_checked TIMESTAMP +); + +""" + +SCHEMA_v2 = SCHEMA_v1 + TABLE_DIRECTORY + +UPDATE_v1_to_v2 = TABLE_DIRECTORY + """ +UPDATE version SET version=2; +""" + + +def get_backupdb(dbfile, stderr=sys.stderr, + create_version=(SCHEMA_v2, 2), just_create=False): # open or create the given backupdb file. The parent directory must # exist. 
try: @@ -61,8 +84,9 @@ def get_backupdb(dbfile, stderr=sys.stderr): c = db.cursor() if must_create: - c.executescript(SCHEMA_v1) - c.execute("INSERT INTO version (version) VALUES (1)") + schema, version = create_version + c.executescript(schema) + c.execute("INSERT INTO version (version) VALUES (?)", (version,)) db.commit() try: @@ -74,13 +98,19 @@ def get_backupdb(dbfile, stderr=sys.stderr): print >>stderr, "backupdb file is unusable: %s" % e return None + if just_create: # for tests + return True + if version == 1: - return BackupDB_v1(sqlite, db) + c.executescript(UPDATE_v1_to_v2) + db.commit() + version = 2 + if version == 2: + return BackupDB_v2(sqlite, db) print >>stderr, "Unable to handle backupdb version %s" % version return None -MUST_UPLOAD, ALREADY_UPLOADED = range(2) -class Result: +class FileResult: def __init__(self, bdb, filecap, should_check, path, mtime, ctime, size): self.bdb = bdb @@ -98,18 +128,38 @@ class Result: return False def did_upload(self, filecap): - self.bdb.did_upload(filecap, - self.path, - self.mtime, self.ctime, self.size) + self.bdb.did_upload_file(filecap, self.path, + self.mtime, self.ctime, self.size) def should_check(self): return self.should_check_p def did_check_healthy(self, results): - self.bdb.did_check_healthy(self.filecap, results) + self.bdb.did_check_file_healthy(self.filecap, results) + +class DirectoryResult: + def __init__(self, bdb, dirhash, dircap, should_check): + self.bdb = bdb + self.dircap = dircap + self.should_check_p = should_check + self.dirhash = dirhash + + def was_created(self): + if self.dircap: + return self.dircap + return False + + def did_create(self, dircap): + self.bdb.did_create_directory(dircap, self.dirhash) + + def should_check(self): + return self.should_check_p -class BackupDB_v1: - VERSION = 1 + def did_check_healthy(self, results): + self.bdb.did_check_directory_healthy(self.dircap, results) + +class BackupDB_v2: + VERSION = 2 NO_CHECK_BEFORE = 1*MONTH ALWAYS_CHECK_AFTER = 2*MONTH @@ 
-123,9 +173,10 @@ class BackupDB_v1: by looking in a database and seeing if I have a record of this file having been uploaded earlier. - I return a Results object, synchronously. If r.was_uploaded() returns - False, you should upload the file. When you are finished uploading - it, call r.did_upload(filecap), so I can update my database. + I return a FileResult object, synchronously. If r.was_uploaded() + returns False, you should upload the file. When you are finished + uploading it, call r.did_upload(filecap), so I can update my + database. If was_uploaded() returns a filecap, you might be able to avoid an upload. Call r.should_check(), and if it says False, you can skip the @@ -167,7 +218,7 @@ class BackupDB_v1: (path,)) row = self.cursor.fetchone() if not row: - return Result(self, None, False, path, mtime, ctime, size) + return FileResult(self, None, False, path, mtime, ctime, size) (last_size,last_mtime,last_ctime,last_fileid) = row c.execute("SELECT caps.filecap, last_upload.last_checked" @@ -184,7 +235,7 @@ class BackupDB_v1: ): c.execute("DELETE FROM local_files WHERE path=?", (path,)) self.connection.commit() - return Result(self, None, False, path, mtime, ctime, size) + return FileResult(self, None, False, path, mtime, ctime, size) # at this point, we're allowed to assume the file hasn't been changed (filecap, last_checked) = row2 @@ -195,7 +246,8 @@ class BackupDB_v1: probability = min(max(probability, 0.0), 1.0) should_check = bool(random.random() < probability) - return Result(self, filecap, should_check, path, mtime, ctime, size) + return FileResult(self, str(filecap), should_check, + path, mtime, ctime, size) def get_or_allocate_fileid_for_cap(self, filecap): # find an existing fileid for this filecap, or insert a new one. 
The @@ -217,7 +269,7 @@ class BackupDB_v1: fileid = foundrow[0] return fileid - def did_upload(self, filecap, path, mtime, ctime, size): + def did_upload_file(self, filecap, path, mtime, ctime, size): now = time.time() fileid = self.get_or_allocate_fileid_for_cap(filecap) try: @@ -238,7 +290,7 @@ class BackupDB_v1: (size, mtime, ctime, fileid, path)) self.connection.commit() - def did_check_healthy(self, filecap, results): + def did_check_file_healthy(self, filecap, results): now = time.time() fileid = self.get_or_allocate_fileid_for_cap(filecap) self.cursor.execute("UPDATE last_upload" @@ -246,3 +298,75 @@ class BackupDB_v1: " WHERE fileid=?", (now, fileid)) self.connection.commit() + + def check_directory(self, contents): + """I will tell you if a new directory needs to be created for a given + set of directory contents, or if I know of an existing (immutable) + directory that can be used instead. + + 'contents' should be a dictionary that maps from child name (a single + unicode string) to immutable childcap (filecap or dircap). + + I return a DirectoryResult object, synchronously. If r.was_created() + returns False, you should create the directory (with + t=mkdir-immutable). When you are finished, call r.did_create(dircap) + so I can update my database. + + If was_created() returns a dircap, you might be able to avoid the + mkdir. Call r.should_check(), and if it says False, you can skip the + mkdir and use the dircap returned by was_created(). + + If should_check() returns True, you should perform a check operation + on the dircap returned by was_created(). If the check indicates the + directory is healthy, please call + r.did_check_healthy(checker_results) so I can update the database, + using the de-JSONized response from the webapi t=check call for + 'checker_results'. If the check indicates the directory is not + healthy, please repair or re-create the directory and call + r.did_create(dircap) when you're done. 
+ """ + + now = time.time() + entries = [] + for name in contents: + entries.append( [name.encode("utf-8"), contents[name]] ) + entries.sort() + data = "".join([netstring(name_utf8)+netstring(cap) + for (name_utf8,cap) in entries]) + dirhash = backupdb_dirhash(data) + dirhash_s = base32.b2a(dirhash) + c = self.cursor + c.execute("SELECT dircap, last_checked" + " FROM directories WHERE dirhash=?", (dirhash_s,)) + row = c.fetchone() + if not row: + return DirectoryResult(self, dirhash_s, None, False) + (dircap, last_checked) = row + age = now - last_checked + + probability = ((age - self.NO_CHECK_BEFORE) / + (self.ALWAYS_CHECK_AFTER - self.NO_CHECK_BEFORE)) + probability = min(max(probability, 0.0), 1.0) + should_check = bool(random.random() < probability) + + return DirectoryResult(self, dirhash_s, str(dircap), should_check) + + def did_create_directory(self, dircap, dirhash): + now = time.time() + try: + self.cursor.execute("INSERT INTO directories VALUES (?,?,?,?)", + (dirhash, dircap, now, now)) + except (self.sqlite_module.IntegrityError, + self.sqlite_module.OperationalError): + # dirhash was already added: maybe they did mkdir and called us + # even though we told them they didn't have to + pass + self.connection.commit() + + def did_check_directory_healthy(self, dircap, results): + now = time.time() + self.cursor.execute("UPDATE directories" + " SET last_checked=?" 
+ " WHERE dircap=?", + (now, dircap)) + self.connection.commit() diff --git a/src/allmydata/scripts/tahoe_backup.py b/src/allmydata/scripts/tahoe_backup.py index 145106ee..4f7c70dd 100644 --- a/src/allmydata/scripts/tahoe_backup.py +++ b/src/allmydata/scripts/tahoe_backup.py @@ -6,7 +6,6 @@ import simplejson import datetime from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS from allmydata.scripts.common_http import do_http -from allmydata import uri from allmydata.util import time_format from allmydata.scripts import backupdb @@ -17,53 +16,6 @@ def raiseHTTPError(msg, resp): msg = msg + ": %s %s %s" % (resp.status, resp.reason, resp.read()) raise HTTPError(msg) -def readonly(writedircap): - return uri.from_string_dirnode(writedircap).get_readonly().to_string() - -def parse_old_timestamp(s, options): - try: - if not s.endswith("Z"): - raise ValueError - # This returns seconds-since-epoch for an ISO-8601-ish-formatted UTC - # time string. This might raise ValueError if the string is not in the - # right format. 
- when = time_format.iso_utc_time_to_seconds(s[:-1]) - return when - except ValueError: - pass - - try: - # "2008-11-16 10.34 PM" (localtime) - if s[-3:] in (" AM", " PM"): - # this might raise ValueError - when = time.strptime(s[:-3], "%Y-%m-%d %I.%M") - if s[-3:] == "PM": - when += 12*60*60 - return when - except ValueError: - pass - - try: - # "2008-11-16 10.34.56 PM" (localtime) - if s[-3:] in (" AM", " PM"): - # this might raise ValueError - when = time.strptime(s[:-3], "%Y-%m-%d %I.%M.%S") - if s[-3:] == "PM": - when += 12*60*60 - return when - except ValueError: - pass - - try: - # "2008-12-31 18.21.43" - when = time.strptime(s, "%Y-%m-%d %H.%M.%S") - return when - except ValueError: - pass - - print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s - return None - def get_local_metadata(path): metadata = {} @@ -89,7 +41,7 @@ def mkdir(contents, options): })) for childname in contents ]) - body = simplejson.dumps(kids) + body = simplejson.dumps(kids).encode("utf-8") url = options['node-url'] + "uri?t=mkdir-immutable" resp = do_http("POST", url, body) if resp.status < 200 or resp.status >= 300: @@ -104,26 +56,6 @@ def put_child(dirurl, childname, childcap): if resp.status not in (200, 201): raiseHTTPError("error during put_child", resp) -def directory_is_changed(a, b): - # each is a mapping from childname to (type, cap, metadata) - significant_metadata = ("ctime", "mtime") - # other metadata keys are preserved, but changes to them won't trigger a - # new backup - - if set(a.keys()) != set(b.keys()): - return True - for childname in a: - a_type, a_cap, a_metadata = a[childname] - b_type, b_cap, b_metadata = b[childname] - if a_type != b_type: - return True - if a_cap != b_cap: - return True - for k in significant_metadata: - if a_metadata.get(k) != b_metadata.get(k): - return True - return False - class BackupProcessingError(Exception): pass @@ -133,7 +65,6 @@ class BackerUpper: self.files_uploaded = 0 self.files_reused = 0 
self.files_checked = 0 - self.directories_read = 0 self.directories_created = 0 self.directories_reused = 0 self.directories_checked = 0 @@ -187,33 +118,14 @@ class BackerUpper: (otype, attrs) = jdata archives_dir = attrs["children"] - # second step: locate the most recent backup in TODIR/Archives/* - latest_backup_time = 0 - latest_backup_name = None - latest_backup_dircap = None - - # we have various time formats. The allmydata.com windows backup tool - # appears to create things like "2008-11-16 10.34 PM". This script - # creates things like "2008-11-16--17.34Z". - for archive_name in archives_dir.keys(): - if archives_dir[archive_name][0] != "dirnode": - continue - when = parse_old_timestamp(archive_name, options) - if when is not None: - if when > latest_backup_time: - latest_backup_time = when - latest_backup_name = archive_name - latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"]) - - # third step: process the tree - new_backup_dircap = self.process(options.from_dir, latest_backup_dircap) - - # fourth: attach the new backup to the list - new_readonly_backup_dircap = readonly(new_backup_dircap) + # second step: process the tree + new_backup_dircap = self.process(options.from_dir) + + # third: attach the new backup to the list now = time_format.iso_utc(int(time.time()), sep="_") + "Z" - put_child(archives_url, now, new_readonly_backup_dircap) - put_child(to_url, "Latest", new_readonly_backup_dircap) + put_child(archives_url, now, new_backup_dircap) + put_child(to_url, "Latest", new_backup_dircap) end_timestamp = datetime.datetime.now() # calc elapsed time, omitting microseconds elapsed_time = str(end_timestamp - start_timestamp).split('.')[0] @@ -226,11 +138,9 @@ class BackerUpper: self.directories_created, self.directories_reused)) if self.verbosity >= 2: - print >>stdout, (" %d files checked, %d directories checked, " - "%d directories read" + print >>stdout, (" %d files checked, %d directories checked" % (self.files_checked, - 
self.directories_checked, - self.directories_read)) + self.directories_checked)) print >>stdout, " backup done, elapsed time: %s" % elapsed_time # done! return 0 @@ -239,48 +149,45 @@ class BackerUpper: if self.verbosity >= 2: print >>self.options.stdout, msg - def process(self, localpath, olddircap): + def process(self, localpath): # returns newdircap - self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap)) - olddircontents = {} - if olddircap: - olddircontents = self.readdir(olddircap) - - newdircontents = {} # childname -> (type, rocap, metadata) + self.verboseprint("processing %s" % localpath) + create_contents = {} # childname -> (type, rocap, metadata) + compare_contents = {} # childname -> rocap for child in self.options.filter_listdir(os.listdir(localpath)): childpath = os.path.join(localpath, child) + child = unicode(child) if os.path.isdir(childpath): metadata = get_local_metadata(childpath) - oldchildcap = None - if olddircontents is not None and child in olddircontents: - oldchildcap = olddircontents[child][1] # recurse on the child directory - newchilddircap = self.process(childpath, oldchildcap) - newdircontents[child] = ("dirnode", newchilddircap, metadata) + childcap = self.process(childpath) + assert isinstance(childcap, str) + create_contents[child] = ("dirnode", childcap, metadata) + compare_contents[child] = childcap elif os.path.isfile(childpath): - newfilecap, metadata = self.upload(childpath) - newdircontents[child] = ("filenode", newfilecap, metadata) + childcap, metadata = self.upload(childpath) + assert isinstance(childcap, str) + create_contents[child] = ("filenode", childcap, metadata) + compare_contents[child] = childcap else: - raise BackupProcessingError("Cannot backup this file %r" % childpath) - - if (olddircap - and olddircontents is not None - and not directory_is_changed(newdircontents, olddircontents) - ): - self.verboseprint(" %s not changed, re-using old directory" % localpath) - # yay! they're identical! 
- self.directories_reused += 1 - return olddircap - else: - self.verboseprint(" %s changed, making new directory" % localpath) - # something changed, or there was no previous directory, so we - # must make a new directory - newdircap = mkdir(newdircontents, self.options) + raise BackupProcessingError("Cannot backup child %r" % childpath) + + must_create, r = self.check_backupdb_directory(compare_contents) + if must_create: + self.verboseprint(" creating directory for %s" % localpath) + newdircap = mkdir(create_contents, self.options) + assert isinstance(newdircap, str) + if r: + r.did_create(newdircap) self.directories_created += 1 - return readonly(newdircap) + return newdircap + else: + self.verboseprint(" re-using old directory for %s" % localpath) + self.directories_reused += 1 + return r.was_created() - def check_backupdb(self, childpath): + def check_backupdb_file(self, childpath): if not self.backupdb: return True, None use_timestamps = not self.options["ignore-timestamps"] @@ -314,31 +221,45 @@ class BackerUpper: r.did_check_healthy(cr) return False, r - def readdir(self, dircap): - # returns a dict of (childname: (type, readcap, metadata)), or None - # if the dircap didn't point to a directory - self.directories_read += 1 - url = self.options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap) - resp = do_http("GET", url) + def check_backupdb_directory(self, compare_contents): + if not self.backupdb: + return True, None + r = self.backupdb.check_directory(compare_contents) + + if not r.was_created(): + return True, r + + if not r.should_check(): + # the directory was created or checked recently, so we can just use + # it + return False, r + + # we must check the directory before re-using it + dircap = r.was_created() + self.verboseprint("checking %s" % dircap) + nodeurl = self.options['node-url'] + checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(dircap) + self.directories_checked += 1 + resp = do_http("POST", checkurl) if resp.status != 200: 
- raiseHTTPError("Error during directory GET", resp) - jd = simplejson.load(resp) - ntype, ndata = jd - if ntype != "dirnode": - return None - contents = {} - for (childname, (childtype, childdata)) in ndata["children"].items(): - contents[childname] = (childtype, - str(childdata["ro_uri"]), - childdata["metadata"]) - return contents + # can't check, so we must assume it's bad + return True, r + + cr = simplejson.loads(resp.read()) + healthy = cr["results"]["healthy"] + if not healthy: + # must create + return True, r + # directory is healthy, no need to upload + r.did_check_healthy(cr) + return False, r def upload(self, childpath): #self.verboseprint("uploading %s.." % childpath) metadata = get_local_metadata(childpath) # we can use the backupdb here - must_upload, bdb_results = self.check_backupdb(childpath) + must_upload, bdb_results = self.check_backupdb_file(childpath) if must_upload: self.verboseprint("uploading %s.." % childpath) diff --git a/src/allmydata/test/test_backupdb.py b/src/allmydata/test/test_backupdb.py index 31a3d05a..9cf58065 100644 --- a/src/allmydata/test/test_backupdb.py +++ b/src/allmydata/test/test_backupdb.py @@ -21,7 +21,24 @@ class BackupDB(unittest.TestCase): dbfile = os.path.join(basedir, "dbfile") bdb = self.create_or_skip(dbfile) self.failUnless(bdb) - self.failUnlessEqual(bdb.VERSION, 1) + self.failUnlessEqual(bdb.VERSION, 2) + + def test_upgrade_v1_v2(self): + self.basedir = basedir = os.path.join("backupdb", "upgrade_v1_v2") + fileutil.make_dirs(basedir) + dbfile = os.path.join(basedir, "dbfile") + stderr = StringIO() + created = backupdb.get_backupdb(dbfile, stderr=stderr, + create_version=(backupdb.SCHEMA_v1, 1), + just_create=True) + if not created: + if "I was unable to import a python sqlite library" in stderr.getvalue(): + raise unittest.SkipTest("sqlite unavailable, skipping test") + self.fail("unable to create v1 backupdb") + # now we should have a v1 database on disk + bdb = self.create_or_skip(dbfile) + 
self.failUnless(bdb) + self.failUnlessEqual(bdb.VERSION, 2) def test_fail(self): self.basedir = basedir = os.path.join("backupdb", "fail") @@ -87,6 +104,7 @@ class BackupDB(unittest.TestCase): r = bdb.check_file(foo_fn) self.failUnlessEqual(r.was_uploaded(), "foo-cap") + self.failUnlessEqual(type(r.was_uploaded()), str) self.failUnlessEqual(r.should_check(), False) time.sleep(1.0) # make sure the timestamp changes @@ -149,3 +167,64 @@ class BackupDB(unittest.TestCase): stderr = stderr_f.getvalue() self.failUnlessEqual(stderr.strip(), "Unable to handle backupdb version 0") + + def test_directory(self): + self.basedir = basedir = os.path.join("backupdb", "directory") + fileutil.make_dirs(basedir) + dbfile = os.path.join(basedir, "dbfile") + bdb = self.create_or_skip(dbfile) + self.failUnless(bdb) + + contents = {u"file1": "URI:CHK:blah1", + u"file2": "URI:CHK:blah2", + u"dir1": "URI:DIR2-CHK:baz2"} + r = bdb.check_directory(contents) + self.failUnless(isinstance(r, backupdb.DirectoryResult)) + self.failIf(r.was_created()) + dircap = "URI:DIR2-CHK:foo1" + r.did_create(dircap) + + r = bdb.check_directory(contents) + self.failUnless(r.was_created()) + self.failUnlessEqual(r.was_created(), dircap) + self.failUnlessEqual(r.should_check(), False) + + # if we spontaneously decide to upload it anyways, nothing should + # break + r.did_create(dircap) + r = bdb.check_directory(contents) + self.failUnless(r.was_created()) + self.failUnlessEqual(r.was_created(), dircap) + self.failUnlessEqual(type(r.was_created()), str) + self.failUnlessEqual(r.should_check(), False) + + bdb.NO_CHECK_BEFORE = 0 + bdb.ALWAYS_CHECK_AFTER = 0.1 + time.sleep(1.0) + + r = bdb.check_directory(contents) + self.failUnless(r.was_created()) + self.failUnlessEqual(r.was_created(), dircap) + self.failUnlessEqual(r.should_check(), True) + r.did_check_healthy("results") + + bdb.NO_CHECK_BEFORE = 200 + bdb.ALWAYS_CHECK_AFTER = 400 + + r = bdb.check_directory(contents) + self.failUnless(r.was_created()) + 
self.failUnlessEqual(r.was_created(), dircap) + self.failUnlessEqual(r.should_check(), False) + + + contents2 = {u"file1": "URI:CHK:blah1", + u"dir1": "URI:DIR2-CHK:baz2"} + r = bdb.check_directory(contents2) + self.failIf(r.was_created()) + + contents3 = {u"file1": "URI:CHK:blah1", + u"file2": "URI:CHK:blah3", + u"dir1": "URI:DIR2-CHK:baz2"} + r = bdb.check_directory(contents3) + self.failIf(r.was_created()) + diff --git a/src/allmydata/test/test_cli.py b/src/allmydata/test/test_cli.py index 58754b88..b88ed4c0 100644 --- a/src/allmydata/test/test_cli.py +++ b/src/allmydata/test/test_cli.py @@ -948,7 +948,7 @@ class Backup(GridTestMixin, CLITestMixin, StallMixin, unittest.TestCase): return [int(s) for s in mo.groups()] def count_output2(self, out): - mo = re.search(r"(\d)+ files checked, (\d+) directories checked, (\d+) directories read", out) + mo = re.search(r"(\d)+ files checked, (\d+) directories checked", out) return [int(s) for s in mo.groups()] def test_backup(self): @@ -1062,6 +1062,7 @@ class Backup(GridTestMixin, CLITestMixin, StallMixin, unittest.TestCase): self.failUnless(os.path.exists(dbfile), dbfile) bdb = backupdb.get_backupdb(dbfile) bdb.cursor.execute("UPDATE last_upload SET last_checked=0") + bdb.cursor.execute("UPDATE directories SET last_checked=0") bdb.connection.commit() d.addCallback(_reset_last_checked) @@ -1070,18 +1071,16 @@ class Backup(GridTestMixin, CLITestMixin, StallMixin, unittest.TestCase): d.addCallback(lambda res: do_backup(verbose=True)) def _check4b((rc, out, err)): # we should check all files, and re-use all of them. None of - # the directories should have been changed. + # the directories should have been changed, so we should + # re-use all of them too. 
self.failUnlessEqual(err, "") self.failUnlessEqual(rc, 0) fu, fr, dc, dr = self.count_output(out) - fchecked, dchecked, dread = self.count_output2(out) + fchecked, dchecked = self.count_output2(out) self.failUnlessEqual(fchecked, 3) self.failUnlessEqual(fu, 0) self.failUnlessEqual(fr, 3) - # TODO: backupdb doesn't do dirs yet; when it does, this will - # change to dchecked=4, and maybe dread=0 - self.failUnlessEqual(dchecked, 0) - self.failUnlessEqual(dread, 4) + self.failUnlessEqual(dchecked, 4) self.failUnlessEqual(dc, 0) self.failUnlessEqual(dr, 4) d.addCallback(_check4b) diff --git a/src/allmydata/util/hashutil.py b/src/allmydata/util/hashutil.py index 883f27c6..2b40a97c 100644 --- a/src/allmydata/util/hashutil.py +++ b/src/allmydata/util/hashutil.py @@ -197,3 +197,7 @@ def ssk_storage_index_hash(readkey): def constant_time_compare(a, b): n = os.urandom(8) return bool(tagged_hash(n, a) == tagged_hash(n, b)) + +BACKUPDB_DIRHASH_TAG = "allmydata_backupdb_dirhash_v1" +def backupdb_dirhash(contents): + return tagged_hash(BACKUPDB_DIRHASH_TAG, contents) -- 2.45.2