--- /dev/null
+= The Tahoe BackupDB =
+
+To speed up backup operations, Tahoe maintains a small database known as the
+"backupdb". This is used to avoid re-uploading files which have already been
+uploaded recently.
+
+This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite
+single-file database. It is used by the "tahoe backup" command (unless the
+--no-backupdb option is included). In the future, it will also be used by
+"tahoe mirror", and by "tahoe cp" when the --use-backupdb option is included.
+
+The purpose of this database is specifically to manage the file-to-cap
+translation (the "upload" step). It does not address directory updates. A
+future version will include a directory cache.
+
+The overall goal of optimizing backup is to reduce the work required when the
+source disk has not changed since the last backup. In the ideal case, running
+"tahoe backup" twice in a row, with no intervening changes to the disk, will
+not require any network traffic.
+
+This database is optional. If it is deleted, the worst effect is that a
+subsequent backup operation may use more effort (network bandwidth, CPU
+cycles, and disk IO) than it would have without the backupdb.
+
+The database uses sqlite3, which is included as part of the standard python
+library with python2.5 and later. For python2.4, please install the
+"pysqlite2" package (which, despite the name, actually provides sqlite3
+rather than sqlite2).
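+
+A minimal, illustrative way to obtain a compatible module on either version
+(not necessarily the exact import logic used by Tahoe) is:
+
+  try:
+      import sqlite3                 # python2.5+: in the standard library
+  except ImportError:
+      from pysqlite2 import dbapi2 as sqlite3  # python2.4: "pysqlite2" add-on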
+
+== Schema ==
+
+The database contains the following tables:
+
+CREATE TABLE version
+(
+ version integer                -- contains one row, set to 1
+);
+
+CREATE TABLE local_files
+(
+ path varchar(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn)
+ size integer,                   -- os.stat(fn)[stat.ST_SIZE]
+ mtime number,                   -- os.stat(fn)[stat.ST_MTIME]
+ ctime number,                   -- os.stat(fn)[stat.ST_CTIME]
+ fileid integer
+);
+
+CREATE TABLE caps
+(
+ fileid integer PRIMARY KEY AUTOINCREMENT,
+ filecap varchar(256) UNIQUE -- URI:CHK:...
+);
+
+CREATE TABLE last_upload
+(
+ fileid INTEGER PRIMARY KEY,
+ last_uploaded TIMESTAMP,
+ last_checked TIMESTAMP
+);
+
+Notes: if we extend the backupdb to assist with directory maintenance (see
+below), we may need paths in multiple places, so it would make sense to
+create a table for them, and change the last_upload table to refer to a
+pathid instead of an absolute path:
+
+CREATE TABLE paths
+(
+ path varchar(1024) UNIQUE, -- index
+ pathid integer PRIMARY KEY AUTOINCREMENT
+);
+
+== Operation ==
+
+The upload process starts with a pathname (like ~/.emacs) and wants to end up
+with a file-cap (like URI:CHK:...).
+
+The first step is to convert the path to an absolute form
+(/home/warner/.emacs) and do a lookup in the local_files table. If the path
+is not present in this table, the file must be uploaded. The upload process
+(sketched in code after this list) is:
+
+ 1. record the file's size, ctime, and modification time
+ 2. upload the file into the grid, obtaining an immutable file read-cap
+ 3. add an entry to the 'caps' table, with the read-cap, to get a fileid
+ 4. add an entry to the 'last_upload' table, with the current time
+ 5. add an entry to the 'local_files' table, with the fileid, the path,
+ and the local file's size/ctime/mtime
+
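+The following is a hypothetical sketch of how these five steps map onto the
+schema above, using the sqlite3 module directly. The upload_to_grid() helper
+is a placeholder for whatever performs the actual upload and returns an
+immutable read-cap; it is not a real Tahoe API.
+
+  import os, stat, time
+
+  def record_upload(db, abspath):
+      # step 1: capture the identifying information
+      s = os.stat(abspath)
+      size = s[stat.ST_SIZE]
+      mtime, ctime = s[stat.ST_MTIME], s[stat.ST_CTIME]
+      filecap = upload_to_grid(abspath)     # step 2 (placeholder function)
+      c = db.cursor()
+      # step 3: a real implementation would cope with an already-known filecap
+      c.execute("INSERT INTO caps (filecap) VALUES (?)", (filecap,))
+      fileid = c.lastrowid
+      now = time.time()
+      # step 4
+      c.execute("INSERT INTO last_upload (fileid, last_uploaded, last_checked)"
+                " VALUES (?,?,?)", (fileid, now, now))
+      # step 5
+      c.execute("INSERT INTO local_files (path, size, mtime, ctime, fileid)"
+                " VALUES (?,?,?,?,?)", (abspath, size, mtime, ctime, fileid))
+      db.commit()
+      return filecap
+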
+If the path *is* present in 'local_files', the easy-to-compute identifying
+information is compared: file size and ctime/mtime. If these differ, the file
+must be uploaded. The row is removed from the local_files table, and the
+upload process above is followed.
+
+If the path is present but ctime or mtime differs, the file may have changed.
+If the size differs, then the file has certainly changed. At this point, a
+future version of the "backup" command might hash the file and look for a
+match in an as-yet-undefined table, in the hopes that the file has simply
+been moved from somewhere else on the disk. This enhancement requires
+changes to the Tahoe upload API before it can be significantly more
+efficient than simply handing the file to Tahoe and relying upon the normal
+convergence to notice the similarity.
+
+If ctime, mtime, or size is different, the client will upload the file, as
+above.
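+
+A minimal sketch of this comparison, in the spirit of the pseudocode that
+accompanied earlier drafts of this document (use_timestamps=False corresponds
+to the --ignore-timestamps option):
+
+  def file_changed(current, previous, use_timestamps=True):
+      # current and previous are (size, ctime, mtime) tuples
+      if not use_timestamps:
+          return True               # always treat the file as changed
+      (cur_size, cur_ctime, cur_mtime) = current
+      (old_size, old_ctime, old_mtime) = previous
+      if cur_size != old_size:
+          return True               # size change: certainly modified
+      if cur_ctime != old_ctime or cur_mtime != old_mtime:
+          return True               # timestamp change: possibly modified
+      return False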
+
+If these identifiers are the same, the client will assume that the file is
+unchanged (unless the --ignore-timestamps option is provided, in which case
+the client always re-uploads the file), and it may be allowed to skip the
+upload. For safety, however, we require that the client periodically perform
+a filecheck on these probably-already-uploaded files, and re-upload anything
+that doesn't look healthy. The client looks up the fileid in the
+'last_upload' table, to see how long it has been since the file was last
+checked.
+
+A "random early check" algorithm should be used, in which a check is
+performed with a probability that increases with the age of the previous
+results. E.g. files that were last checked within a month are not checked,
+files that were checked 5 weeks ago are re-checked with 25% probability, 6
+weeks with 50%, more than 8 weeks are always checked. This reduces the
+"thundering herd" of filechecks-on-everything that would otherwise result
+when a backup operation is run one month after the original backup. If a
+filecheck reveals the file is not healthy, it is re-uploaded.
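+
+One plausible implementation of this policy is sketched below. Taking MONTH
+to be four weeks makes the probabilities match the example figures above;
+the exact constants are illustrative, not prescribed.
+
+  import random
+
+  WEEK = 7*24*60*60
+  MONTH = 4*WEEK
+
+  def should_check(now, last_checked):
+      age = now - last_checked
+      # 0% below one month, rising linearly to 100% at two months
+      probability = (age - MONTH) / float(MONTH)
+      probability = min(max(probability, 0.0), 1.0)
+      return random.random() < probability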
+
+If the filecheck shows the file is healthy, or if the filecheck was skipped,
+the client gets to skip the upload, and uses the previous filecap (from the
+'caps' table) to add to the parent directory.
+
+If a new file is uploaded, new entries are put in the 'caps' and 'last_upload'
+tables, and an entry is made in the 'local_files' table to reflect the mapping
+from local disk pathname to uploaded filecap. If an old file is re-uploaded,
+the 'last_upload' entry is updated with the new timestamps. If an old file is
+checked and found healthy, the 'last_upload' entry is updated.
+
+Relying upon timestamps is a compromise between efficiency and safety: a file
+which is modified without changing the timestamp or size will be treated as
+unmodified, and the "tahoe backup" command will not copy the new contents
+into the grid. The --ignore-timestamps option can be used to disable this
+optimization, forcing every byte of the file to be hashed and encoded.
+
+== Directory Caching ==
+
+A future version of the backupdb will also record a secure hash of the most
+recent contents of each tahoe directory that was used in the last backup run.
+The directories created by the "tahoe backup" command are all read-only, so
+it should be difficult to violate the assumption that these directories are
+unmodified since the previous pass. In the future, Tahoe will provide truly
+immutable directories, making this assumption even more solid.
+
+In the current implementation, when the backup algorithm is faced with the
+decision to either create a new directory or share an old one, it must read
+the contents of the old directory to compare it against the desired new
+contents. This means that a "null backup" (performing a backup when nothing
+has been changed) must still read every Tahoe directory from the previous
+backup.
+
+With a directory-caching backupdb, these directory reads will be bypassed,
+and the null backup will use minimal network bandwidth: one directory read
+and two modifies. The Archives/ directory must be read to locate the latest
+backup, and must be modified to add a new snapshot, and the Latest/ directory
+will be updated to point to that same snapshot.
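+
+One way such a cache might work (purely illustrative; the actual table layout
+and hash function have not been specified) is to store a digest of each
+directory's sorted contents, and compare the candidate new contents against
+that digest before deciding whether the old directory can be shared:
+
+  import hashlib
+
+  def dircontents_hash(contents):
+      # contents: dict mapping childname -> (type, readcap, metadata)
+      h = hashlib.sha256()
+      for name in sorted(contents):
+          childtype, readcap = contents[name][0], contents[name][1]
+          h.update("%s %s %s\n" % (name, childtype, readcap))
+      return h.hexdigest()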
+
+++ /dev/null
-= PRELIMINARY =
-
-This document is a description of a feature which is not yet implemented,
-added here to solicit feedback and to describe future plans. This document is
-subject to revision or withdrawal at any moment. Until this notice is
-removed, consider this entire document to be a figment of your imagination.
-
-= The Tahoe BackupDB =
-
-To speed up backup operations, Tahoe maintains a small database known as the
-"backupdb". This is used to avoid re-uploading files which have already been
-uploaded recently.
-
-This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite
-single-file database. It is used by the "tahoe backup" command, and by the
-"tahoe cp" command when the --use-backupdb option is included.
-
-The purpose of this database is specifically to manage the file-to-cap
-translation (the "upload" step). It does not address directory updates.
-
-The overall goal of optimizing backup is to reduce the work required when the
-source disk has not changed since the last backup. In the ideal case, running
-"tahoe backup" twice in a row, with no intervening changes to the disk, will
-not require any network traffic.
-
-This database is optional. If it is deleted, the worst effect is that a
-subsequent backup operation may use more effort (network bandwidth, CPU
-cycles, and disk IO) than it would have without the backupdb.
-
-== Schema ==
-
-The database contains the following tables:
-
-CREATE TABLE version
-(
- version integer # contains one row, set to 0
-);
-
-CREATE TABLE last_upload
-(
- path varchar(1024), # index, this is os.path.abspath(fn)
- size integer, # os.stat(fn)[stat.ST_SIZE]
- mtime number, # os.stat(fn)[stat.ST_MTIME]
- fileid integer
-);
-
-CREATE TABLE caps
-(
- fileid integer PRIMARY KEY AUTOINCREMENT,
- filecap varchar(256), # URI:CHK:...
- last_uploaded timestamp,
- last_checked timestamp
-);
-
-CREATE TABLE keys_to_files
-(
- readkey varchar(256) PRIMARY KEY, # index, AES key portion of filecap
- fileid integer
-);
-
-Notes: if we extend the backupdb to assist with directory maintenance (see
-below), we may need paths in multiple places, so it would make sense to
-create a table for them, and change the last_upload table to refer to a
-pathid instead of an absolute path:
-
-CREATE TABLE paths
-(
- path varchar(1024), # index
- pathid integer PRIMARY KEY AUTOINCREMENT
-);
-
-== Operation ==
-
-The upload process starts with a pathname (like ~/.emacs) and wants to end up
-with a file-cap (like URI:CHK:...).
-
-The first step is to convert the path to an absolute form
-(/home/warner/emacs) and do a lookup in the last_upload table. If the path is
-not present in this table, the file must be uploaded. The upload process is:
-
- 1. record the file's size and modification time
- 2. upload the file into the grid, obtaining an immutable file read-cap
- 3. add an entry to the 'caps' table, with the read-cap, and the current time
- 4. extract the read-key from the read-cap, add an entry to 'keys_to_files'
- 5. add an entry to 'last_upload'
-
-If the path *is* present in 'last_upload', the easy-to-compute identifying
-information is compared: file size and modification time. If these differ,
-the file must be uploaded. The row is removed from the last_upload table, and
-the upload process above is followed.
-
-If the path is present but the mtime differs, the file may have changed. If
-the size differs, then the file has certainly changed. The client will
-compute the CHK read-key for the file by hashing its contents, using exactly
-the same algorithm as the node does when it uploads a file (including
-~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to
-see if this file has been uploaded before: perhaps the file was moved from
-elsewhere on the disk. If no match is found, the file must be uploaded, so
-the upload process above is follwed.
-
-If the read-key *is* found in the 'keys_to_files' table, then the file has
-been uploaded before, but we should consider performing a file check / verify
-operation to make sure we can skip a new upload. The fileid is used to
-retrieve the entry from the 'caps' table, and the last_checked timestamp is
-examined. If this timestamp is too old, a filecheck operation should be
-performed, and the file repaired if the results are not satisfactory. A
-"random early check" algorithm should be used, in which a check is performed
-with a probability that increases with the age of the previous results. E.g.
-files that were last checked within a month are not checked, files that were
-checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%,
-more than 8 weeks are always checked. This reduces the "thundering herd" of
-filechecks-on-everything that would otherwise result when a backup operation
-is run one month after the original backup. The readkey can be submitted to
-the upload operation, to remove a duplicate hashing pass through the file and
-reduce the disk IO. In a future version of the storage server protocol, this
-could also improve the "streamingness" of the upload process.
-
-If the file's size and mtime match, the file is considered to be unmodified,
-and the last_checked timestamp from the 'caps' table is examined as above
-(possibly resulting in a filecheck or repair). The --no-timestamps option
-disables this check: this removes the danger of false-positives (i.e. not
-uploading a new file, because it appeared to be the same as a previously
-uploaded one), but increases the amount of disk IO that must be performed
-(every byte of every file must be hashed to compute the readkey).
-
-This algorithm is summarized in the following pseudocode:
-
-{{{
- def backup(path):
- abspath = os.path.abspath(path)
- result = check_for_upload(abspath)
- now = time.time()
- if result == MUST_UPLOAD:
- filecap = upload(abspath, key=result.readkey)
- fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)",
- (filecap, now, now))
- db("INSERT INTO keys_to_files", (result.readkey, filecap))
- db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid))
- if result in (MOVED, ALREADY_UPLOADED):
- age = now - result.last_checked
- probability = (age - 1*MONTH) / 1*MONTH
- probability = min(max(probability, 0.0), 1.0)
- if random.random() < probability:
- do_filecheck(result.filecap)
- if result == MOVED:
- db("INSERT INTO last_upload",
- (abspath, current_size, current_mtime, result.fileid))
-
-
- def check_for_upload(abspath):
- row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s"
- % abspath)
- if not row:
- return check_moved(abspath)
- current_size = os.stat(abspath)[stat.ST_SIZE]
- current_mtime = os.stat(abspath)[stat.ST_MTIME]
- (last_size,last_mtime,last_fileid) = row
- if file_changed(current_size, last_size, current_mtime, last_mtime):
- db("DELETE FROM last_upload WHERE fileid=%s" % fileid)
- return check_moved(abspath)
- (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
- " WHERE fileid == %s" % last_fileid)
- return ALREADY_UPLOADED(filecap=filecap, last_checked=last_checked)
-
- def file_changed(current_size, last_size, current_mtime, last_mtime):
- if last_size != current_size:
- return True
- if NO_TIMESTAMPS:
- return True
- if last_mtime != current_mtime:
- return True
- return False
-
- def check_moved(abspath):
- readkey = hash_with_convergence(abspath)
- fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s"%readkey)
- if not fileid:
- return MUST_UPLOAD(readkey=readkey)
- (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
- " WHERE fileid == %s" % fileid)
- return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked)
-
- def do_filecheck(filecap):
- health = check(filecap)
- if health < DESIRED:
- repair(filecap)
-
-}}}
print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
return None
-def readdir(dircap, options):
- # returns a dict of (childname: (type, readcap, metadata)), or None if the
- # dircap didn't point to a directory
- url = options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
- resp = do_http("GET", url)
- if resp.status != 200:
- raiseHTTPError("Error during directory GET", resp)
- jd = simplejson.load(resp)
- ntype, ndata = jd
- if ntype != "dirnode":
- return None
- contents = {}
- for (childname, (childtype, childdata)) in ndata["children"].items():
- contents[childname] = (childtype,
- str(childdata["ro_uri"]),
- childdata["metadata"])
- return contents
-
def get_local_metadata(path):
metadata = {}
return True
return False
-def backup(options):
- nodeurl = options['node-url']
- from_dir = options.from_dir
- to_dir = options.to_dir
- if options['quiet']:
- verbosity = 0
- else:
- verbosity = 2
- stdin = options.stdin
- stdout = options.stdout
- stderr = options.stderr
-
- use_backupdb = not options["no-backupdb"]
- options.backupdb = None
- if use_backupdb:
- bdbfile = os.path.join(options["node-directory"],
- "private", "backupdb.sqlite")
- bdbfile = os.path.abspath(bdbfile)
- options.backupdb = backupdb.get_backupdb(bdbfile)
-
- rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
- to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
- if path:
- to_url += escape_path(path)
- if not to_url.endswith("/"):
- to_url += "/"
-
- archives_url = to_url + "Archives/"
- latest_url = to_url + "Latest"
-
- # first step: make sure the target directory exists, as well as the
- # Archives/ subdirectory.
- resp = do_http("GET", archives_url + "?t=json")
- if resp.status == 404:
- resp = do_http("POST", archives_url + "?t=mkdir")
- if resp.status != 200:
- print >>stderr, "Unable to create target directory: %s %s %s" % \
- (resp.status, resp.reason, resp.read())
- return 1
- archives_dir = {}
- else:
- jdata = simplejson.load(resp)
- (otype, attrs) = jdata
- archives_dir = attrs["children"]
-
- # second step: locate the most recent backup in TODIR/Archives/*
- latest_backup_time = 0
- latest_backup_name = None
- latest_backup_dircap = None
-
- # we have various time formats. The allmydata.com windows backup tool
- # appears to create things like "2008-11-16 10.34 PM". This script
- # creates things like "2009-11-16--17.34Z".
- for archive_name in archives_dir.keys():
- if archives_dir[archive_name][0] != "dirnode":
- continue
- when = parse_old_timestamp(archive_name, options)
- if when is not None:
- if when > latest_backup_time:
- latest_backup_time = when
- latest_backup_name = archive_name
- latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
-
- # third step: process the tree
- new_backup_dircap = Node().process(options.from_dir,
- latest_backup_dircap,
- options)
- print >>stdout, "new backup done"
-
- # fourth: attach the new backup to the list
- new_readonly_backup_dircap = readonly(new_backup_dircap)
- now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
-
- put_child(archives_url, now, new_readonly_backup_dircap)
- put_child(to_url, "Latest", new_readonly_backup_dircap)
-
- print >>stdout, "backup done"
- # done!
- return 0
-
-
-class Node:
+class BackerUpper:
+ def __init__(self, options):
+ self.options = options
+ self.files_uploaded = 0
+ self.files_reused = 0
+ self.files_checked = 0
+ self.directories_read = 0
+ self.directories_created = 0
+ self.directories_reused = 0
+ self.directories_checked = 0
+
+ def run(self):
+ options = self.options
+ nodeurl = options['node-url']
+ from_dir = options.from_dir
+ to_dir = options.to_dir
+ self.verbosity = 1
+ if options['quiet']:
+ self.verbosity = 0
+ if options['verbose']:
+ self.verbosity = 2
+ stdin = options.stdin
+ stdout = options.stdout
+ stderr = options.stderr
+
+ self.backupdb = None
+ use_backupdb = not options["no-backupdb"]
+ if use_backupdb:
+ bdbfile = os.path.join(options["node-directory"],
+ "private", "backupdb.sqlite")
+ bdbfile = os.path.abspath(bdbfile)
+ self.backupdb = backupdb.get_backupdb(bdbfile)
+
+ rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
+ to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
+ if path:
+ to_url += escape_path(path)
+ if not to_url.endswith("/"):
+ to_url += "/"
+
+ archives_url = to_url + "Archives/"
+ latest_url = to_url + "Latest"
+
+ # first step: make sure the target directory exists, as well as the
+ # Archives/ subdirectory.
+ resp = do_http("GET", archives_url + "?t=json")
+ if resp.status == 404:
+ resp = do_http("POST", archives_url + "?t=mkdir")
+ if resp.status != 200:
+ print >>stderr, "Unable to create target directory: %s %s %s" % \
+ (resp.status, resp.reason, resp.read())
+ return 1
+ archives_dir = {}
+ else:
+ jdata = simplejson.load(resp)
+ (otype, attrs) = jdata
+ archives_dir = attrs["children"]
+
+ # second step: locate the most recent backup in TODIR/Archives/*
+ latest_backup_time = 0
+ latest_backup_name = None
+ latest_backup_dircap = None
+
+ # we have various time formats. The allmydata.com windows backup tool
+ # appears to create things like "2008-11-16 10.34 PM". This script
+ # creates things like "2009-11-16--17.34Z".
+ for archive_name in archives_dir.keys():
+ if archives_dir[archive_name][0] != "dirnode":
+ continue
+ when = parse_old_timestamp(archive_name, options)
+ if when is not None:
+ if when > latest_backup_time:
+ latest_backup_time = when
+ latest_backup_name = archive_name
+ latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
+
+ # third step: process the tree
+ new_backup_dircap = self.process(options.from_dir, latest_backup_dircap)
+
+ # fourth: attach the new backup to the list
+ new_readonly_backup_dircap = readonly(new_backup_dircap)
+ now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
+
+ put_child(archives_url, now, new_readonly_backup_dircap)
+ put_child(to_url, "Latest", new_readonly_backup_dircap)
+
+ if self.verbosity >= 1:
+ print >>stdout, (" %d files uploaded (%d reused), "
+ "%d directories created (%d reused)"
+ % (self.files_uploaded,
+ self.files_reused,
+ self.directories_created,
+ self.directories_reused))
+ if self.verbosity >= 2:
+ print >>stdout, (" %d files checked, %d directories checked, "
+ "%d directories read"
+ % (self.files_checked,
+ self.directories_checked,
+ self.directories_read))
+ print >>stdout, " backup done"
+ # done!
+ return 0
+
def verboseprint(self, msg):
- if self.options["verbose"]:
+ if self.verbosity >= 2:
print >>self.options.stdout, msg
- def process(self, localpath, olddircap, options):
+ def process(self, localpath, olddircap):
# returns newdircap
- self.options = options
self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap))
olddircontents = {}
if olddircap:
- olddircontents = readdir(olddircap, options)
+ olddircontents = self.readdir(olddircap)
newdircontents = {} # childname -> (type, rocap, metadata)
for child in os.listdir(localpath):
oldchildcap = None
if olddircontents is not None and child in olddircontents:
oldchildcap = olddircontents[child][1]
- newchilddircap = self.recurse(childpath, oldchildcap)
+ # recurse on the child directory
+ newchilddircap = self.process(childpath, oldchildcap)
newdircontents[child] = ("dirnode", newchilddircap, metadata)
elif os.path.isfile(childpath):
newfilecap, metadata = self.upload(childpath)
):
self.verboseprint(" %s not changed, re-using old directory" % localpath)
# yay! they're identical!
+ self.directories_reused += 1
return olddircap
else:
self.verboseprint(" %s changed, making new directory" % localpath)
# something changed, or there was no previous directory, so we
# must make a new directory
- newdircap = mkdir(newdircontents, options)
+ newdircap = mkdir(newdircontents, self.options)
+ self.directories_created += 1
return readonly(newdircap)
- def recurse(self, localpath, olddircap):
- n = self.__class__()
- return n.process(localpath, olddircap, self.options)
-
-
def check_backupdb(self, childpath):
- if not self.options.backupdb:
+ if not self.backupdb:
return True, None
use_timestamps = not self.options["ignore-timestamps"]
- bdb = self.options.backupdb
- r = bdb.check_file(childpath, use_timestamps)
+ r = self.backupdb.check_file(childpath, use_timestamps)
if not r.was_uploaded():
return True, r
self.verboseprint("checking %s" % filecap)
nodeurl = self.options['node-url']
checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap)
+ self.files_checked += 1
resp = do_http("POST", checkurl)
if resp.status != 200:
# can't check, so we must assume it's bad
r.did_check_healthy(cr)
return False, r
+ def readdir(self, dircap):
+ # returns a dict of (childname: (type, readcap, metadata)), or None
+ # if the dircap didn't point to a directory
+ self.directories_read += 1
+ url = self.options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
+ resp = do_http("GET", url)
+ if resp.status != 200:
+ raiseHTTPError("Error during directory GET", resp)
+ jd = simplejson.load(resp)
+ ntype, ndata = jd
+ if ntype != "dirnode":
+ return None
+ contents = {}
+ for (childname, (childtype, childdata)) in ndata["children"].items():
+ contents[childname] = (childtype,
+ str(childdata["ro_uri"]),
+ childdata["metadata"])
+ return contents
+
def upload(self, childpath):
#self.verboseprint("uploading %s.." % childpath)
metadata = get_local_metadata(childpath)
if bdb_results:
bdb_results.did_upload(filecap)
+ self.files_uploaded += 1
return filecap, metadata
else:
self.verboseprint("skipping %s.." % childpath)
+ self.files_reused += 1
return bdb_results.was_uploaded(), metadata
+def backup(options):
+ bu = BackerUpper(options)
+ return bu.run()
from cStringIO import StringIO
import urllib
import time
+import re
from allmydata.util import fileutil, hashutil
from allmydata import uri
from allmydata.scripts.common import DEFAULT_ALIAS, get_aliases
-from allmydata.scripts import cli, debug, runner
+from allmydata.scripts import cli, debug, runner, backupdb
from allmydata.test.common import SystemTestMixin
from twisted.internet import threads # CLI tests use deferToThread
f.write(data)
f.close()
+ def count_output(self, out):
+ mo = re.search(r"(\d+) files uploaded \((\d+) reused\), (\d+) directories created \((\d+) reused\)", out)
+ return [int(s) for s in mo.groups()]
+
+ def count_output2(self, out):
+ mo = re.search(r"(\d+) files checked, (\d+) directories checked, (\d+) directories read", out)
+ return [int(s) for s in mo.groups()]
+
def test_backup(self):
self.basedir = os.path.dirname(self.mktemp())
+ # is the backupdb available? If so, we test that a second backup does
+ # not create new directories.
+ hush = StringIO()
+ have_bdb = backupdb.get_backupdb(os.path.join(self.basedir, "dbtest"),
+ hush)
+
# create a small local directory with a couple of files
source = os.path.join(self.basedir, "home")
fileutil.make_dirs(os.path.join(source, "empty"))
def _check0((rc, out, err)):
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
+ fu, fr, dc, dr = self.count_output(out)
+ # foo.txt, bar.txt, blah.txt
+ self.failUnlessEqual(fu, 3)
+ self.failUnlessEqual(fr, 0)
+ # empty, home, home/parent, home/parent/subdir
+ self.failUnlessEqual(dc, 4)
+ self.failUnlessEqual(dr, 0)
d.addCallback(_check0)
+
d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups"))
def _check1((rc, out, err)):
self.failUnlessEqual(err, "")
d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
+ def _check4a((rc, out, err)):
+ # second backup should reuse everything, if the backupdb is
+ # available
+ self.failUnlessEqual(err, "")
+ self.failUnlessEqual(rc, 0)
+ if have_bdb:
+ fu, fr, dc, dr = self.count_output(out)
+ # foo.txt, bar.txt, blah.txt
+ self.failUnlessEqual(fu, 0)
+ self.failUnlessEqual(fr, 3)
+ # empty, home, home/parent, home/parent/subdir
+ self.failUnlessEqual(dc, 0)
+ self.failUnlessEqual(dr, 4)
+ d.addCallback(_check4a)
+
+ if have_bdb:
+ # sneak into the backupdb, crank back the "last checked"
+ # timestamp to force a check on all files
+ def _reset_last_checked(res):
+ dbfile = os.path.join(self.basedir,
+ "client0", "private", "backupdb.sqlite")
+ self.failUnless(os.path.exists(dbfile), dbfile)
+ bdb = backupdb.get_backupdb(dbfile)
+ bdb.cursor.execute("UPDATE last_upload SET last_checked=0")
+ bdb.connection.commit()
+
+ d.addCallback(_reset_last_checked)
+
+ d.addCallback(lambda res:
+ self.do_cli("backup", "--verbose", source, "tahoe:backups"))
+ def _check4b((rc, out, err)):
+ # we should check all files, and re-use all of them. None of
+ # the directories should have been changed.
+ self.failUnlessEqual(err, "")
+ self.failUnlessEqual(rc, 0)
+ fu, fr, dc, dr = self.count_output(out)
+ fchecked, dchecked, dread = self.count_output2(out)
+ self.failUnlessEqual(fchecked, 3)
+ self.failUnlessEqual(fu, 0)
+ self.failUnlessEqual(fr, 3)
+ # TODO: backupdb doesn't do dirs yet; when it does, this will
+ # change to dchecked=4, and maybe dread=0
+ self.failUnlessEqual(dchecked, 0)
+ self.failUnlessEqual(dread, 4)
+ self.failUnlessEqual(dc, 0)
+ self.failUnlessEqual(dr, 4)
+ d.addCallback(_check4b)
+
d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
def _check5((rc, out, err)):
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
self.new_archives = out.split()
- self.failUnlessEqual(len(self.new_archives), 2)
+ self.failUnlessEqual(len(self.new_archives), 3)
+ # the original backup should still be the oldest (i.e. sorts
+ # alphabetically towards the beginning)
self.failUnlessEqual(sorted(self.new_archives)[0],
self.old_archives[0])
d.addCallback(_check5)
self.writeto("empty", "imagine nothing being here")
return self.do_cli("backup", source, "tahoe:backups")
d.addCallback(_modify)
+ def _check5a((rc, out, err)):
+ # second backup should reuse bar.txt (if backupdb is available),
+ # and upload the rest. None of the directories can be reused.
+ self.failUnlessEqual(err, "")
+ self.failUnlessEqual(rc, 0)
+ if have_bdb:
+ fu, fr, dc, dr = self.count_output(out)
+ # new foo.txt, surprise file, subfile, empty
+ self.failUnlessEqual(fu, 4)
+ # old bar.txt
+ self.failUnlessEqual(fr, 1)
+ # home, parent, subdir, blah.txt, surprisedir
+ self.failUnlessEqual(dc, 5)
+ self.failUnlessEqual(dr, 0)
+ d.addCallback(_check5a)
d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
def _check6((rc, out, err)):
self.failUnlessEqual(err, "")
self.failUnlessEqual(rc, 0)
self.new_archives = out.split()
- self.failUnlessEqual(len(self.new_archives), 3)
+ self.failUnlessEqual(len(self.new_archives), 4)
self.failUnlessEqual(sorted(self.new_archives)[0],
self.old_archives[0])
d.addCallback(_check6)
self.failUnlessEqual(out, "foo")
d.addCallback(_check8)
+ d.addCallback(lambda res:
+ self.do_cli("backup", "--no-backupdb", source, "tahoe:backups"))
+ def _check9((rc, out, err)):
+ # --no-backupdb means re-upload everything. We still get to
+ # re-use the directories, since nothing changed.
+ self.failUnlessEqual(err, "")
+ self.failUnlessEqual(rc, 0)
+ fu, fr, dc, dr = self.count_output(out)
+ self.failUnlessEqual(fu, 5)
+ self.failUnlessEqual(fr, 0)
+ self.failUnlessEqual(dc, 0)
+ self.failUnlessEqual(dr, 5)
+ d.addCallback(_check9)
+
return d