From: Brian Warner <warner@allmydata.com>
Date: Fri, 6 Feb 2009 05:07:01 +0000 (-0700)
Subject: #598: add cli+backupdb tests, improve user display, update docs, move docs out of... 
X-Git-Tag: allmydata-tahoe-1.3.0~75
X-Git-Url: https://git.rkrishnan.org/frontends//%22%22?a=commitdiff_plain;h=ca32db5b3992623cdf541c31e7adcead3e449ced;p=tahoe-lafs%2Ftahoe-lafs.git

#598: add cli+backupdb tests, improve user display, update docs, move docs out of proposed/
---

diff --git a/docs/backupdb.txt b/docs/backupdb.txt
new file mode 100644
index 00000000..7c5905e8
--- /dev/null
+++ b/docs/backupdb.txt
@@ -0,0 +1,160 @@
+= The Tahoe BackupDB =
+
+To speed up backup operations, Tahoe maintains a small database known as the
+"backupdb". This is used to avoid re-uploading files which have already been
+uploaded recently.
+
+This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite
+single-file database. It is used by the "tahoe backup" command (unless the
+--no-backupdb option is included). In the future, it will also be used by
+"tahoe mirror", and by "tahoe cp" when the --use-backupdb option is included.
+
+The purpose of this database is specifically to manage the file-to-cap
+translation (the "upload" step). It does not address directory updates. A
+future version will include a directory cache.
+
+The overall goal of optimizing backup is to reduce the work required when the
+source disk has not changed since the last backup. In the ideal case, running
+"tahoe backup" twice in a row, with no intervening changes to the disk, will
+not require any network traffic.
+
+This database is optional. If it is deleted, the worst effect is that a
+subsequent backup operation may use more effort (network bandwidth, CPU
+cycles, and disk IO) than it would have without the backupdb.
+
+The database uses sqlite3, which is included as part of the standard python
+library with python2.5 and later. For python2.4, please install the
+"pysqlite2" package (which, despite the name, actually provides sqlite3
+rather than sqlite2).
+
+== Schema ==
+
+The database contains the following tables:
+
+CREATE TABLE version
+(
+ version integer  -- contains one row, set to 1
+);
+
+CREATE TABLE local_files
+(
+ path  varchar(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn)
+ size  integer,         -- os.stat(fn)[stat.ST_SIZE]
+ mtime number,          -- os.stat(fn)[stat.ST_MTIME]
+ ctime number,          -- os.stat(fn)[stat.ST_CTIME]
+ fileid integer
+);
+
+CREATE TABLE caps
+(
+ fileid integer PRIMARY KEY AUTOINCREMENT,
+ filecap varchar(256) UNIQUE    -- URI:CHK:...
+);
+
+CREATE TABLE last_upload
+(
+ fileid INTEGER PRIMARY KEY,
+ last_uploaded TIMESTAMP,
+ last_checked TIMESTAMP
+);
+
+Notes: if we extend the backupdb to assist with directory maintenance (see
+below), we may need paths in multiple places, so it would make sense to
+create a table for them, and change the local_files table to refer to a
+pathid instead of an absolute path:
+
+CREATE TABLE paths
+(
+ path varchar(1024) UNIQUE,  -- index
+ pathid integer PRIMARY KEY AUTOINCREMENT
+);
+
+== Operation ==
+
+The upload process starts with a pathname (like ~/.emacs) and wants to end up
+with a file-cap (like URI:CHK:...).
+
+The first step is to convert the path to an absolute form
+(/home/warner/.emacs) and do a lookup in the local_files table. If the path is
+not present in this table, the file must be uploaded. The upload process is:
+
+ 1. record the file's size, ctime (inode-change or creation time, per OS), and mtime
+ 2. upload the file into the grid, obtaining an immutable file read-cap
+ 3. add an entry to the 'caps' table, with the read-cap, to get a fileid
+ 4. add an entry to the 'last_upload' table, with the current time
+ 5. add an entry to the 'local_files' table, with the fileid, the path,
+    and the local file's size/ctime/mtime
+
+If the path *is* present in 'local_files', the easy-to-compute identifying
+information is compared: file size and ctime/mtime. If these differ, the file
+must be uploaded. The row is removed from the local_files table, and the
+upload process above is followed.
+
+If the path is present but ctime or mtime differs, the file may have changed.
+If the size differs, then the file has certainly changed. At this point, a
+future version of the "backup" command might hash the file and look for a
+match in an as-yet-defined table, in the hopes that the file has simply been
+moved from somewhere else on the disk. This enhancement requires changes to
+the Tahoe upload API before it can be significantly more efficient than
+simply handing the file to Tahoe and relying upon the normal convergence to
+notice the similarity.
+
+If ctime, mtime, or size is different, the client will upload the file, as
+above.
+
+If these identifiers are the same, the client will assume that the file is
+unchanged (unless the --ignore-timestamps option is provided, in which case
+the client always re-uploads the file), and it may be allowed to skip the
+upload. For safety, however, we require the client periodically perform a
+filecheck on these probably-already-uploaded files, and re-upload anything
+that doesn't look healthy. The client looks the fileid up in the
+'last_upload' table, to see how long it has been since the file was last
+checked.
+
+A "random early check" algorithm should be used, in which a check is
+performed with a probability that increases with the age of the previous
+results. E.g. files that were last checked within a month are not checked,
+files that were checked 5 weeks ago are re-checked with 25% probability, 6
+weeks with 50%, more than 8 weeks are always checked. This reduces the
+"thundering herd" of filechecks-on-everything that would otherwise result
+when a backup operation is run one month after the original backup. If a
+filecheck reveals the file is not healthy, it is re-uploaded.
+
+If the filecheck shows the file is healthy, or if the filecheck was skipped,
+the client gets to skip the upload, and uses the previous filecap (from the
+'caps' table) to add to the parent directory.
+
+If a new file is uploaded, new entries are put in the 'caps' and 'last_upload'
+tables, and an entry is made in the 'local_files' table to reflect the mapping
+from local disk pathname to uploaded filecap. If an old file is re-uploaded,
+the 'last_upload' entry is updated with the new timestamps. If an old file is
+checked and found healthy, the 'last_upload' entry is updated.
+
+Relying upon timestamps is a compromise between efficiency and safety: a file
+which is modified without changing the timestamp or size will be treated as
+unmodified, and the "tahoe backup" command will not copy the new contents
+into the grid. The --ignore-timestamps option can be used to disable this
+optimization, forcing every byte of the file to be hashed and encoded.
+
+== DIRECTORY CACHING ==
+
+A future version of the backupdb will also record a secure hash of the most
+recent contents of each tahoe directory that was used in the last backup run.
+The directories created by the "tahoe backup" command are all read-only, so
+it should be difficult to violate the assumption that these directories are
+unmodified since the previous pass. In the future, Tahoe will provide truly
+immutable directories, making this assumption even more solid.
+
+In the current implementation, when the backup algorithm is faced with the
+decision to either create a new directory or share an old one, it must read
+the contents of the old directory to compare it against the desired new
+contents. This means that a "null backup" (performing a backup when nothing
+has been changed) must still read every Tahoe directory from the previous
+backup.
+
+With a directory-caching backupdb, these directory reads will be bypassed,
+and the null backup will use minimal network bandwidth: one directory read
+and two modifies. The Archives/ directory must be read to locate the latest
+backup, and must be modified to add a new snapshot, and the Latest/ directory
+will be updated to point to that same snapshot.
+
diff --git a/docs/proposed/backupdb.txt b/docs/proposed/backupdb.txt
deleted file mode 100644
index c9618e6d..00000000
--- a/docs/proposed/backupdb.txt
+++ /dev/null
@@ -1,188 +0,0 @@
-= PRELIMINARY =
-
-This document is a description of a feature which is not yet implemented,
-added here to solicit feedback and to describe future plans. This document is
-subject to revision or withdrawal at any moment. Until this notice is
-removed, consider this entire document to be a figment of your imagination.
-
-= The Tahoe BackupDB =
-
-To speed up backup operations, Tahoe maintains a small database known as the
-"backupdb". This is used to avoid re-uploading files which have already been
-uploaded recently.
-
-This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite
-single-file database. It is used by the "tahoe backup" command, and by the
-"tahoe cp" command when the --use-backupdb option is included.
-
-The purpose of this database is specifically to manage the file-to-cap
-translation (the "upload" step). It does not address directory updates.
-
-The overall goal of optimizing backup is to reduce the work required when the
-source disk has not changed since the last backup. In the ideal case, running
-"tahoe backup" twice in a row, with no intervening changes to the disk, will
-not require any network traffic.
-
-This database is optional. If it is deleted, the worst effect is that a
-subsequent backup operation may use more effort (network bandwidth, CPU
-cycles, and disk IO) than it would have without the backupdb.
-
-== Schema ==
-
-The database contains the following tables:
-
-CREATE TABLE version
-(
- version integer  # contains one row, set to 0
-);
-
-CREATE TABLE last_upload
-(
- path  varchar(1024), # index, this is os.path.abspath(fn)
- size  integer,       # os.stat(fn)[stat.ST_SIZE]
- mtime number,        # os.stat(fn)[stat.ST_MTIME]
- fileid integer
-);
-
-CREATE TABLE caps
-(
- fileid integer PRIMARY KEY AUTOINCREMENT,
- filecap varchar(256),        # URI:CHK:...
- last_uploaded timestamp,
- last_checked timestamp
-);
-
-CREATE TABLE keys_to_files
-(
- readkey varchar(256) PRIMARY KEY, # index, AES key portion of filecap
- fileid integer
-);
-
-Notes: if we extend the backupdb to assist with directory maintenance (see
-below), we may need paths in multiple places, so it would make sense to
-create a table for them, and change the last_upload table to refer to a
-pathid instead of an absolute path:
-
-CREATE TABLE paths
-(
- path varchar(1024), # index
- pathid integer PRIMARY KEY AUTOINCREMENT
-);
-
-== Operation ==
-
-The upload process starts with a pathname (like ~/.emacs) and wants to end up
-with a file-cap (like URI:CHK:...).
-
-The first step is to convert the path to an absolute form
-(/home/warner/emacs) and do a lookup in the last_upload table. If the path is
-not present in this table, the file must be uploaded. The upload process is:
-
- 1. record the file's size and modification time
- 2. upload the file into the grid, obtaining an immutable file read-cap
- 3. add an entry to the 'caps' table, with the read-cap, and the current time
- 4. extract the read-key from the read-cap, add an entry to 'keys_to_files'
- 5. add an entry to 'last_upload'
-
-If the path *is* present in 'last_upload', the easy-to-compute identifying
-information is compared: file size and modification time. If these differ,
-the file must be uploaded. The row is removed from the last_upload table, and
-the upload process above is followed.
-
-If the path is present but the mtime differs, the file may have changed. If
-the size differs, then the file has certainly changed. The client will
-compute the CHK read-key for the file by hashing its contents, using exactly
-the same algorithm as the node does when it uploads a file (including
-~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to
-see if this file has been uploaded before: perhaps the file was moved from
-elsewhere on the disk. If no match is found, the file must be uploaded, so
-the upload process above is follwed.
-
-If the read-key *is* found in the 'keys_to_files' table, then the file has
-been uploaded before, but we should consider performing a file check / verify
-operation to make sure we can skip a new upload. The fileid is used to
-retrieve the entry from the 'caps' table, and the last_checked timestamp is
-examined. If this timestamp is too old, a filecheck operation should be
-performed, and the file repaired if the results are not satisfactory. A
-"random early check" algorithm should be used, in which a check is performed
-with a probability that increases with the age of the previous results. E.g.
-files that were last checked within a month are not checked, files that were
-checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%,
-more than 8 weeks are always checked. This reduces the "thundering herd" of
-filechecks-on-everything that would otherwise result when a backup operation
-is run one month after the original backup. The readkey can be submitted to
-the upload operation, to remove a duplicate hashing pass through the file and
-reduce the disk IO. In a future version of the storage server protocol, this
-could also improve the "streamingness" of the upload process.
-
-If the file's size and mtime match, the file is considered to be unmodified,
-and the last_checked timestamp from the 'caps' table is examined as above
-(possibly resulting in a filecheck or repair). The --no-timestamps option
-disables this check: this removes the danger of false-positives (i.e. not
-uploading a new file, because it appeared to be the same as a previously
-uploaded one), but increases the amount of disk IO that must be performed
-(every byte of every file must be hashed to compute the readkey).
-
-This algorithm is summarized in the following pseudocode:
-
-{{{
- def backup(path):
-   abspath = os.path.abspath(path)
-   result = check_for_upload(abspath)
-   now = time.time()
-   if result == MUST_UPLOAD:
-     filecap = upload(abspath, key=result.readkey)
-     fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)",
-                 (filecap, now, now))
-     db("INSERT INTO keys_to_files", (result.readkey, filecap))
-     db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid))
-   if result in (MOVED, ALREADY_UPLOADED):
-     age = now - result.last_checked
-     probability = (age - 1*MONTH) / 1*MONTH
-     probability = min(max(probability, 0.0), 1.0)
-     if random.random() < probability:
-       do_filecheck(result.filecap)
-   if result == MOVED:
-     db("INSERT INTO last_upload",
-        (abspath, current_size, current_mtime, result.fileid))
-
-
- def check_for_upload(abspath):
-   row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s"
-            % abspath)
-   if not row:
-     return check_moved(abspath)
-   current_size = os.stat(abspath)[stat.ST_SIZE]
-   current_mtime = os.stat(abspath)[stat.ST_MTIME]
-   (last_size,last_mtime,last_fileid) = row
-   if file_changed(current_size, last_size, current_mtime, last_mtime):
-     db("DELETE FROM last_upload WHERE fileid=%s" % fileid)
-     return check_moved(abspath)
-   (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
-                                " WHERE fileid == %s" % last_fileid)
-   return ALREADY_UPLOADED(filecap=filecap, last_checked=last_checked)
-
- def file_changed(current_size, last_size, current_mtime, last_mtime):
-   if last_size != current_size:
-     return True
-   if NO_TIMESTAMPS:
-     return True
-   if last_mtime != current_mtime:
-     return True
-   return False
-
- def check_moved(abspath):
-   readkey = hash_with_convergence(abspath)
-   fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s"%readkey)
-   if not fileid:
-     return MUST_UPLOAD(readkey=readkey)
-   (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
-                                " WHERE fileid == %s" % fileid)
-   return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked)
-
- def do_filecheck(filecap):
-   health = check(filecap)
-   if health < DESIRED:
-     repair(filecap)
-
-}}}
diff --git a/src/allmydata/scripts/tahoe_backup.py b/src/allmydata/scripts/tahoe_backup.py
index 205c25ff..6d44ee69 100644
--- a/src/allmydata/scripts/tahoe_backup.py
+++ b/src/allmydata/scripts/tahoe_backup.py
@@ -50,24 +50,6 @@ def parse_old_timestamp(s, options):
     print >>options.stderr, "unable to parse old timestamp '%s', ignoring" % s
     return None
 
-def readdir(dircap, options):
-    # returns a dict of (childname: (type, readcap, metadata)), or None if the
-    # dircap didn't point to a directory
-    url = options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
-    resp = do_http("GET", url)
-    if resp.status != 200:
-        raiseHTTPError("Error during directory GET", resp)
-    jd = simplejson.load(resp)
-    ntype, ndata = jd
-    if ntype != "dirnode":
-        return None
-    contents = {}
-    for (childname, (childtype, childdata)) in ndata["children"].items():
-        contents[childname] = (childtype,
-                               str(childdata["ro_uri"]),
-                               childdata["metadata"])
-    return contents
-
 def get_local_metadata(path):
     metadata = {}
 
@@ -131,100 +113,120 @@ def directory_is_changed(a, b):
                 return True
     return False
 
-def backup(options):
-    nodeurl = options['node-url']
-    from_dir = options.from_dir
-    to_dir = options.to_dir
-    if options['quiet']:
-        verbosity = 0
-    else:
-        verbosity = 2
-    stdin = options.stdin
-    stdout = options.stdout
-    stderr = options.stderr
-
-    use_backupdb = not options["no-backupdb"]
-    options.backupdb = None
-    if use_backupdb:
-        bdbfile = os.path.join(options["node-directory"],
-                               "private", "backupdb.sqlite")
-        bdbfile = os.path.abspath(bdbfile)
-        options.backupdb = backupdb.get_backupdb(bdbfile)
-
-    rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
-    to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
-    if path:
-        to_url += escape_path(path)
-    if not to_url.endswith("/"):
-        to_url += "/"
-
-    archives_url = to_url + "Archives/"
-    latest_url = to_url + "Latest"
-
-    # first step: make sure the target directory exists, as well as the
-    # Archives/ subdirectory.
-    resp = do_http("GET", archives_url + "?t=json")
-    if resp.status == 404:
-        resp = do_http("POST", archives_url + "?t=mkdir")
-        if resp.status != 200:
-            print >>stderr, "Unable to create target directory: %s %s %s" % \
-                  (resp.status, resp.reason, resp.read())
-            return 1
-        archives_dir = {}
-    else:
-        jdata = simplejson.load(resp)
-        (otype, attrs) = jdata
-        archives_dir = attrs["children"]
-
-    # second step: locate the most recent backup in TODIR/Archives/*
-    latest_backup_time = 0
-    latest_backup_name = None
-    latest_backup_dircap = None
-
-    # we have various time formats. The allmydata.com windows backup tool
-    # appears to create things like "2008-11-16 10.34 PM". This script
-    # creates things like "2009-11-16--17.34Z".
-    for archive_name in archives_dir.keys():
-        if archives_dir[archive_name][0] != "dirnode":
-            continue
-        when = parse_old_timestamp(archive_name, options)
-        if when is not None:
-            if when > latest_backup_time:
-                latest_backup_time = when
-                latest_backup_name = archive_name
-                latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
-
-    # third step: process the tree
-    new_backup_dircap = Node().process(options.from_dir,
-                                       latest_backup_dircap,
-                                       options)
-    print >>stdout, "new backup done"
-
-    # fourth: attach the new backup to the list
-    new_readonly_backup_dircap = readonly(new_backup_dircap)
-    now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
-
-    put_child(archives_url, now, new_readonly_backup_dircap)
-    put_child(to_url, "Latest", new_readonly_backup_dircap)
-
-    print >>stdout, "backup done"
-    # done!
-    return 0
-
-
-class Node:
+class BackerUpper:
+    def __init__(self, options):
+        self.options = options
+        self.files_uploaded = 0
+        self.files_reused = 0
+        self.files_checked = 0
+        self.directories_read = 0
+        self.directories_created = 0
+        self.directories_reused = 0
+        self.directories_checked = 0
+
+    def run(self):
+        options = self.options
+        nodeurl = options['node-url']
+        from_dir = options.from_dir
+        to_dir = options.to_dir
+        self.verbosity = 1
+        if options['quiet']:
+            self.verbosity = 0
+        if options['verbose']:
+            self.verbosity = 2
+        stdin = options.stdin
+        stdout = options.stdout
+        stderr = options.stderr
+
+        self.backupdb = None
+        use_backupdb = not options["no-backupdb"]
+        if use_backupdb:
+            bdbfile = os.path.join(options["node-directory"],
+                                   "private", "backupdb.sqlite")
+            bdbfile = os.path.abspath(bdbfile)
+            self.backupdb = backupdb.get_backupdb(bdbfile)
+
+        rootcap, path = get_alias(options.aliases, options.to_dir, DEFAULT_ALIAS)
+        to_url = nodeurl + "uri/%s/" % urllib.quote(rootcap)
+        if path:
+            to_url += escape_path(path)
+        if not to_url.endswith("/"):
+            to_url += "/"
+
+        archives_url = to_url + "Archives/"
+        latest_url = to_url + "Latest"
+
+        # first step: make sure the target directory exists, as well as the
+        # Archives/ subdirectory.
+        resp = do_http("GET", archives_url + "?t=json")
+        if resp.status == 404:
+            resp = do_http("POST", archives_url + "?t=mkdir")
+            if resp.status != 200:
+                print >>stderr, "Unable to create target directory: %s %s %s" % \
+                      (resp.status, resp.reason, resp.read())
+                return 1
+            archives_dir = {}
+        else:
+            jdata = simplejson.load(resp)
+            (otype, attrs) = jdata
+            archives_dir = attrs["children"]
+
+        # second step: locate the most recent backup in TODIR/Archives/*
+        latest_backup_time = 0
+        latest_backup_name = None
+        latest_backup_dircap = None
+
+        # we have various time formats. The allmydata.com windows backup tool
+        # appears to create things like "2008-11-16 10.34 PM". This script
+        # creates things like "2009-11-16--17.34Z".
+        for archive_name in archives_dir.keys():
+            if archives_dir[archive_name][0] != "dirnode":
+                continue
+            when = parse_old_timestamp(archive_name, options)
+            if when is not None:
+                if when > latest_backup_time:
+                    latest_backup_time = when
+                    latest_backup_name = archive_name
+                    latest_backup_dircap = str(archives_dir[archive_name][1]["ro_uri"])
+
+        # third step: process the tree
+        new_backup_dircap = self.process(options.from_dir, latest_backup_dircap)
+
+        # fourth: attach the new backup to the list
+        new_readonly_backup_dircap = readonly(new_backup_dircap)
+        now = time_format.iso_utc(int(time.time()), sep="_") + "Z"
+
+        put_child(archives_url, now, new_readonly_backup_dircap)
+        put_child(to_url, "Latest", new_readonly_backup_dircap)
+
+        if self.verbosity >= 1:
+            print >>stdout, (" %d files uploaded (%d reused), "
+                             "%d directories created (%d reused)"
+                             % (self.files_uploaded,
+                                self.files_reused,
+                                self.directories_created,
+                                self.directories_reused))
+            if self.verbosity >= 2:
+                print >>stdout, (" %d files checked, %d directories checked, "
+                                 "%d directories read"
+                                 % (self.files_checked,
+                                    self.directories_checked,
+                                    self.directories_read))
+            print >>stdout, " backup done"
+        # done!
+        return 0
+
     def verboseprint(self, msg):
-        if self.options["verbose"]:
+        if self.verbosity >= 2:
             print >>self.options.stdout, msg
 
-    def process(self, localpath, olddircap, options):
+    def process(self, localpath, olddircap):
         # returns newdircap
-        self.options = options
 
         self.verboseprint("processing %s, olddircap %s" % (localpath, olddircap))
         olddircontents = {}
         if olddircap:
-            olddircontents = readdir(olddircap, options)
+            olddircontents = self.readdir(olddircap)
 
         newdircontents = {} # childname -> (type, rocap, metadata)
         for child in os.listdir(localpath):
@@ -234,7 +236,8 @@ class Node:
                 oldchildcap = None
                 if olddircontents is not None and child in olddircontents:
                     oldchildcap = olddircontents[child][1]
-                newchilddircap = self.recurse(childpath, oldchildcap)
+                # recurse on the child directory
+                newchilddircap = self.process(childpath, oldchildcap)
                 newdircontents[child] = ("dirnode", newchilddircap, metadata)
             elif os.path.isfile(childpath):
                 newfilecap, metadata = self.upload(childpath)
@@ -248,25 +251,21 @@ class Node:
             ):
             self.verboseprint(" %s not changed, re-using old directory" % localpath)
             # yay! they're identical!
+            self.directories_reused += 1
             return olddircap
         else:
             self.verboseprint(" %s changed, making new directory" % localpath)
             # something changed, or there was no previous directory, so we
             # must make a new directory
-            newdircap = mkdir(newdircontents, options)
+            newdircap = mkdir(newdircontents, self.options)
+            self.directories_created += 1
             return readonly(newdircap)
 
-    def recurse(self, localpath, olddircap):
-        n = self.__class__()
-        return n.process(localpath, olddircap, self.options)
-
-
     def check_backupdb(self, childpath):
-        if not self.options.backupdb:
+        if not self.backupdb:
             return True, None
         use_timestamps = not self.options["ignore-timestamps"]
-        bdb = self.options.backupdb
-        r = bdb.check_file(childpath, use_timestamps)
+        r = self.backupdb.check_file(childpath, use_timestamps)
 
         if not r.was_uploaded():
             return True, r
@@ -281,6 +280,7 @@ class Node:
         self.verboseprint("checking %s" % filecap)
         nodeurl = self.options['node-url']
         checkurl = nodeurl + "uri/%s?t=check&output=JSON" % urllib.quote(filecap)
+        self.files_checked += 1
         resp = do_http("POST", checkurl)
         if resp.status != 200:
             # can't check, so we must assume it's bad
@@ -295,6 +295,25 @@ class Node:
         r.did_check_healthy(cr)
         return False, r
 
+    def readdir(self, dircap):
+        # returns a dict of (childname: (type, readcap, metadata)), or None
+        # if the dircap didn't point to a directory
+        self.directories_read += 1
+        url = self.options['node-url'] + "uri/%s?t=json" % urllib.quote(dircap)
+        resp = do_http("GET", url)
+        if resp.status != 200:
+            raiseHTTPError("Error during directory GET", resp)
+        jd = simplejson.load(resp)
+        ntype, ndata = jd
+        if ntype != "dirnode":
+            return None
+        contents = {}
+        for (childname, (childtype, childdata)) in ndata["children"].items():
+            contents[childname] = (childtype,
+                                   str(childdata["ro_uri"]),
+                                   childdata["metadata"])
+        return contents
+
     def upload(self, childpath):
         #self.verboseprint("uploading %s.." % childpath)
         metadata = get_local_metadata(childpath)
@@ -316,9 +335,14 @@ class Node:
             if bdb_results:
                 bdb_results.did_upload(filecap)
 
+            self.files_uploaded += 1
             return filecap, metadata
 
         else:
             self.verboseprint("skipping %s.." % childpath)
+            self.files_reused += 1
             return bdb_results.was_uploaded(), metadata
 
+def backup(options):
+    bu = BackerUpper(options)
+    return bu.run()
diff --git a/src/allmydata/test/test_cli.py b/src/allmydata/test/test_cli.py
index e3b9948d..ebfd5ed2 100644
--- a/src/allmydata/test/test_cli.py
+++ b/src/allmydata/test/test_cli.py
@@ -5,6 +5,7 @@ from twisted.trial import unittest
 from cStringIO import StringIO
 import urllib
 import time
+import re
 
 from allmydata.util import fileutil, hashutil
 from allmydata import uri
@@ -16,7 +17,7 @@ _hush_pyflakes = [tahoe_ls, tahoe_get, tahoe_put, tahoe_rm, tahoe_cp]
 
 from allmydata.scripts.common import DEFAULT_ALIAS, get_aliases
 
-from allmydata.scripts import cli, debug, runner
+from allmydata.scripts import cli, debug, runner, backupdb
 from allmydata.test.common import SystemTestMixin
 from twisted.internet import threads # CLI tests use deferToThread
 
@@ -627,9 +628,23 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
         f.write(data)
         f.close()
 
+    def count_output(self, out):  # -> [files uploaded, files reused, dirs created, dirs reused]
+        mo = re.search(r"(\d+) files uploaded \((\d+) reused\), (\d+) directories created \((\d+) reused\)", out)
+        return [int(s) for s in mo.groups()]
+
+    def count_output2(self, out):  # -> [files checked, dirs checked, dirs read]
+        mo = re.search(r"(\d+) files checked, (\d+) directories checked, (\d+) directories read", out)
+        return [int(s) for s in mo.groups()]
+
     def test_backup(self):
         self.basedir = os.path.dirname(self.mktemp())
 
+        # is the backupdb available? If so, we test that a second backup does
+        # not create new directories.
+        hush = StringIO()
+        have_bdb = backupdb.get_backupdb(os.path.join(self.basedir, "dbtest"),
+                                         hush)
+
         # create a small local directory with a couple of files
         source = os.path.join(self.basedir, "home")
         fileutil.make_dirs(os.path.join(source, "empty"))
@@ -643,7 +658,15 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
         def _check0((rc, out, err)):
             self.failUnlessEqual(err, "")
             self.failUnlessEqual(rc, 0)
+            fu, fr, dc, dr = self.count_output(out)
+            # foo.txt, bar.txt, blah.txt
+            self.failUnlessEqual(fu, 3)
+            self.failUnlessEqual(fr, 0)
+            # empty, home, home/parent, home/parent/subdir
+            self.failUnlessEqual(dc, 4)
+            self.failUnlessEqual(dr, 0)
         d.addCallback(_check0)
+
         d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups"))
         def _check1((rc, out, err)):
             self.failUnlessEqual(err, "")
@@ -678,12 +701,62 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
 
 
         d.addCallback(lambda res: self.do_cli("backup", source, "tahoe:backups"))
+        def _check4a((rc, out, err)):
+            # second backup should reuse everything, if the backupdb is
+            # available
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            if have_bdb:
+                fu, fr, dc, dr = self.count_output(out)
+                # foo.txt, bar.txt, blah.txt
+                self.failUnlessEqual(fu, 0)
+                self.failUnlessEqual(fr, 3)
+                # empty, home, home/parent, home/parent/subdir
+                self.failUnlessEqual(dc, 0)
+                self.failUnlessEqual(dr, 4)
+        d.addCallback(_check4a)
+
+        if have_bdb:
+            # sneak into the backupdb, crank back the "last checked"
+            # timestamp to force a check on all files
+            def _reset_last_checked(res):
+                dbfile = os.path.join(self.basedir,
+                                      "client0", "private", "backupdb.sqlite")
+                self.failUnless(os.path.exists(dbfile), dbfile)
+                bdb = backupdb.get_backupdb(dbfile)
+                bdb.cursor.execute("UPDATE last_upload SET last_checked=0")
+                bdb.connection.commit()
+
+            d.addCallback(_reset_last_checked)
+
+            d.addCallback(lambda res:
+                          self.do_cli("backup", "--verbose", source, "tahoe:backups"))
+            def _check4b((rc, out, err)):
+                # we should check all files, and re-use all of them. None of
+                # the directories should have been changed.
+                self.failUnlessEqual(err, "")
+                self.failUnlessEqual(rc, 0)
+                fu, fr, dc, dr = self.count_output(out)
+                fchecked, dchecked, dread = self.count_output2(out)
+                self.failUnlessEqual(fchecked, 3)
+                self.failUnlessEqual(fu, 0)
+                self.failUnlessEqual(fr, 3)
+                # TODO: backupdb doesn't do dirs yet; when it does, this will
+                # change to dchecked=4, and maybe dread=0
+                self.failUnlessEqual(dchecked, 0)
+                self.failUnlessEqual(dread, 4)
+                self.failUnlessEqual(dc, 0)
+                self.failUnlessEqual(dr, 4)
+            d.addCallback(_check4b)
+
         d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
         def _check5((rc, out, err)):
             self.failUnlessEqual(err, "")
             self.failUnlessEqual(rc, 0)
             self.new_archives = out.split()
-            self.failUnlessEqual(len(self.new_archives), 2)
+            self.failUnlessEqual(len(self.new_archives), 3)
+            # the original backup should still be the oldest (i.e. sorts
+            # alphabetically towards the beginning)
             self.failUnlessEqual(sorted(self.new_archives)[0],
                                  self.old_archives[0])
         d.addCallback(_check5)
@@ -701,12 +774,27 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
             self.writeto("empty", "imagine nothing being here")
             return self.do_cli("backup", source, "tahoe:backups")
         d.addCallback(_modify)
+        def _check5a((rc, out, err)):
+            # the post-modification backup should reuse bar.txt (if backupdb is
+            # available) and upload the rest. No directories can be reused.
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            if have_bdb:
+                fu, fr, dc, dr = self.count_output(out)
+                # new foo.txt, surprise file, subfile, empty
+                self.failUnlessEqual(fu, 4)
+                # old bar.txt
+                self.failUnlessEqual(fr, 1)
+                # home, parent, subdir, blah.txt, surprisedir
+                self.failUnlessEqual(dc, 5)
+                self.failUnlessEqual(dr, 0)
+        d.addCallback(_check5a)
         d.addCallback(lambda res: self.do_cli("ls", "tahoe:backups/Archives"))
         def _check6((rc, out, err)):
             self.failUnlessEqual(err, "")
             self.failUnlessEqual(rc, 0)
             self.new_archives = out.split()
-            self.failUnlessEqual(len(self.new_archives), 3)
+            self.failUnlessEqual(len(self.new_archives), 4)
             self.failUnlessEqual(sorted(self.new_archives)[0],
                                  self.old_archives[0])
         d.addCallback(_check6)
@@ -724,5 +812,19 @@ class Backup(SystemTestMixin, CLITestMixin, unittest.TestCase):
             self.failUnlessEqual(out, "foo")
         d.addCallback(_check8)
 
+        d.addCallback(lambda res:
+                      self.do_cli("backup", "--no-backupdb", source, "tahoe:backups"))
+        def _check9((rc, out, err)):
+            # --no-backupdb means re-upload everything. We still get to
+            # re-use the directories, since nothing changed.
+            self.failUnlessEqual(err, "")
+            self.failUnlessEqual(rc, 0)
+            fu, fr, dc, dr = self.count_output(out)
+            self.failUnlessEqual(fu, 5)
+            self.failUnlessEqual(fr, 0)
+            self.failUnlessEqual(dc, 0)
+            self.failUnlessEqual(dr, 5)
+        d.addCallback(_check9)
+
         return d