From: Brian Warner <warner@allmydata.com>
Date: Wed, 28 May 2008 23:20:13 +0000 (-0700)
Subject: docs/backupdb.txt: preliminary sketch of our plans for the duplicate-upload-avoidance... 
X-Git-Tag: allmydata-tahoe-1.1.0~78
X-Git-Url: https://git.rkrishnan.org/components/com_hotproperty/css/rgr-080307.php?a=commitdiff_plain;h=72d8905c71151a3bfa9b35a8e1133fc6949a473e;p=tahoe-lafs%2Ftahoe-lafs.git

docs/backupdb.txt: preliminary sketch of our plans for the duplicate-upload-avoidance database
---

diff --git a/docs/backupdb.txt b/docs/backupdb.txt
new file mode 100644
index 00000000..c9618e6d
--- /dev/null
+++ b/docs/backupdb.txt
@@ -0,0 +1,188 @@
+= PRELIMINARY =
+
+This document is a description of a feature which is not yet implemented,
+added here to solicit feedback and to describe future plans. This document is
+subject to revision or withdrawal at any moment. Until this notice is
+removed, consider this entire document to be a figment of your imagination.
+
+= The Tahoe BackupDB =
+
+To speed up backup operations, Tahoe maintains a small database known as the
+"backupdb". This is used to avoid re-uploading files which have already been
+uploaded recently.
+
+This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite
+single-file database. It is used by the "tahoe backup" command, and by the
+"tahoe cp" command when the --use-backupdb option is included.
+
+The purpose of this database is specifically to manage the file-to-cap
+translation (the "upload" step). It does not address directory updates.
+
+The overall goal of optimizing backup is to reduce the work required when the
+source disk has not changed since the last backup. In the ideal case, running
+"tahoe backup" twice in a row, with no intervening changes to the disk, will
+not require any network traffic.
+
+This database is optional. If it is deleted, the worst effect is that a
+subsequent backup operation may use more effort (network bandwidth, CPU
+cycles, and disk IO) than it would have without the backupdb.
+
+== Schema ==
+
+The database contains the following tables:
+
+CREATE TABLE version
+(
+ version integer  # contains one row, set to 0
+);
+
+CREATE TABLE last_upload
+(
+ path  varchar(1024), # index, this is os.path.abspath(fn)
+ size  integer,       # os.stat(fn)[stat.ST_SIZE]
+ mtime number,        # os.stat(fn)[stat.ST_MTIME]
+ fileid integer
+);
+
+CREATE TABLE caps
+(
+ fileid integer PRIMARY KEY AUTOINCREMENT,
+ filecap varchar(256),        # URI:CHK:...
+ last_uploaded timestamp,
+ last_checked timestamp
+);
+
+CREATE TABLE keys_to_files
+(
+ readkey varchar(256) PRIMARY KEY, # index, AES key portion of filecap
+ fileid integer
+);
+
+Notes: if we extend the backupdb to assist with directory maintenance (see
+below), we may need paths in multiple places, so it would make sense to
+create a table for them, and change the last_upload table to refer to a
+pathid instead of an absolute path:
+
+CREATE TABLE paths
+(
+ path varchar(1024), # index
+ pathid integer PRIMARY KEY AUTOINCREMENT
+);
+
+== Operation ==
+
+The upload process starts with a pathname (like ~/.emacs) and wants to end up
+with a file-cap (like URI:CHK:...).
+
+The first step is to convert the path to an absolute form
+(/home/warner/emacs) and do a lookup in the last_upload table. If the path is
+not present in this table, the file must be uploaded. The upload process is:
+
+ 1. record the file's size and modification time
+ 2. upload the file into the grid, obtaining an immutable file read-cap
+ 3. add an entry to the 'caps' table, with the read-cap, and the current time
+ 4. extract the read-key from the read-cap, add an entry to 'keys_to_files'
+ 5. add an entry to 'last_upload'
+
+If the path *is* present in 'last_upload', the easy-to-compute identifying
+information is compared: file size and modification time. If these differ,
+the file must be uploaded. The row is removed from the last_upload table, and
+the upload process above is followed.
+
+If the path is present but the mtime differs, the file may have changed. If
+the size differs, then the file has certainly changed. The client will
+compute the CHK read-key for the file by hashing its contents, using exactly
+the same algorithm as the node does when it uploads a file (including
+~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to
+see if this file has been uploaded before: perhaps the file was moved from
+elsewhere on the disk. If no match is found, the file must be uploaded, so
+the upload process above is follwed.
+
+If the read-key *is* found in the 'keys_to_files' table, then the file has
+been uploaded before, but we should consider performing a file check / verify
+operation to make sure we can skip a new upload. The fileid is used to
+retrieve the entry from the 'caps' table, and the last_checked timestamp is
+examined. If this timestamp is too old, a filecheck operation should be
+performed, and the file repaired if the results are not satisfactory. A
+"random early check" algorithm should be used, in which a check is performed
+with a probability that increases with the age of the previous results. E.g.
+files that were last checked within a month are not checked, files that were
+checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%,
+more than 8 weeks are always checked. This reduces the "thundering herd" of
+filechecks-on-everything that would otherwise result when a backup operation
+is run one month after the original backup. The readkey can be submitted to
+the upload operation, to remove a duplicate hashing pass through the file and
+reduce the disk IO. In a future version of the storage server protocol, this
+could also improve the "streamingness" of the upload process.
+
+If the file's size and mtime match, the file is considered to be unmodified,
+and the last_checked timestamp from the 'caps' table is examined as above
+(possibly resulting in a filecheck or repair). The --no-timestamps option
+disables this check: this removes the danger of false-positives (i.e. not
+uploading a new file, because it appeared to be the same as a previously
+uploaded one), but increases the amount of disk IO that must be performed
+(every byte of every file must be hashed to compute the readkey).
+
+This algorithm is summarized in the following pseudocode:
+
+{{{
+ def backup(path):
+   abspath = os.path.abspath(path)
+   result = check_for_upload(abspath)
+   now = time.time()
+   if result == MUST_UPLOAD:
+     filecap = upload(abspath, key=result.readkey)
+     fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)",
+                 (filecap, now, now))
+     db("INSERT INTO keys_to_files", (result.readkey, filecap))
+     db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid))
+   if result in (MOVED, ALREADY_UPLOADED):
+     age = now - result.last_checked
+     probability = (age - 1*MONTH) / 1*MONTH
+     probability = min(max(probability, 0.0), 1.0)
+     if random.random() < probability:
+       do_filecheck(result.filecap)
+   if result == MOVED:
+     db("INSERT INTO last_upload",
+        (abspath, current_size, current_mtime, result.fileid))
+
+
+ def check_for_upload(abspath):
+   row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s"
+            % abspath)
+   if not row:
+     return check_moved(abspath)
+   current_size = os.stat(abspath)[stat.ST_SIZE]
+   current_mtime = os.stat(abspath)[stat.ST_MTIME]
+   (last_size,last_mtime,last_fileid) = row
+   if file_changed(current_size, last_size, current_mtime, last_mtime):
+     db("DELETE FROM last_upload WHERE fileid=%s" % fileid)
+     return check_moved(abspath)
+   (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
+                                " WHERE fileid == %s" % last_fileid)
+   return ALREADY_UPLOADED(filecap=filecap, last_checked=last_checked)
+
+ def file_changed(current_size, last_size, current_mtime, last_mtime):
+   if last_size != current_size:
+     return True
+   if NO_TIMESTAMPS:
+     return True
+   if last_mtime != current_mtime:
+     return True
+   return False
+
+ def check_moved(abspath):
+   readkey = hash_with_convergence(abspath)
+   fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s"%readkey)
+   if not fileid:
+     return MUST_UPLOAD(readkey=readkey)
+   (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" +
+                                " WHERE fileid == %s" % fileid)
+   return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked)
+
+ def do_filecheck(filecap):
+   health = check(filecap)
+   if health < DESIRED:
+     repair(filecap)
+
+}}}