From: Brian Warner Date: Wed, 28 May 2008 23:20:13 +0000 (-0700) Subject: docs/backupdb.txt: preliminary sketch of our plans for the duplicate-upload-avoidance... X-Git-Tag: allmydata-tahoe-1.1.0~78 X-Git-Url: https://git.rkrishnan.org/specifications/index.php?a=commitdiff_plain;h=72d8905c71151a3bfa9b35a8e1133fc6949a473e;p=tahoe-lafs%2Ftahoe-lafs.git docs/backupdb.txt: preliminary sketch of our plans for the duplicate-upload-avoidance database --- diff --git a/docs/backupdb.txt b/docs/backupdb.txt new file mode 100644 index 00000000..c9618e6d --- /dev/null +++ b/docs/backupdb.txt @@ -0,0 +1,188 @@ += PRELIMINARY = + +This document is a description of a feature which is not yet implemented, +added here to solicit feedback and to describe future plans. This document is +subject to revision or withdrawal at any moment. Until this notice is +removed, consider this entire document to be a figment of your imagination. + += The Tahoe BackupDB = + +To speed up backup operations, Tahoe maintains a small database known as the +"backupdb". This is used to avoid re-uploading files which have already been +uploaded recently. + +This database lives in ~/.tahoe/private/backupdb.sqlite, and is a SQLite +single-file database. It is used by the "tahoe backup" command, and by the +"tahoe cp" command when the --use-backupdb option is included. + +The purpose of this database is specifically to manage the file-to-cap +translation (the "upload" step). It does not address directory updates. + +The overall goal of optimizing backup is to reduce the work required when the +source disk has not changed since the last backup. In the ideal case, running +"tahoe backup" twice in a row, with no intervening changes to the disk, will +not require any network traffic. + +This database is optional. If it is deleted, the worst effect is that a +subsequent backup operation may use more effort (network bandwidth, CPU +cycles, and disk IO) than it would have without the backupdb. + +== Schema == + +The database contains the following tables: + +CREATE TABLE version +( + version integer # contains one row, set to 0 +); + +CREATE TABLE last_upload +( + path varchar(1024), # index, this is os.path.abspath(fn) + size integer, # os.stat(fn)[stat.ST_SIZE] + mtime number, # os.stat(fn)[stat.ST_MTIME] + fileid integer +); + +CREATE TABLE caps +( + fileid integer PRIMARY KEY AUTOINCREMENT, + filecap varchar(256), # URI:CHK:... + last_uploaded timestamp, + last_checked timestamp +); + +CREATE TABLE keys_to_files +( + readkey varchar(256) PRIMARY KEY, # index, AES key portion of filecap + fileid integer +); + +Notes: if we extend the backupdb to assist with directory maintenance (see +below), we may need paths in multiple places, so it would make sense to +create a table for them, and change the last_upload table to refer to a +pathid instead of an absolute path: + +CREATE TABLE paths +( + path varchar(1024), # index + pathid integer PRIMARY KEY AUTOINCREMENT +); + +== Operation == + +The upload process starts with a pathname (like ~/.emacs) and wants to end up +with a file-cap (like URI:CHK:...). + +The first step is to convert the path to an absolute form +(/home/warner/emacs) and do a lookup in the last_upload table. If the path is +not present in this table, the file must be uploaded. The upload process is: + + 1. record the file's size and modification time + 2. upload the file into the grid, obtaining an immutable file read-cap + 3. add an entry to the 'caps' table, with the read-cap, and the current time + 4. extract the read-key from the read-cap, add an entry to 'keys_to_files' + 5. add an entry to 'last_upload' + +If the path *is* present in 'last_upload', the easy-to-compute identifying +information is compared: file size and modification time. If these differ, +the file must be uploaded. The row is removed from the last_upload table, and +the upload process above is followed. + +If the path is present but the mtime differs, the file may have changed. If +the size differs, then the file has certainly changed. The client will +compute the CHK read-key for the file by hashing its contents, using exactly +the same algorithm as the node does when it uploads a file (including +~/.tahoe/private/convergence). It then checks the 'keys_to_files' table to +see if this file has been uploaded before: perhaps the file was moved from +elsewhere on the disk. If no match is found, the file must be uploaded, so +the upload process above is follwed. + +If the read-key *is* found in the 'keys_to_files' table, then the file has +been uploaded before, but we should consider performing a file check / verify +operation to make sure we can skip a new upload. The fileid is used to +retrieve the entry from the 'caps' table, and the last_checked timestamp is +examined. If this timestamp is too old, a filecheck operation should be +performed, and the file repaired if the results are not satisfactory. A +"random early check" algorithm should be used, in which a check is performed +with a probability that increases with the age of the previous results. E.g. +files that were last checked within a month are not checked, files that were +checked 5 weeks ago are re-checked with 25% probability, 6 weeks with 50%, +more than 8 weeks are always checked. This reduces the "thundering herd" of +filechecks-on-everything that would otherwise result when a backup operation +is run one month after the original backup. The readkey can be submitted to +the upload operation, to remove a duplicate hashing pass through the file and +reduce the disk IO. In a future version of the storage server protocol, this +could also improve the "streamingness" of the upload process. + +If the file's size and mtime match, the file is considered to be unmodified, +and the last_checked timestamp from the 'caps' table is examined as above +(possibly resulting in a filecheck or repair). The --no-timestamps option +disables this check: this removes the danger of false-positives (i.e. not +uploading a new file, because it appeared to be the same as a previously +uploaded one), but increases the amount of disk IO that must be performed +(every byte of every file must be hashed to compute the readkey). + +This algorithm is summarized in the following pseudocode: + +{{{ + def backup(path): + abspath = os.path.abspath(path) + result = check_for_upload(abspath) + now = time.time() + if result == MUST_UPLOAD: + filecap = upload(abspath, key=result.readkey) + fileid = db("INSERT INTO caps (filecap, last_uploaded, last_checked)", + (filecap, now, now)) + db("INSERT INTO keys_to_files", (result.readkey, filecap)) + db("INSERT INTO last_upload", (abspath,current_size,current_mtime,fileid)) + if result in (MOVED, ALREADY_UPLOADED): + age = now - result.last_checked + probability = (age - 1*MONTH) / 1*MONTH + probability = min(max(probability, 0.0), 1.0) + if random.random() < probability: + do_filecheck(result.filecap) + if result == MOVED: + db("INSERT INTO last_upload", + (abspath, current_size, current_mtime, result.fileid)) + + + def check_for_upload(abspath): + row = db("SELECT (size,mtime,fileid) FROM last_upload WHERE path == %s" + % abspath) + if not row: + return check_moved(abspath) + current_size = os.stat(abspath)[stat.ST_SIZE] + current_mtime = os.stat(abspath)[stat.ST_MTIME] + (last_size,last_mtime,last_fileid) = row + if file_changed(current_size, last_size, current_mtime, last_mtime): + db("DELETE FROM last_upload WHERE fileid=%s" % fileid) + return check_moved(abspath) + (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" + + " WHERE fileid == %s" % last_fileid) + return ALREADY_UPLOADED(filecap=filecap, last_checked=last_checked) + + def file_changed(current_size, last_size, current_mtime, last_mtime): + if last_size != current_size: + return True + if NO_TIMESTAMPS: + return True + if last_mtime != current_mtime: + return True + return False + + def check_moved(abspath): + readkey = hash_with_convergence(abspath) + fileid = db("SELECT (fileid) FROM keys_to_files WHERE readkey == %s"%readkey) + if not fileid: + return MUST_UPLOAD(readkey=readkey) + (filecap, last_checked) = db("SELECT (filecap, last_checked) FROM caps" + + " WHERE fileid == %s" % fileid) + return MOVED(fileid=fileid, filecap=filecap, last_checked=last_checked) + + def do_filecheck(filecap): + health = check(filecap) + if health < DESIRED: + repair(filecap) + +}}}