From: Brian Warner Date: Fri, 6 Feb 2009 01:17:56 +0000 (-0700) Subject: add sqlite-based backupdb, for #598 and others (including 'tahoe cp'). Not enabled... X-Git-Tag: allmydata-tahoe-1.3.0~77 X-Git-Url: https://git.rkrishnan.org/simplejson/components/%22file:/?a=commitdiff_plain;h=4be2e11d11167baaa8bd5fe0c59642ea3dd9462e;p=tahoe-lafs%2Ftahoe-lafs.git add sqlite-based backupdb, for #598 and others (including 'tahoe cp'). Not enabled yet. --- diff --git a/src/allmydata/scripts/backupdb.py b/src/allmydata/scripts/backupdb.py new file mode 100644 index 00000000..b93270c3 --- /dev/null +++ b/src/allmydata/scripts/backupdb.py @@ -0,0 +1,246 @@ + +# the backupdb is only available if sqlite3 is available. Python-2.5.x and +# beyond include sqlite3 in the standard library. For python-2.4, the +# "pysqlite2" package (which, despite the confusing name, uses sqlite3) must +# be installed. On debian, install python-pysqlite2 + +import os.path, sys, time, random, stat + +DAY = 24*60*60 +MONTH = 30*DAY + +SCHEMA_v1 = """ +CREATE TABLE version +( + version INTEGER -- contains one row, set to 1 +); + +CREATE TABLE local_files +( + path VARCHAR(1024) PRIMARY KEY, -- index, this is os.path.abspath(fn) + size INTEGER, -- os.stat(fn)[stat.ST_SIZE] + mtime NUMBER, -- os.stat(fn)[stat.ST_MTIME] + ctime NUMBER, -- os.stat(fn)[stat.ST_CTIME] + fileid INTEGER +); + +CREATE TABLE caps +( + fileid INTEGER PRIMARY KEY AUTOINCREMENT, + filecap VARCHAR(256) UNIQUE -- URI:CHK:... +); + +CREATE TABLE last_upload +( + fileid INTEGER PRIMARY KEY, + last_uploaded TIMESTAMP, + last_checked TIMESTAMP +); + +""" + +def get_backupdb(dbfile, stderr=sys.stderr): + # open or create the given backupdb file. The parent directory must + # exist. + try: + import sqlite3 + sqlite = sqlite3 # pyflakes whines about 'import sqlite3 as sqlite' .. + except ImportError: + try: + from pysqlite2 import dbapi2 + sqlite = dbapi2 # .. 
when this clause does it too + except ImportError: + print >>stderr, "sqlite unavailable, not using backupdb" + return None + + must_create = not os.path.exists(dbfile) + try: + db = sqlite.connect(dbfile) + except (EnvironmentError, sqlite.OperationalError), e: + print >>stderr, "Unable to create/open backupdb file %s: %s" % (dbfile, e) + return None + + c = db.cursor() + if must_create: + c.executescript(SCHEMA_v1) + c.execute("INSERT INTO version (version) VALUES (1)") + db.commit() + + try: + c.execute("SELECT version FROM version") + version = c.fetchone()[0] + except sqlite.DatabaseError, e: + # this indicates that the file is not a compatible database format. + # Perhaps it was created with an old version, or it might be junk. + print >>stderr, "backupdb file is unusable: %s" % e + return None + + if version == 1: + return BackupDB_v1(sqlite, db) + print >>stderr, "Unable to handle backupdb version %s" % version + return None + +MUST_UPLOAD, ALREADY_UPLOADED = range(2) +class Result: + def __init__(self, bdb, filecap, should_check, + path, mtime, ctime, size): + self.bdb = bdb + self.filecap = filecap + self.should_check_p = should_check + + self.path = path + self.mtime = mtime + self.ctime = ctime + self.size = size + + def was_uploaded(self): + if self.filecap: + return self.filecap + return False + + def did_upload(self, filecap): + self.bdb.did_upload(filecap, + self.path, + self.mtime, self.ctime, self.size) + + def should_check(self): + return self.should_check_p + + def did_check_healthy(self, results): + self.bdb.did_check_healthy(self.filecap, results) + +class BackupDB_v1: + VERSION = 1 + NO_CHECK_BEFORE = 1*MONTH + ALWAYS_CHECK_AFTER = 2*MONTH + + def __init__(self, sqlite_module, connection): + self.sqlite_module = sqlite_module + self.connection = connection + self.cursor = connection.cursor() + + def check_file(self, path, use_timestamps=True): + """I will tell you if a given local file needs to be uploaded or not, + by looking in a database 
and seeing if I have a record of this file + having been uploaded earlier. + + I return a Result object, synchronously. If r.was_uploaded() returns + False, you should upload the file. When you are finished uploading + it, call r.did_upload(filecap), so I can update my database. + + If was_uploaded() returns a filecap, you might be able to avoid an + upload. Call r.should_check(), and if it says False, you can skip the + upload and use the filecap returned by was_uploaded(). + + If should_check() returns True, you should perform a filecheck on the + filecap returned by was_uploaded(). If the check indicates the file + is healthy, please call r.did_check_healthy(checker_results) so I can + update the database. If the check indicates the file is not healthy, + please upload the file and call r.did_upload(filecap) when you're + done. + + If use_timestamps=True (the default), I will compare ctime and mtime + of the local file against an entry in my database, and consider the + file to be unchanged if ctime, mtime, and filesize are all the same + as the earlier version. If use_timestamps=False, I will not trust the + timestamps, so more files (perhaps all) will be marked as needing + upload. A future version of this database may hash the file to make + equality decisions, in which case use_timestamps=False will not + always imply r.was_uploaded()==False. + + 'path' points to a local file on disk, possibly relative to the + current working directory. The database stores absolute pathnames. 
+ """ + + path = os.path.abspath(path) + s = os.stat(path) + size = s[stat.ST_SIZE] + ctime = s[stat.ST_CTIME] + mtime = s[stat.ST_MTIME] + + now = time.time() + c = self.cursor + + c.execute("SELECT size,mtime,ctime,fileid" + " FROM local_files" + " WHERE path=?", + (path,)) + row = self.cursor.fetchone() + if not row: + return Result(self, None, False, path, mtime, ctime, size) + (last_size,last_mtime,last_ctime,last_fileid) = row + + c.execute("SELECT caps.filecap, last_upload.last_checked" + " FROM caps,last_upload" + " WHERE caps.fileid=? AND last_upload.fileid=?", + (last_fileid, last_fileid)) + row2 = c.fetchone() + + if ((last_size != size + or not use_timestamps + or last_mtime != mtime + or last_ctime != ctime) # the file has been changed + or (not row2) # we somehow forgot where we put the file last time + ): + c.execute("DELETE FROM local_files WHERE path=?", (path,)) + self.connection.commit() + return Result(self, None, False, path, mtime, ctime, size) + + # at this point, we're allowed to assume the file hasn't been changed + (filecap, last_checked) = row2 + age = now - last_checked + + probability = ((age - self.NO_CHECK_BEFORE) / + (self.ALWAYS_CHECK_AFTER - self.NO_CHECK_BEFORE)) + probability = min(max(probability, 0.0), 1.0) + should_check = bool(random.random() < probability) + + return Result(self, filecap, should_check, path, mtime, ctime, size) + + def get_or_allocate_fileid_for_cap(self, filecap): + # find an existing fileid for this filecap, or insert a new one. The + # caller is required to commit() afterwards. + + # mysql has "INSERT ... 
ON DUPLICATE KEY UPDATE", but not sqlite + # sqlite has "INSERT ON CONFLICT REPLACE", but not mysql + # So we use INSERT, ignore any error, then a SELECT + c = self.cursor + try: + c.execute("INSERT INTO caps (filecap) VALUES (?)", (filecap,)) + except self.sqlite_module.IntegrityError: + pass + c.execute("SELECT fileid FROM caps WHERE filecap=?", (filecap,)) + foundrow = c.fetchone() + assert foundrow + fileid = foundrow[0] + return fileid + + def did_upload(self, filecap, path, mtime, ctime, size): + now = time.time() + fileid = self.get_or_allocate_fileid_for_cap(filecap) + try: + self.cursor.execute("INSERT INTO last_upload VALUES (?,?,?)", + (fileid, now, now)) + except self.sqlite_module.IntegrityError: + self.cursor.execute("UPDATE last_upload" + " SET last_uploaded=?, last_checked=?" + " WHERE fileid=?", + (now, now, fileid)) + try: + self.cursor.execute("INSERT INTO local_files VALUES (?,?,?,?,?)", + (path, size, mtime, ctime, fileid)) + except self.sqlite_module.IntegrityError: + self.cursor.execute("UPDATE local_files" + " SET size=?, mtime=?, ctime=?, fileid=?" + " WHERE path=?", + (size, mtime, ctime, fileid, path)) + self.connection.commit() + + def did_check_healthy(self, filecap, results): + now = time.time() + fileid = self.get_or_allocate_fileid_for_cap(filecap) + self.cursor.execute("UPDATE last_upload" + " SET last_checked=?" 
+ " WHERE fileid=?", + (now, fileid)) + self.connection.commit() diff --git a/src/allmydata/test/test_backupdb.py b/src/allmydata/test/test_backupdb.py new file mode 100644 index 00000000..3af720b8 --- /dev/null +++ b/src/allmydata/test/test_backupdb.py @@ -0,0 +1,145 @@ + +import os.path, time +from StringIO import StringIO +from twisted.trial import unittest + +from allmydata.util import fileutil +from allmydata.scripts import backupdb + +class BackupDB(unittest.TestCase): + def create_or_skip(self, dbfile): + stderr = StringIO() + bdb = backupdb.get_backupdb(dbfile, stderr=stderr) + if not bdb: + if "sqlite unavailable" in stderr.getvalue(): + raise unittest.SkipTest("sqlite unavailable, skipping test") + return bdb + + def test_basic(self): + self.basedir = basedir = os.path.join("backupdb", "create") + fileutil.make_dirs(basedir) + dbfile = os.path.join(basedir, "dbfile") + bdb = self.create_or_skip(dbfile) + self.failUnless(bdb) + self.failUnlessEqual(bdb.VERSION, 1) + + def test_fail(self): + self.basedir = basedir = os.path.join("backupdb", "fail") + fileutil.make_dirs(basedir) + + # put a non-DB file in the way + self.writeto("not-a-database", "I do not look like a sqlite database") + stderr_f = StringIO() + bdb = backupdb.get_backupdb(os.path.join(basedir, "not-a-database"), + stderr_f) + self.failUnlessEqual(bdb, None) + stderr = stderr_f.getvalue() + if "sqlite unavailable" in stderr: + pass + else: + self.failUnless("backupdb file is unusable" in stderr) + self.failUnless("file is encrypted or is not a database" in stderr) + + # put a directory in the way, to exercise a different error path + where = os.path.join(basedir, "roadblock-dir") + fileutil.make_dirs(where) + stderr_f = StringIO() + bdb = backupdb.get_backupdb(where, stderr_f) + self.failUnlessEqual(bdb, None) + stderr = stderr_f.getvalue() + if "sqlite unavailable" in stderr: + pass + else: + self.failUnless(("Unable to create/open backupdb file %s" % where) + in stderr) + 
self.failUnless("unable to open database file" in stderr) + + + def writeto(self, filename, data): + fn = os.path.join(self.basedir, filename) + parentdir = os.path.dirname(fn) + fileutil.make_dirs(parentdir) + f = open(fn, "w") + f.write(data) + f.close() + return fn + + def test_check(self): + self.basedir = basedir = os.path.join("backupdb", "check") + fileutil.make_dirs(basedir) + dbfile = os.path.join(basedir, "dbfile") + bdb = self.create_or_skip(dbfile) + self.failUnless(bdb) + + foo_fn = self.writeto("foo.txt", "foo.txt") + blah_fn = self.writeto("bar/blah.txt", "blah.txt") + + r = bdb.check_file(foo_fn) + self.failUnlessEqual(r.was_uploaded(), False) + r.did_upload("foo-cap") + + r = bdb.check_file(blah_fn) + self.failUnlessEqual(r.was_uploaded(), False) + r.did_upload("blah-cap") + + r = bdb.check_file(foo_fn) + self.failUnlessEqual(r.was_uploaded(), "foo-cap") + self.failUnlessEqual(r.should_check(), False) + + time.sleep(1.0) # make sure the timestamp changes + self.writeto("foo.txt", "NEW") + + r = bdb.check_file(foo_fn) + self.failUnlessEqual(r.was_uploaded(), False) + r.did_upload("new-cap") + + r = bdb.check_file(foo_fn) + self.failUnlessEqual(r.was_uploaded(), "new-cap") + self.failUnlessEqual(r.should_check(), False) + # if we spontaneously decide to upload it anyways, nothing should + # break + r.did_upload("new-cap") + + r = bdb.check_file(foo_fn, use_timestamps=False) + self.failUnlessEqual(r.was_uploaded(), False) + r.did_upload("new-cap") + + r = bdb.check_file(foo_fn) + self.failUnlessEqual(r.was_uploaded(), "new-cap") + self.failUnlessEqual(r.should_check(), False) + + bdb.NO_CHECK_BEFORE = 0 + bdb.ALWAYS_CHECK_AFTER = 0.1 + + r = bdb.check_file(blah_fn) + self.failUnlessEqual(r.was_uploaded(), "blah-cap") + self.failUnlessEqual(r.should_check(), True) + r.did_check_healthy("results") # we know they're ignored for now + + r = bdb.check_file(blah_fn) + self.failUnlessEqual(r.was_uploaded(), "blah-cap") + 
self.failUnlessEqual(r.should_check(), False) + + os.unlink(os.path.join(basedir, "foo.txt")) + fileutil.make_dirs(os.path.join(basedir, "foo.txt")) # file becomes dir + r = bdb.check_file(foo_fn) + self.failUnlessEqual(r.was_uploaded(), False) + + def test_wrong_version(self): + self.basedir = basedir = os.path.join("backupdb", "wrong_version") + fileutil.make_dirs(basedir) + + where = os.path.join(basedir, "tooold.db") + bdb = self.create_or_skip(where) + # reach into the DB and make it old + bdb.cursor.execute("UPDATE version SET version=0") + bdb.connection.commit() + + # now the next time we open the database, it should be an unusable + # version + stderr_f = StringIO() + bdb = backupdb.get_backupdb(where, stderr_f) + self.failUnlessEqual(bdb, None) + stderr = stderr_f.getvalue() + self.failUnlessEqual(stderr.strip(), + "Unable to handle backupdb version 0")