#! /usr/bin/python

# Provenance: misc/count_dirs.py from tahoe-lafs, introduced by
#   From: Brian Warner
#   Date: Sun, 17 Jun 2007 04:55:13 +0000 (-0700)
#   Subject: misc/count_dirs.py: dev tool to estimate filetree space
#            consumption
#   X-Git-Tag: allmydata-tahoe-0.4.0~33
#   X-Git-Url: https://git.rkrishnan.org/specifications/banana.xhtml?a=commitdiff_plain;h=2cf7cfbe4ab5b0066f1d936654908517a126c916;p=tahoe-lafs%2Ftahoe-lafs.git
#   (new file, mode 100644, blob 0af03148)

"""
This tool estimates how much space would be consumed by a filetree into which
a native directory was copied.

One open question is how we should encode directories. One approach is to put
a block of data on a server, one per directory, which effectively contains a
dictionary that maps child names to targets (URIs for children which are
files, slotnames for children which are directories). To prevent the server
which hosts this data from either learning its contents or corrupting them,
we can add encryption and integrity checks to the data, at the cost of
storage overhead.

This program is intended to estimate the size of these data blocks using
real-world filenames and directories. You point it at a real directory, and
it does a recursive walk of the filesystem, adding up the size of the
filetree data structures that would be required to represent it.

MODES:

 A: no confidentiality or integrity checking. Directories are serialized
    plaintext dictionaries which map file/subdir names to targets (either
    URIs or slotnames). Each entry can be changed independently.
 B1: child names and targets are encrypted. No integrity checks, so the
     server can still corrupt the contents undetectably. Each entry can
     still be changed independently.
 B2: same security properties as B1, but the dictionary is serialized before
     encryption. This reduces overhead at the cost of preventing independent
     updates of entries (all entries must be updated at the same time, so
     test-and-set operations are required to avoid data-losing races)
 C1: like B1, but adding HMACs to each entry to guarantee data integrity
 C2: like B2, but adding a single block-wide HMAC for data integrity

"""

import os.path
import sys

#URI:7jzbza6iwdsk5xbxsvdgjaugyrhetw64zpflp4gihmyh5krjblra====:a5qdejwbimu5b2wfke7xwexxlq======:gzeub5v42rjbgd7ccawnahu2evqd42lpdpzd447c6zkmdvjkpowq====:25:100:219889
# that's a printable representation of two 32-byte hashes (storage index, URI
# extension block hash) and a 16-byte AES read-capability key, and some
# share-count and size information
URI_SIZE = 164

#pb://xextf3eap44o3wi27mf7ehiur6wvhzr6@207.7.153.180:56677,127.0.0.1:56677/zilcw5uz2yyyo===
# that's a FURL which points at the slot. Modes that need to add a
# read-capability AES key will need more space.
SLOTNAME_SIZE = 90

# Modes whose slotnames are 16 bytes larger than SLOTNAME_SIZE (presumably
# the extra read-capability AES key mentioned above).
_LARGER_SLOTNAME_MODES = ("B1", "B2", "C1", "C2")

# mode -> (per_slot, per_entry) overhead in bytes. A mode that is not listed
# here (including "A") carries no overhead at all.
_OVERHEAD = {
    "B1": (0, 16+12+12),
    "C1": (0, 16+12+12 + 32+32),
    "B2": (12, 0),
    "C2": (12+32, 0),
}


def slotsize(mode, numfiles, numdirs):
    """Estimate the bytes one directory slot needs, excluding child names.

    mode: one of the MODES strings; any other value is costed like "A".
    numfiles: number of children of this directory that are files.
    numdirs: number of children of this directory that are directories.

    Returns the sum of the target strings for every child plus the
    per-slot and per-entry overhead of the chosen mode. The lengths of the
    child names themselves are not included; the caller adds those.
    """
    # target_sizes_for_files is the total space taken up by the target
    # strings (the values the child names map to) for all of the targets
    # that are files, instead of directories
    target_sizes_for_files = numfiles * URI_SIZE

    slotname_size = SLOTNAME_SIZE
    if mode in _LARGER_SLOTNAME_MODES:
        slotname_size += 16
    # target_sizes_for_subdirs is the total space taken up by the target
    # strings for all the targets that are directories, instead of files.
    # These are bigger when the read+write-cap slotname is larger than the
    # store-cap, which happens as soon as we seek to prevent the slot's host
    # from reading or corrupting it.
    target_sizes_for_subdirs = numdirs * slotname_size

    # now how much overhead is there for each entry?
    per_slot, per_entry = _OVERHEAD.get(mode, (0, 0))
    num_entries = numfiles + numdirs
    return (target_sizes_for_files
            + target_sizes_for_subdirs
            + per_slot
            + per_entry * num_entries)


MODES = ("A", "B1", "B2", "C1", "C2")


def _measure(root):
    """Walk root and return (num_dirs, num_files, totals).

    totals maps each mode in MODES to the estimated byte count for the whole
    tree: slotsize() for every directory visited plus the summed lengths of
    that directory's child names.
    """
    totals = dict.fromkeys(MODES, 0)
    num_files = 0
    num_dirs = 0
    for _absroot, dirs, files in os.walk(root):
        num_files += len(files)
        num_dirs += len(dirs)
        # Child names are stored in every mode, so their combined length is
        # added on top of the mode-specific slot size. (On Python 3 the
        # names are text, so this counts characters, not encoded bytes.)
        stringsize = (sum(len(name) for name in files)
                      + sum(len(name) for name in dirs))
        for mode in MODES:
            totals[mode] += slotsize(mode, len(files), len(dirs)) + stringsize
    return num_dirs, num_files, totals


def scan(root):
    """Walk the tree under root and print the space estimate for each mode.

    Prints the number of subdirectories and files found by os.walk (root
    itself is not counted as a directory), then one "MODE: N bytes" line per
    mode in sorted order.
    """
    num_dirs, num_files, totals = _measure(root)

    # A single parenthesized argument prints identically whether print is a
    # statement (Python 2) or a function (Python 3).
    print("%d directories" % num_dirs)
    print("%d files" % num_files)
    for mode in sorted(totals):
        print("%s: %d bytes" % (mode, totals[mode]))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s DIRECTORY\n" % sys.argv[0])
        sys.exit(2)
    scan(sys.argv[1])

"""
260:warner@monolith% ./count_dirs.py ~
70925 directories
457199 files
A: 90042361 bytes
B1: 112302121 bytes
B2: 92027061 bytes
C1: 146102057 bytes
C2: 94293461 bytes

"""