From: Brian Warner Date: Thu, 8 May 2008 20:21:14 +0000 (-0700) Subject: dirnode: add a deep_stats(), like deep-size but with more information. webish adds... X-Git-Tag: allmydata-tahoe-1.1.0~145 X-Git-Url: https://git.rkrishnan.org/specifications/%5B/%5D%20/uri/nxhtml.html?a=commitdiff_plain;h=6c00a70dbc06f94acb4910d22e356820c9e5054f;p=tahoe-lafs%2Ftahoe-lafs.git dirnode: add a deep_stats(), like deep-size but with more information. webish adds t=deep-stats too. --- diff --git a/docs/webapi.txt b/docs/webapi.txt index 78b3861a..cebe3e26 100644 --- a/docs/webapi.txt +++ b/docs/webapi.txt @@ -571,6 +571,38 @@ GET $URL?t=deep-size expansion or encoding overhead into account. Later versions of the code may improve this estimate upwards. +GET $URL?t=deep-stats + + Return a JSON-encoded dictionary that lists interesting statistics about + the set of all files and directories reachable from the given directory: + + count-immutable-files: count of how many CHK files are in the set + count-mutable-files: same, for mutable files (does not include directories) + count-literal-files: same, for LIT files (data contained inside the URI) + count-files: sum of the above three + count-directories: count of directories + size-immutable-files: total bytes for all CHK files in the set, =deep-size + size-mutable-files (TODO): same, for current version of all mutable files + size-literal-files: same, for LIT files + size-directories: size of directories (includes size-literal-files) + largest-directory: number of bytes in the largest directory (its child count is reported as largest-directory-children) + largest-immutable-file: number of bytes in the largest CHK file + + size-mutable-files is not implemented, because it would require extra + queries to each mutable file to get their size. This may be implemented in + the future. + + Assuming no sharing, the basic space consumed by a single root directory is + the sum of size-immutable-files, size-mutable-files, and size-directories. 
+ The actual disk space used by the shares is larger, because of the + following sources of overhead: + + integrity data + expansion due to erasure coding + share management data (leases) + backend (ext3) minimum block size + + 6. XMLRPC (coming soon) http://127.0.0.1:8123/xmlrpc diff --git a/src/allmydata/dirnode.py b/src/allmydata/dirnode.py index 96eb8916..d1e3f86e 100644 --- a/src/allmydata/dirnode.py +++ b/src/allmydata/dirnode.py @@ -113,6 +113,8 @@ class NewDirectoryNode: def __init__(self, client): self._client = client + self._most_recent_size = None + def __repr__(self): return "<%s %s %s>" % (self.__class__.__name__, self.is_readonly() and "RO" or "RW", hasattr(self, '_uri') and self._uri.abbrev()) def init_from_uri(self, myuri): @@ -137,8 +139,18 @@ class NewDirectoryNode: self._uri = NewDirectoryURI(IMutableFileURI(self._node.get_uri())) return self + def get_size(self): + # return the size of our backing mutable file, in bytes, if we've + # fetched it. + return self._most_recent_size + + def _set_size(self, data): + self._most_recent_size = len(data) + return data + def _read(self): d = self._node.download_best_version() + d.addCallback(self._set_size) d.addCallback(self._unpack_contents) return d @@ -463,6 +475,76 @@ class NewDirectoryNode: d.addCallback(_got_list) return d + def deep_stats(self): + stats = dict([ (k,0) for k in ["count-immutable-files", + "count-mutable-files", + "count-literal-files", + "count-files", + "count-directories", + "size-immutable-files", + #"size-mutable-files", + "size-literal-files", + "size-directories", + "largest-directory", + "largest-directory-children", + "largest-immutable-file", + #"largest-mutable-file", + ]]) + # we track verifier caps, to avoid double-counting children for which + # we've got both a write-cap and a read-cap + found = set() + found.add(self.get_verifier()) + + limiter = ConcurrencyLimiter(10) + + d = self._add_deepstats_from_node(self, found, stats, limiter) + d.addCallback(lambda res: 
stats) + return d + + def _add_deepstats_from_node(self, node, found, stats, limiter): + d = limiter.add(node.list) + def _got_list(children): + dl = [] + dirsize_bytes = node.get_size() + dirsize_children = len(children) + stats["count-directories"] += 1 + stats["size-directories"] += dirsize_bytes + stats["largest-directory"] = max(stats["largest-directory"], + dirsize_bytes) + stats["largest-directory-children"] = max(stats["largest-directory-children"], + dirsize_children) + for name, (child, metadata) in children.iteritems(): + verifier = child.get_verifier() + if verifier in found: + continue + found.add(verifier) + if IDirectoryNode.providedBy(child): + dl.append(self._add_deepstats_from_node(child, found, + stats, limiter)) + elif IMutableFileNode.providedBy(child): + stats["count-files"] += 1 + stats["count-mutable-files"] += 1 + # TODO: update the servermap, compute a size, add it to + # stats["size-mutable-files"], max it into + # stats["largest-mutable-file"] + elif IFileNode.providedBy(child): # CHK and LIT + stats["count-files"] += 1 + size = child.get_size() + if child.get_uri().startswith("URI:LIT:"): + stats["count-literal-files"] += 1 + stats["size-literal-files"] += size + else: + stats["count-immutable-files"] += 1 + stats["size-immutable-files"] += size + stats["largest-immutable-file"] = max( + stats["largest-immutable-file"], size) + if dl: + return defer.DeferredList(dl) + d.addCallback(_got_list) + return d + + + # use client.create_dirnode() to make one of these diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py index b3dbed39..789ae71a 100644 --- a/src/allmydata/interfaces.py +++ b/src/allmydata/interfaces.py @@ -867,8 +867,42 @@ class IDirectoryNode(IMutableFilesystemNode): operation finishes. 
The child name must be a unicode string.""" def build_manifest(): - """Return a frozenset of verifier-capability strings for all nodes - (directories and files) reachable from this one.""" + """Return a Deferred that fires with a frozenset of + verifier-capability strings for all nodes (directories and files) + reachable from this one.""" + + def deep_stats(): + """Return a Deferred that fires with a dictionary of statistics + computed by examining all nodes (directories and files) reachable + from this one, with the following keys:: + + count-immutable-files: count of how many CHK files are in the set + count-mutable-files: same, for mutable files (does not include + directories) + count-literal-files: same, for LIT files + count-files: sum of the above three + + count-directories: count of directories + + size-immutable-files: total bytes for all CHK files in the set + size-mutable-files (TODO): same, for current version of all mutable + files, does not include directories + size-literal-files: same, for LIT files + size-directories: size of mutable files used by directories + + largest-directory: number of bytes in the largest directory + largest-directory-children: number of children in the largest + directory + largest-immutable-file: number of bytes in the largest CHK file + + size-mutable-files is not yet implemented, because it would involve + even more queries than deep_stats does. + + This operation will visit every directory node underneath this one, + and can take a long time to run. On a typical workstation with good + bandwidth, this can examine roughly 15 directories per second (and + takes several minutes of 100% CPU for ~1700 directories). 
+ """ class ICodecEncoder(Interface): def set_params(data_size, required_shares, max_shares): diff --git a/src/allmydata/test/test_dirnode.py b/src/allmydata/test/test_dirnode.py index 918df62c..83d849d3 100644 --- a/src/allmydata/test/test_dirnode.py +++ b/src/allmydata/test/test_dirnode.py @@ -207,6 +207,29 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin): sorted(self.expected_manifest)) d.addCallback(_check_manifest) + d.addCallback(lambda res: n.deep_stats()) + def _check_deepstats(stats): + self.failUnless(isinstance(stats, dict)) + expected = {"count-immutable-files": 0, + "count-mutable-files": 1, + "count-literal-files": 0, + "count-files": 1, + "count-directories": 2, + "size-immutable-files": 0, + "size-literal-files": 0, + #"size-directories": 616, # varies + #"largest-directory": 616, + "largest-directory-children": 2, + "largest-immutable-file": 0, + } + for k,v in expected.iteritems(): + self.failUnlessEqual(stats[k], v, + "stats[%s] was %s, not %s" % + (k, stats[k], v)) + self.failUnless(stats["size-directories"] > 600) + self.failUnless(stats["largest-directory"] > 600) + d.addCallback(_check_deepstats) + def _add_subsubdir(res): return self.subdir.create_empty_directory(u"subsubdir") d.addCallback(_add_subsubdir) diff --git a/src/allmydata/test/test_web.py b/src/allmydata/test/test_web.py index 0ac61955..8636e56e 100644 --- a/src/allmydata/test/test_web.py +++ b/src/allmydata/test/test_web.py @@ -768,6 +768,29 @@ class Web(WebMixin, unittest.TestCase): d.addCallback(_got) return d + def test_GET_DIRURL_deepstats(self): + d = self.GET(self.public_url + "/foo?t=deep-stats", followRedirect=True) + def _got(stats_json): + stats = simplejson.loads(stats_json) + expected = {"count-immutable-files": 3, + "count-mutable-files": 0, + "count-literal-files": 0, + "count-files": 3, + "count-directories": 3, + "size-immutable-files": 57, + "size-literal-files": 0, + #"size-directories": 1912, # varies + #"largest-directory": 
1590, + "largest-directory-children": 5, + "largest-immutable-file": 19, + } + for k,v in expected.iteritems(): + self.failUnlessEqual(stats[k], v, + "stats[%s] was %s, not %s" % + (k, stats[k], v)) + d.addCallback(_got) + return d + def test_GET_DIRURL_uri(self): d = self.GET(self.public_url + "/foo?t=uri") def _check(res): diff --git a/src/allmydata/webish.py b/src/allmydata/webish.py index 1ce5ec29..92ce7ea1 100644 --- a/src/allmydata/webish.py +++ b/src/allmydata/webish.py @@ -1244,6 +1244,18 @@ class DeepSize(rend.Page): d.addCallback(_measure_size) return d +class DeepStats(rend.Page): + + def __init__(self, dirnode, dirpath): + self._dirnode = dirnode + self._dirpath = dirpath + + def renderHTTP(self, ctx): + inevow.IRequest(ctx).setHeader("content-type", "text/plain") + d = self._dirnode.deep_stats() + d.addCallback(simplejson.dumps, indent=1) + return d + class ChildError: implements(inevow.IResource) def renderHTTP(self, ctx): @@ -1338,6 +1350,8 @@ class VDrive(rend.Page): return Manifest(node, path), () elif t == "deep-size": return DeepSize(node, path), () + elif t == "deep-stats": + return DeepStats(node, path), () elif t == 'rename-form': return RenameForm(self.name, node, path), () else: