From: Brian Warner Date: Fri, 9 Jan 2009 02:59:32 +0000 (-0700) Subject: webapi/deep-manifest t=JSON: don't return the (large) manifest/SI/verifycap lists... X-Git-Url: https://git.rkrishnan.org/components/(%5B%5E?a=commitdiff_plain;h=7ee336b274c311e0e7af9c5444e2bd1b3249514f;p=tahoe-lafs%2Ftahoe-lafs.git webapi/deep-manifest t=JSON: don't return the (large) manifest/SI/verifycap lists unless the operation has completed, to avoid the considerable CPU+memory cost of creating the JSON (for 330k dirnodes, it could take two minutes to generate 275MB of JSON). They must be paid eventually, but not on every poll --- diff --git a/docs/frontends/webapi.txt b/docs/frontends/webapi.txt index 29fae18b..1242c6e8 100644 --- a/docs/frontends/webapi.txt +++ b/docs/frontends/webapi.txt @@ -213,6 +213,11 @@ GET /operations/$HANDLE?output=JSON (same) * whether the operation is complete, or if it is still running * how much of the operation is complete, and how much is left, if possible + Note that the final status output can be quite large: a deep-manifest of a + directory structure with 300k directories and 200k unique files is about + 275MB of JSON, and might take two minutes to generate. For this reason, the + full status is not provided until the operation has completed. + The HTML form will include a meta-refresh tag, which will cause a regular web browser to reload the status page about 60 seconds later. This tag will be removed once the operation has completed. @@ -966,7 +971,10 @@ POST $DIRURL?t=start-manifest (must add &ophandle=XYZ) by a space. If output=JSON is added to the queryargs, then the results will be a - JSON-formatted dictionary with six keys: + JSON-formatted dictionary with six keys. Note that because large directory + structures can result in very large JSON results, the full results will not + be available until the operation is complete (i.e. 
until output["finished"] + is True): finished (bool): if False then you must reload the page until True origin_si (base32 str): the storage index of the starting point diff --git a/src/allmydata/web/directory.py b/src/allmydata/web/directory.py index 0ac24bc2..07771bf8 100644 --- a/src/allmydata/web/directory.py +++ b/src/allmydata/web/directory.py @@ -728,13 +728,27 @@ class ManifestResults(rend.Page, ReloadMixin): inevow.IRequest(ctx).setHeader("content-type", "text/plain") m = self.monitor s = m.get_status() - status = {"manifest": s["manifest"], - "verifycaps": list(s["verifycaps"]), - "storage-index": list(s["storage-index"]), - "stats": s["stats"], - "finished": m.is_finished(), - "origin": base32.b2a(m.origin_si), - } + + status = { "stats": s["stats"], + "finished": m.is_finished(), + "origin": base32.b2a(m.origin_si), + } + if m.is_finished(): + # don't return manifest/verifycaps/SIs unless the operation is + # done, to save on CPU/memory (both here and in the HTTP client + # who has to unpack the JSON). Tests show that the ManifestWalker + # needs about 1092 bytes per item, the JSON we generate here + # requires about 503 bytes per item, and some internal overhead + # (perhaps transport-layer buffers in twisted.web?) requires an + # additional 1047 bytes per item. + status.update({ "manifest": s["manifest"], + "verifycaps": [i for i in s["verifycaps"]], + "storage-index": [i for i in s["storage-index"]], + }) + # simplejson doesn't know how to serialize a set. We use a + # list comprehension that walks the set rather than list(setofthing) to + # save a small amount of memory (4B*len) and a moderate amount of + # CPU. return simplejson.dumps(status, indent=1) def _si_abbrev(self):