misc/operations_helpers/munin/tahoe_server_latency_

   1 #!/usr/bin/env python
   2
   3 # retrieve a latency statistic for a given operation and percentile from a
   4 # set of storage servers.
   5
   6 # the OPERATION value should come from the following list:
   7 #   allocate:   allocate_buckets, first step to upload an immutable file
   8 #    write: write data to an immutable share
   9 #    close: finish writing to an immutable share
  10 #    cancel: abandon a partial immutable share
  11 #   get: get_buckets, first step to download an immutable file
  12 #    read: read data from an immutable share
  13 #   writev: slot_testv_and_readv_and_writev, modify/create a directory
  14 #   readv: read a directory (or mutable file)
  15
  16 # the PERCENTILE value should come from the following list:
  17 #  01_0:   1%
  18 #  10_0:  10%
  19 #  50_0:  50% (median)
  20 #  90_0:  90%
  21 #  99_0:  99%
  22 #  99_9:  99.9%
#  mean:  the mean (average) latency
  24
  25 # To use this, create a symlink from
  26 # /etc/munin/plugins/tahoe_server_latency_OPERATION_PERCENTILE to this
  27 # script. For example:
  28
  29 # ln -s /usr/share/doc/allmydata-tahoe/munin/tahoe_server_latency_ \
  30 #  /etc/munin/plugins/tahoe_server_latency_allocate_99_9
  31
  32 # Also, you will need to put a list of node statistics URLs in the plugin's
  33 # environment, by adding a stanza like the following to a file in
  34 # /etc/munin/plugin-conf.d/, such as /etc/munin/plugin-conf.d/tahoe_latencies:
  35 #
  36 # [tahoe_server_latency*]
  37 # env.url_storage1 http://localhost:9011/statistics?t=json
  38 # env.url_storage2 http://localhost:9012/statistics?t=json
  39 # env.url_storage3 http://localhost:9013/statistics?t=json
  40 # env.url_storage4 http://localhost:9014/statistics?t=json
  41
  42 # of course, these URLs must match the webports you have configured into the
  43 # storage nodes.
  44
  45 import os, sys
  46 import urllib
  47 import simplejson
  48
  49 node_urls = []
  50 for k,v in os.environ.items():
  51     if k.startswith("url_"):
  52         nodename = k[len("url_"):]
  53         node_urls.append( (nodename, v) )
  54 node_urls.sort()
  55
  56 my_name = os.path.basename(sys.argv[0])
  57 PREFIX = "tahoe_server_latency_"
  58 assert my_name.startswith(PREFIX)
  59 my_name = my_name[len(PREFIX):]
  60 (operation, percentile) = my_name.split("_", 1)
  61 if percentile == "mean":
  62     what = "mean"
  63 else:
  64     what = percentile.replace("_", ".") + "th percentile"
  65
  66 configinfo = \
  67 """graph_title Tahoe Server '%(operation)s' Latency (%(what)s)
  68 graph_vlabel seconds
  69 graph_category tahoe
  70 graph_info This graph shows how long '%(operation)s' operations took on the storage server, the %(what)s delay between message receipt and response generation, calculated over the last thousand operations.
  71 """ % {'operation': operation,
  72        'what': what}
  73
  74 for nodename, url in node_urls:
  75     configinfo += "%s.label %s\n" % (nodename, nodename)
  76     configinfo += "%s.draw LINE2\n" % (nodename,)
  77
  78
  79 if len(sys.argv) > 1:
  80     if sys.argv[1] == "config":
  81         print configinfo.rstrip()
  82         sys.exit(0)
  83
  84 for nodename, url in node_urls:
  85     data = simplejson.loads(urllib.urlopen(url).read())
  86     if percentile == "mean":
  87         p_key = "mean"
  88     else:
  89         p_key = percentile + "_percentile"
  90     key = "storage_server.latencies.%s.%s" % (operation, p_key)
  91     value = data["stats"][key]
  92     print "%s.value %s" % (nodename, value)
  93