contrib/fuse/impl_b/pyfuse/httpfs.py

   1 import os, re, urlparse
   2 from handler import Handler
   3 from objectfs import ObjectFs
   4
   5
   6 class Root:
   7     def __init__(self):
   8         self.entries = {'gg': GoogleRoot()}
   9     def listdir(self):
  10         return self.entries.keys()
  11     def join(self, hostname):
  12         if hostname in self.entries:
  13             return self.entries[hostname]
  14         if '.' not in hostname:
  15             raise KeyError
  16         result = HtmlNode('http://%s/' % (hostname,))
  17         self.entries[hostname] = result
  18         return result
  19
  20
  21 class UrlNode:
  22     data = None
  23
  24     def __init__(self, url):
  25         self.url = url
  26
  27     def getdata(self):
  28         if self.data is None:
  29             print self.url
  30             g = os.popen("lynx -source %r" % (self.url,), 'r')
  31             self.data = g.read()
  32             g.close()
  33         return self.data
  34
  35
  36 class HtmlNode(UrlNode):
  37     r_links  = re.compile(r'<a\s[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
  38                           re.IGNORECASE | re.DOTALL)
  39     r_images = re.compile(r'<img\s[^>]*src="([^"]+[.]jpg)"', re.IGNORECASE)
  40
  41     def format(self, text, index,
  42                TRANSTBL = ''.join([(32<=c<127 and c!=ord('/'))
  43                                    and chr(c) or '_'
  44                                    for c in range(256)])):
  45         return text.translate(TRANSTBL)
  46
  47     def listdir(self):
  48         data = self.getdata()
  49
  50         seen = {}
  51         def uniquename(name):
  52             name = self.format(name, len(seen))
  53             if name == '' or name.startswith('.'):
  54                 name = '_' + name
  55             basename = name
  56             i = 1
  57             while name in seen:
  58                 i += 1
  59                 name = '%s_%d' % (basename, i)
  60             seen[name] = True
  61             return name
  62
  63         for link, text in self.r_links.findall(data):
  64             url = urlparse.urljoin(self.url, link)
  65             yield uniquename(text), HtmlNode(url)
  66
  67         for link in self.r_images.findall(data):
  68             text = os.path.basename(link)
  69             url = urlparse.urljoin(self.url, link)
  70             yield uniquename(text), RawNode(url)
  71
  72         yield '.source', RawNode(self.url)
  73
  74
  75 class RawNode(UrlNode):
  76
  77     def read(self):
  78         return self.getdata()
  79
  80     def size(self):
  81         if self.data:
  82             return len(self.data)
  83         else:
  84             return None
  85
  86
  87 class GoogleRoot:
  88     def join(self, query):
  89         return GoogleSearch(query)
  90
  91 class GoogleSearch(HtmlNode):
  92     r_links  = re.compile(r'<a\sclass=l\s[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
  93                           re.IGNORECASE | re.DOTALL)
  94
  95     def __init__(self, query):
  96         self.url = 'http://www.google.com/search?q=' + query
  97
  98     def format(self, text, index):
  99         text = text.replace('<b>', '').replace('</b>', '')
 100         text = HtmlNode.format(self, text, index)
 101         return '%d. %s' % (index, text)
 102
 103
 104 if __name__ == '__main__':
 105     root = Root()
 106     handler = Handler('/home/arigo/mnt', ObjectFs(root))
 107     handler.loop_forever()