zfec/setuptools-0.6c16dev3.egg/setuptools/package_index.py

   1 """PyPI and direct package downloading"""
   2 import sys, os.path, re, urlparse, urllib2, shutil, random, socket, cStringIO
   3 import httplib, urllib
   4 from pkg_resources import *
   5 from distutils import log
   6 from distutils.errors import DistutilsError
   7 try:
   8     from hashlib import md5
   9 except ImportError:
  10     from md5 import md5
  11 from fnmatch import translate
# Matches an "egg=<name>" URL fragment and captures the name/version part.
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
# Captures the target of an ``href`` attribute (quoted or unquoted),
# case-insensitively.
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
# this is here to fix emacs' cruddy broken syntax highlighting
# Matches a download link followed by its "md5" link; the groups are
# (download URL, link text, 32-digit hex digest).
PYPI_MD5 = re.compile(
    '<a href="([^"#]+)">([^<]+)</a>\n\s+\\(<a (?:title="MD5 hash"\n\s+)'
    'href="[^?]+\?:action=show_md5&amp;digest=([0-9a-f]{32})">md5</a>\\)'
)
# Bound ``match`` method that recognizes a leading "<scheme>:" prefix whose
# scheme is at least two characters long.
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):',re.I).match
# Filename suffixes tried, in this order, when looking for source archives.
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()
  21
  22 def is_local(url_or_fname):
  23     """ Return True if url_or_fname is a "file:" url or if it is a schemaless thing (which is presumably a filename). """
  24     mo = URL_SCHEME(url_or_fname)
  25     return not (mo and mo.group(1).lower()!='file')
  26
  27 def url_or_fname_to_fname(url_or_fname):
  28     """ Assert that is_local(url_or_fname) then if it is a "file:" url, parse it and run url2pathname on it, else just return it. """
  29     assert is_local(url_or_fname)
  30
  31     mo = URL_SCHEME(url_or_fname)
  32     if mo:
  33         return urllib2.url2pathname(urlparse.urlparse(url)[2])
  34     else:
  35         return url_or_fname
  36
# Public names exported by a star-import of this module.
__all__ = [
    'PackageIndex', 'distros_for_url', 'parse_bdist_wininst',
    'interpret_distro_name',
]
  41
  42 def parse_bdist_wininst(name):
  43     """Return (base,pyversion) or (None,None) for possible .exe name"""
  44
  45     lower = name.lower()
  46     base, py_ver = None, None
  47
  48     if lower.endswith('.exe'):
  49         if lower.endswith('.win32.exe'):
  50             base = name[:-10]
  51         elif lower.startswith('.win32-py',-16):
  52             py_ver = name[-7:-4]
  53             base = name[:-16]
  54
  55     return base,py_ver
  56
  57 def egg_info_for_url(url):
  58     scheme, server, path, parameters, query, fragment = urlparse.urlparse(url)
  59     base = urllib2.unquote(path.split('/')[-1])
  60     if server=='sourceforge.net' and base=='download':    # XXX Yuck
  61         base = urllib2.unquote(path.split('/')[-2])
  62     if '#' in base: base, fragment = base.split('#',1)
  63     return base,fragment
  64
  65 def distros_for_url(url, metadata=None):
  66     """Yield egg or source distribution objects that might be found at a URL"""
  67     base, fragment = egg_info_for_url(url)
  68     for dist in distros_for_location(url, base, metadata): yield dist
  69     if fragment:
  70         match = EGG_FRAGMENT.match(fragment)
  71         if match:
  72             for dist in interpret_distro_name(
  73                 url, match.group(1), metadata, precedence = CHECKOUT_DIST
  74             ):
  75                 yield dist
  76
  77 def distros_for_location(location, basename, metadata=None):
  78     """Yield egg or source distribution objects based on basename"""
  79     if basename.endswith('.egg.zip'):
  80         basename = basename[:-4]    # strip the .zip
  81     if basename.endswith('.egg') and '-' in basename:
  82         # only one, unambiguous interpretation
  83         return [Distribution.from_location(location, basename, metadata)]
  84     if basename.endswith('.exe'):
  85         win_base, py_ver = parse_bdist_wininst(basename)
  86         if win_base is not None:
  87             return interpret_distro_name(
  88                 location, win_base, metadata, py_ver, BINARY_DIST, "win32"
  89             )
  90     # Try source distro extensions (.zip, .tgz, etc.)
  91     #
  92     for ext in EXTENSIONS:
  93         if basename.endswith(ext):
  94             basename = basename[:-len(ext)]
  95             return interpret_distro_name(location, basename, metadata)
  96     return []  # no extension matched
  97
  98 def distros_for_filename(filename, metadata=None):
  99     """Yield possible egg or source distribution objects based on a filename"""
 100     return distros_for_location(
 101         normalize_path(filename), os.path.basename(filename), metadata
 102     )
 103
 104
 105 def interpret_distro_name(location, basename, metadata,
 106     py_version=None, precedence=SOURCE_DIST, platform=None
 107 ):
 108     """Generate alternative interpretations of a source distro name
 109
 110     Note: if `location` is a filesystem filename, you should call
 111     ``pkg_resources.normalize_path()`` on it before passing it to this
 112     routine!
 113     """
 114     # Generate alternative interpretations of a source distro name
 115     # Because some packages are ambiguous as to name/versions split
 116     # e.g. "adns-python-1.1.0", "egenix-mx-commercial", etc.
 117     # So, we generate each possible interepretation (e.g. "adns, python-1.1.0"
 118     # "adns-python, 1.1.0", and "adns-python-1.1.0, no version").  In practice,
 119     # the spurious interpretations should be ignored, because in the event
 120     # there's also an "adns" package, the spurious "python-1.1.0" version will
 121     # compare lower than any numeric version number, and is therefore unlikely
 122     # to match a request for it.  It's still a potential problem, though, and
 123     # in the long run PyPI and the distutils should go for "safe" names and
 124     # versions in distribution archive names (sdist and bdist).
 125
 126     parts = basename.split('-')
 127     if not py_version:
 128         for i,p in enumerate(parts[2:]):
 129             if len(p)==5 and p.startswith('py2.'):
 130                 return # It's a bdist_dumb, not an sdist -- bail out
 131
 132     for p in range(1,len(parts)+1):
 133         yield Distribution(
 134             location, metadata, '-'.join(parts[:p]), '-'.join(parts[p:]),
 135             py_version=py_version, precedence = precedence,
 136             platform = platform
 137         )
 138
# Matches a tag that carries a ``rel`` attribute; the groups are the tag's
# full contents and the (possibly comma-separated) rel value.
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting
 141
 142 def find_external_links(url, page):
 143     """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""
 144
 145     for match in REL.finditer(page):
 146         tag, rel = match.groups()
 147         rels = map(str.strip, rel.lower().split(','))
 148         if 'homepage' in rels or 'download' in rels:
 149             for match in HREF.finditer(tag):
 150                 yield urlparse.urljoin(url, htmldecode(match.group(1)))
 151
 152     for tag in ("<th>Home Page", "<th>Download URL"):
 153         pos = page.find(tag)
 154         if pos!=-1:
 155             match = HREF.search(page,pos)
 156             if match:
 157                 yield urlparse.urljoin(url, htmldecode(match.group(1)))
 158
# User-Agent header value added to requests by open_with_auth(); built from
# the urllib2 version and the version of the 'setuptools' distribution
# that ``require()`` resolves at import time.
user_agent = "Python-urllib/%s setuptools/%s" % (
    urllib2.__version__, require('setuptools')[0].version
)
 162
 163
 164 class PackageIndex(Environment):
 165     """A distribution index that scans web pages for download URLs"""
 166
 167     def __init__(self, index_url="http://pypi.python.org/simple", hosts=('*',),
 168         *args, **kw
 169     ):
 170         Environment.__init__(self,*args,**kw)
 171         self.index_url = index_url + "/"[:not index_url.endswith('/')]
 172         self.scanned_urls = {}
 173         self.fetched_urls = {}
 174         self.package_pages = {}
 175         self.allows = re.compile('|'.join(map(translate,hosts))).match
 176         self.to_scan = []
 177
 178
 179
    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it

        Distributions that can be deduced from `url` itself are added to the
        index.  If none can be deduced and `retrieve` is true, the page is
        fetched (at most once per URL) and every link on an HTML page is
        processed in turn, without retrieval.
        """
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            # no scheme: handle it as a local file or directory name
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            map(self.add, dists)
            return  # don't need the actual page

        if not self.url_ok(url):
            # record the blocked URL as fetched so it is not tried again
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        self.fetched_urls[url] = True   # prevent multiple fetch attempts
        f = self.open_url(url, "Download error: %s -- Some packages may not be found!")
        if f is None: return
        # f.url may differ from url, so record it as fetched too
        self.fetched_urls[f.url] = True
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()   # not html, we can't process it
            return

        base = f.url     # handle redirects
        page = f.read()
        f.close()
        if url.startswith(self.index_url) and getattr(f,'code',None)!=404:
            # index pages are processed first; the returned text replaces
            # the page that is scanned for links below
            page = self.process_index(url, page)
        for match in HREF.finditer(page):
            link = urlparse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)
 220
 221     def process_filename(self, fn, nested=False):
 222         # process filenames or directories
 223         if not os.path.exists(fn):
 224             self.warn("Not found: %s", fn)
 225             return
 226
 227         if os.path.isdir(fn) and not nested:
 228             path = os.path.realpath(fn)
 229             for item in os.listdir(path):
 230                 self.process_filename(os.path.join(path,item), True)
 231
 232         dists = distros_for_filename(fn)
 233         if dists:
 234             self.debug("Found: %s", fn)
 235             map(self.add, dists)
 236
 237     def url_ok(self, url, fatal=False):
 238         s = URL_SCHEME(url)
 239         if (s and s.group(1).lower()=='file') or self.allows(urlparse.urlparse(url)[1]):
 240             return True
 241         msg = "\nLink to % s ***BLOCKED*** by --allow-hosts\n"
 242         if fatal:
 243             raise DistutilsError(msg % url)
 244         else:
 245             self.warn(msg, url)
 246
 247     def scan_egg_links(self, search_path):
 248         for item in search_path:
 249             if os.path.isdir(item):
 250                 for entry in os.listdir(item):
 251                     if entry.endswith('.egg-link'):
 252                         self.scan_egg_link(item, entry)
 253
 254     def scan_egg_link(self, path, entry):
 255         lines = filter(None, map(str.strip, file(os.path.join(path, entry))))
 256         if len(lines)==2:
 257             for dist in find_distributions(os.path.join(path, lines[0])):
 258                 dist.location = os.path.join(path, *lines)
 259                 dist.precedence = SOURCE_DIST
 260                 self.add(dist)
 261
    def process_index(self,url,page):
        """Process the contents of a PyPI page

        Every link on `page` that points at a package page under the index
        URL is recorded in ``self.package_pages``.  If `url` itself is a
        package page, its external homepage/download links are scanned and
        the page text is returned with md5 links rewritten into "#md5="
        fragments; otherwise an empty string is returned.
        """
        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = map(
                    urllib2.unquote, link[len(self.index_url):].split('/')
                )
                if len(parts)==2 and '#' not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    self.package_pages.setdefault(pkg.lower(),{})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            scan( urlparse.urljoin(url, htmldecode(match.group(1))) )

        pkg, ver = scan(url)   # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith('.py') and not frag:
                    # a bare .py link needs an #egg fragment to be usable
                    if ver:
                        new_url+='#egg=%s-%s' % (pkg,ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            # groups are (url, text, digest); emit them as url#md5=digest, text
            return PYPI_MD5.sub(
                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
            )
        else:
            return ""   # no sense double-scanning non-package pages
 300
 301
 302
 303     def need_version_info(self, url):
 304         self.scan_all(
 305             "Page at %s links to .py file(s) without version info; an index "
 306             "scan is required.", url
 307         )
 308
 309     def scan_all(self, msg=None, *args):
 310         if self.index_url not in self.fetched_urls:
 311             if msg: self.warn(msg,*args)
 312             self.info(
 313                 "Scanning index of all packages (this may take a while)"
 314             )
 315         self.scan_url(self.index_url)
 316
 317     def find_packages(self, requirement):
 318         self.scan_url(self.index_url + requirement.unsafe_name+'/')
 319
 320         if not self.package_pages.get(requirement.key):
 321             # Fall back to safe version of the name
 322             self.scan_url(self.index_url + requirement.project_name+'/')
 323
 324         if not self.package_pages.get(requirement.key):
 325             # We couldn't find the target package, so search the index page too
 326             self.not_found_in_index(requirement)
 327
 328         for url in list(self.package_pages.get(requirement.key,())):
 329             # scan each page that might be related to the desired package
 330             self.scan_url(url)
 331
 332     def obtain(self, requirement, installer=None):
 333         self.prescan(); self.find_packages(requirement)
 334         for dist in self[requirement.key]:
 335             if dist in requirement:
 336                 return dist
 337             self.debug("%s does not match %s", requirement, dist)
 338         return super(PackageIndex, self).obtain(requirement,installer)
 339
 340
 341
 342
 343
 344     def check_md5(self, cs, info, filename, tfp):
 345         if re.match('md5=[0-9a-f]{32}$', info):
 346             self.debug("Validating md5 checksum for %s", filename)
 347             if cs.hexdigest()!=info[4:]:
 348                 tfp.close()
 349                 os.unlink(filename)
 350                 raise DistutilsError(
 351                     "MD5 validation failed for "+os.path.basename(filename)+
 352                     "; possible download problem?"
 353                 )
 354
 355     def add_find_links(self, urls):
 356         """Add `urls` to the list that will be prescanned for searches"""
 357         for url in urls:
 358             if (
 359                 self.to_scan is None        # if we have already "gone online"
 360                 or not URL_SCHEME(url)      # or it's a local file/directory
 361                 or url.startswith('file:')
 362                 or list(distros_for_url(url))   # or a direct package link
 363             ):
 364                 # then go ahead and process it now
 365                 self.scan_url(url)
 366             else:
 367                 # otherwise, defer retrieval till later
 368                 self.to_scan.append(url)
 369
 370     def prescan(self):
 371         """Scan urls scheduled for prescanning (e.g. --find-links)"""
 372         if self.to_scan:
 373             map(self.scan_url, self.to_scan)
 374         self.to_scan = None     # from now on, go ahead and process immediately
 375
 376     def not_found_in_index(self, requirement):
 377         if self[requirement.key]:   # we've seen at least one distro
 378             meth, msg = self.info, "Couldn't retrieve index page for %r"
 379         else:   # no distros seen for this name, might be misspelled
 380             meth, msg = (self.warn,
 381                 "Couldn't find index page for %r (maybe misspelled?)")
 382         meth(msg, requirement.unsafe_name)
 383         self.scan_all()
 384
 385     def download(self, spec, tmpdir):
 386         """Locate and/or download `spec` to `tmpdir`, returning a local path
 387
 388         `spec` may be a ``Requirement`` object, or a string containing a URL,
 389         an existing local filename, or a project/version requirement spec
 390         (i.e. the string form of a ``Requirement`` object).  If it is the URL
 391         of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
 392         that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
 393         automatically created alongside the downloaded file.
 394
 395         If `spec` is a ``Requirement`` object or a string containing a
 396         project/version requirement spec, this method returns the location of
 397         a matching distribution (possibly after downloading it to `tmpdir`).
 398         If `spec` is a locally existing file or directory name, it is simply
 399         returned unchanged.  If `spec` is a URL, it is downloaded to a subpath
 400         of `tmpdir`, and the local filename is returned.  Various errors may be
 401         raised if a problem occurs during downloading.
 402         """
 403         if not isinstance(spec,Requirement):
 404             scheme = URL_SCHEME(spec)
 405             if scheme:
 406                 # It's a url, download it to tmpdir
 407                 found = self._download_url(scheme.group(1), spec, tmpdir)
 408                 base, fragment = egg_info_for_url(spec)
 409                 if base.endswith('.py'):
 410                     found = self.gen_setup(found,fragment,tmpdir)
 411                 return found
 412             elif os.path.exists(spec):
 413                 # Existing file or directory, just return it
 414                 return spec
 415             else:
 416                 try:
 417                     spec = Requirement.parse(spec)
 418                 except ValueError:
 419                     raise DistutilsError(
 420                         "Not a URL, existing file, or requirement spec: %r" %
 421                         (spec,)
 422                     )
 423         return getattr(self.fetch_distribution(spec, tmpdir),'location',None)
 424
 425
    def fetch_distribution(self,
        requirement, tmpdir, force_scan=False, source=False, develop_ok=False,
        local_index=None,
    ):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages.  If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename.  If no matching distribution is found,
        ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered.  Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.

        If `local_index` is given, it is searched with the same matching
        rules whenever no distribution has been found yet.
        """
        # process a Requirement
        self.info("Searching for %s", requirement)
        skipped = {}    # development/system eggs already warned about
        dist = None

        def find(env, req):
            # Find a matching distribution; may be called more than once

            # first try to find a local dist
            for allow_remote in (False, True):
                # then try to find a platform-dependent dist
                for allow_platform_independent in (False, True):
                    for dist in env[req.key]:
                        if dist.precedence==DEVELOP_DIST and not develop_ok:
                            if dist not in skipped:
                                self.warn("Skipping development or system egg: %s",dist)
                                skipped[dist] = 1
                            continue

                        if ((is_local(dist.location) or allow_remote) and
                            (dist in req) and
                            ((allow_platform_independent or dist.platform is not None) and
                             (dist.precedence<=SOURCE_DIST or not source))):
                            return dist
            # falls through (returning None) when nothing matches

        if force_scan:
            self.prescan()
            self.find_packages(requirement)
            dist = find(self, requirement)

        if local_index is not None:
            dist = dist or find(local_index, requirement)

        if dist is None and self.to_scan is not None:
            # prescan() has not run yet (it sets to_scan to None)
            self.prescan()
            dist = find(self, requirement)

        if dist is None and not force_scan:
            self.find_packages(requirement)
            dist = find(self, requirement)

        if dist is None:
            self.warn(
                "No local packages or download links found for %s%s",
                (source and "a source distribution of " or ""),
                requirement,
            )
        else:
            self.info("Best match: %s", dist)
            # download() returns local paths unchanged and fetches URLs
            return dist.clone(location=self.download(dist.location, tmpdir))
 496
 497
 498     def fetch(self, requirement, tmpdir, force_scan=False, source=False):
 499         """Obtain a file suitable for fulfilling `requirement`
 500
 501         DEPRECATED; use the ``fetch_distribution()`` method now instead.  For
 502         backward compatibility, this routine is identical but returns the
 503         ``location`` of the downloaded distribution instead of a distribution
 504         object.
 505         """
 506         dist = self.fetch_distribution(requirement,tmpdir,force_scan,source)
 507         if dist is not None:
 508             return dist.location
 509         return None
 510
 511
 512     def gen_setup(self, filename, fragment, tmpdir):
 513         match = EGG_FRAGMENT.match(fragment)
 514         dists = match and [d for d in
 515             interpret_distro_name(filename, match.group(1), None) if d.version
 516         ] or []
 517
 518         if len(dists)==1:   # unambiguous ``#egg`` fragment
 519             basename = os.path.basename(filename)
 520
 521             # Make sure the file has been downloaded to the temp dir.
 522             if os.path.dirname(filename) != tmpdir:
 523                 dst = os.path.join(tmpdir, basename)
 524                 from setuptools.command.easy_install import samefile
 525                 if not samefile(filename, dst):
 526                     shutil.copy2(filename, dst)
 527                     filename=dst
 528
 529             file = open(os.path.join(tmpdir, 'setup.py'), 'w')
 530             file.write(
 531                 "from setuptools import setup\n"
 532                 "setup(name=%r, version=%r, py_modules=[%r])\n"
 533                 % (
 534                     dists[0].project_name, dists[0].version,
 535                     os.path.splitext(basename)[0]
 536                 )
 537             )
 538             file.close()
 539             return filename
 540
 541         elif match:
 542             raise DistutilsError(
 543                 "Can't unambiguously interpret project/version identifier %r; "
 544                 "any dashes in the name or version should be escaped using "
 545                 "underscores. %r" % (fragment,dists)
 546             )
 547         else:
 548             raise DistutilsError(
 549                 "Can't process plain .py files without an '#egg=name-version'"
 550                 " suffix to enable automatic setup script generation."
 551             )
 552
    # Number of bytes _download_to() reads per block.
    dl_blocksize = 8192
    def _download_to(self, url, filename):
        """Download `url` into `filename` and return the response headers.

        A "#..." suffix on `url` is split off before opening and, once the
        download finishes, passed to ``check_md5()``.  ``reporthook()`` is
        called after every block.  ``DistutilsError`` is raised when
        ``open_url()`` hands back an ``HTTPError``.
        """
        self.info("Downloading %s", url)
        # Download the file
        fp, tfp, info = None, None, None
        try:
            if '#' in url:
                url, info = url.split('#', 1)
            fp = self.open_url(url)
            if isinstance(fp, urllib2.HTTPError):
                raise DistutilsError(
                    "Can't download %s: %s %s" % (url, fp.code,fp.msg)
                )
            cs = md5()
            headers = fp.info()
            blocknum = 0
            bs = self.dl_blocksize
            size = -1   # stays -1 when there is no Content-Length header
            if "content-length" in headers:
                size = int(headers["Content-Length"])
                self.reporthook(url, filename, blocknum, bs, size)
            tfp = open(filename,'wb')
            while True:
                block = fp.read(bs)
                if block:
                    # feed the digest and the output file together
                    cs.update(block)
                    tfp.write(block)
                    blocknum += 1
                    self.reporthook(url, filename, blocknum, bs, size)
                else:
                    break
            if info: self.check_md5(cs, info, filename, tfp)
            return headers
        finally:
            # close whatever was opened, on success and on error alike
            if fp: fp.close()
            if tfp: tfp.close()
 589
 590     def reporthook(self, url, filename, blocknum, blksize, size):
 591         pass    # no-op
 592
 593
    def open_url(self, url, warning=None):
        """Open `url` and return a file-like response object.

        "file:" URLs are handled by ``local_open()``; everything else goes
        through ``open_with_auth()``.  An ``HTTPError`` is returned rather
        than raised.  For other URL or HTTP failures, `warning` (a format
        string taking the failure reason) is logged and None is returned;
        when no `warning` is given, ``DistutilsError`` is raised instead.
        """
        if url.startswith('file:'): return local_open(url)
        try:
            return open_with_auth(url)
        except urllib2.HTTPError, v:
            # hand the error object back so the caller can inspect it
            return v
        except urllib2.URLError, v:
            reason = v.reason
        except httplib.HTTPException, v:
            reason = "%s: %s" % (v.__doc__ or v.__class__.__name__, v)
        if warning:
            self.warn(warning, reason)
        else:
            raise DistutilsError("Download error for %s: %s" % (url, reason))
 608
 609     def _download_url(self, scheme, url, tmpdir):
 610         # Determine download filename
 611         #
 612         name, fragment = egg_info_for_url(url)
 613         if name:
 614             while '..' in name:
 615                 name = name.replace('..','.').replace('\\','_')
 616         else:
 617             name = "__downloaded__"    # default if URL has no path contents
 618
 619         if name.endswith('.egg.zip'):
 620             name = name[:-4]    # strip the extra .zip before download
 621
 622         filename = os.path.join(tmpdir,name)
 623
 624         # Download the file
 625         #
 626         if scheme=='svn' or scheme.startswith('svn+'):
 627             return self._download_svn(url, filename)
 628         elif scheme=='file':
 629             return urllib2.url2pathname(urlparse.urlparse(url)[2])
 630         else:
 631             self.url_ok(url, True)   # raises error if not allowed
 632             return self._attempt_download(url, filename)
 633
 634
 635     def scan_url(self, url):
 636         self.process_url(url, True)
 637
 638
 639     def _attempt_download(self, url, filename):
 640         headers = self._download_to(url, filename)
 641         if 'html' in headers.get('content-type','').lower():
 642             return self._download_html(url, headers, filename)
 643         else:
 644             return filename
 645
 646     def _download_html(self, url, headers, filename):
 647         file = open(filename)
 648         for line in file:
 649             if line.strip():
 650                 # Check for a subversion index page
 651                 if re.search(r'<title>([^- ]+ - )?Revision \d+:', line):
 652                     # it's a subversion index page:
 653                     file.close()
 654                     os.unlink(filename)
 655                     return self._download_svn(url, filename)
 656                 break   # not an index page
 657         file.close()
 658         os.unlink(filename)
 659         raise DistutilsError("Unexpected HTML page found at "+url)
 660
 661     def _download_svn(self, url, filename):
 662         url = url.split('#',1)[0]   # remove any fragment for svn's sake
 663         self.info("Doing subversion checkout from %s to %s", url, filename)
 664         os.system("svn checkout -q %s %s" % (url, filename))
 665         return filename
 666
    def debug(self, msg, *args):
        """Log `msg` (formatted with `args`) via ``distutils.log.debug``."""
        log.debug(msg, *args)
 669
    def info(self, msg, *args):
        """Log `msg` (formatted with `args`) via ``distutils.log.info``."""
        log.info(msg, *args)
 672
    def warn(self, msg, *args):
        """Log `msg` (formatted with `args`) via ``distutils.log.warn``."""
        log.warn(msg, *args)
 675
# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference); the
# trailing semicolon is optional.  ``entity_sub`` is the compiled pattern's
# bound ``sub`` method.
entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
 679
 680 def uchr(c):
 681     if not isinstance(c, int):
 682         return c
 683     if c>255: return unichr(c)
 684     return chr(c)
 685
 686 def decode_entity(match):
 687     what = match.group(1)
 688     if what.startswith('#x'):
 689         what = int(what[2:], 16)
 690     elif what.startswith('#'):
 691         what = int(what[1:])
 692     else:
 693         from htmlentitydefs import name2codepoint
 694         what = name2codepoint.get(what, match.group(0))
 695     return uchr(what)
 696
 697 def htmldecode(text):
 698     """Decode HTML entities in the given text."""
 699     return entity_sub(decode_entity, text)
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717 def open_with_auth(url):
 718     """Open a urllib2 request, handling HTTP authentication"""
 719
 720     scheme, netloc, path, params, query, frag = urlparse.urlparse(url)
 721
 722     if scheme in ('http', 'https'):
 723         auth, host = urllib.splituser(netloc)
 724     else:
 725         auth = None
 726
 727     if auth:
 728         auth = "Basic " + urllib2.unquote(auth).encode('base64').strip()
 729         new_url = urlparse.urlunparse((scheme,host,path,params,query,frag))
 730         request = urllib2.Request(new_url)
 731         request.add_header("Authorization", auth)
 732     else:
 733         request = urllib2.Request(url)
 734
 735     request.add_header('User-Agent', user_agent)
 736     fp = urllib2.urlopen(request)
 737
 738     if auth:
 739         # Put authentication info back into request URL if same host,
 740         # so that links found on the page will work
 741         s2, h2, path2, param2, query2, frag2 = urlparse.urlparse(fp.url)
 742         if s2==scheme and h2==host:
 743             fp.url = urlparse.urlunparse((s2,netloc,path2,param2,query2,frag2))
 744
 745     return fp
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758 def fix_sf_url(url):
 759     return url      # backward compatibility
 760
 761 def local_open(url):
 762     """Read a local path, with special support for directories"""
 763     scheme, server, path, param, query, frag = urlparse.urlparse(url)
 764     filename = urllib2.url2pathname(path)
 765     if os.path.isfile(filename):
 766         return urllib2.urlopen(url)
 767     elif path.endswith('/') and os.path.isdir(filename):
 768         files = []
 769         for f in os.listdir(filename):
 770             if f=='index.html':
 771                 body = open(os.path.join(filename,f),'rb').read()
 772                 break
 773             elif os.path.isdir(os.path.join(filename,f)):
 774                 f+='/'
 775             files.append("<a href=%r>%s</a>" % (f,f))
 776         else:
 777             body = ("<html><head><title>%s</title>" % url) + \
 778                 "</head><body>%s</body></html>" % '\n'.join(files)
 779         status, message = 200, "OK"
 780     else:
 781         status, message, body = 404, "Path not found", "Not found"
 782
 783     return urllib2.HTTPError(url, status, message,
 784             {'content-type':'text/html'}, cStringIO.StringIO(body))
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798 # this line is a kludge to keep the trailing blank lines for pje's editor