From 572c848d983db04087b8396a9b26273ae566515b Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@allmydata.com>
Date: Fri, 18 Jul 2008 18:06:50 -0700
Subject: [PATCH] web: for GET save=true, don't interpret the filename= arg
 with any character set, just copy the bytes back into the Content-Disposition
 header. This seems to make it maximally compatible with Firefox and IE7

---
 docs/webapi.txt               | 60 +++++++++++++++++++++++++++++++++++
 src/allmydata/web/filenode.py | 24 ++++++++++----
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/docs/webapi.txt b/docs/webapi.txt
index 05621292..ea5a4eca 100644
--- a/docs/webapi.txt
+++ b/docs/webapi.txt
@@ -443,6 +443,12 @@ GET /uri/$DIRCAP/[SUBDIRS../]FILENAME
  most browsers will refuse to display it inline). "true", "t", "1", and other
  case-insensitive equivalents are all treated the same.
 
+ Character-set handling in URLs and HTTP headers is a dubious art[1]. For
+ maximum compatibility, Tahoe simply copies the bytes from the filename=
+ argument into the Content-Disposition header's filename= parameter, without
+ trying to interpret them in any particular way.
+
+
 GET /named/$FILECAP/FILENAME
 
  This is an alternate download form which makes it easier to get the correct
@@ -910,3 +916,57 @@ For more details, please see the "Consistency vs Availability" and "The Prime
 Coordination Directive" sections of mutable.txt, in the same directory as
 this file.
 
+
+[1]: URLs and HTTP and UTF-8, Oh My
+
+ HTTP does not provide a mechanism to specify the character set used to
+ encode non-ascii names in URLs (rfc2396#2.1). We prefer the convention that
+ the filename= argument shall be a URL-encoded UTF-8 encoded unicode object.
+ For example, suppose we want to provoke the server into using a filename of
+ "f i a n c e-acute e" (i.e. F I A N C U+00E9 E). The UTF-8 encoding of this
+ is 0x66 0x69 0x61 0x6e 0x63 0xc3 0xa9 0x65 (or "fianc\xC3\xA9e", as python's
+ repr() function would show). To encode this into a URL, the non-printable
+ characters must be escaped with the urlencode '%XX' mechanism, giving us
+ "fianc%C3%A9e". Thus, the first line of the HTTP request will be "GET
+ /uri/CAP...?save=true&filename=fianc%C3%A9e HTTP/1.1". Not all browsers
+ provide this: IE7 uses the Latin-1 encoding, which is fianc%E9e.
+
+ The response header will need to indicate a non-ASCII filename. The actual
+ mechanism to do this is not clear. For ASCII filenames, the response header
+ would look like:
+
+  Content-Disposition: attachment; filename="english.txt"
+
+ If Tahoe were to enforce the utf-8 convention, it would need to decode the
+ URL argument into a unicode string, and then encode it back into a sequence
+ of bytes when creating the response header. One possibility would be to use
+ unencoded utf-8. Developers suggest that IE7 might accept this:
+
+  #1: Content-Disposition: attachment; filename="fianc\xC3\xA9e"
+    (note, the last four bytes of that line, not including the newline, are
+    0xC3 0xA9 0x65 0x22)
+
+ RFC2231#4 (dated 1997) suggests that the following might work, and some
+ developers (http://markmail.org/message/dsjyokgl7hv64ig3) have reported that
+ it is supported by firefox (but not IE7):
+
+  #2: Content-Disposition: attachment; filename*=utf-8''fianc%C3%A9e
+
+ My reading of RFC2616#19.5.1 (which defines Content-Disposition) says that
+ the filename= parameter is defined to be wrapped in quotes (presumably to
+ allow spaces without breaking the parsing of subsequent parameters), which
+ would give us:
+
+  #3: Content-Disposition: attachment; filename*=utf-8''"fianc%C3%A9e"
+
+ However this is contrary to the examples in the email thread listed above.
+
+ Developers report that IE7 (when it is configured for UTF-8 URL encoding,
+ which is not the default in Asian countries) will accept:
+
+  #4: Content-Disposition: attachment; filename=fianc%C3%A9e
+
+ However, for maximum compatibility, Tahoe simply copies bytes from the URL
+ into the response header, rather than enforcing the utf-8 convention. This
+ means it does not try to decode the filename from the URL argument, nor does
+ it encode the filename into the response header.
diff --git a/src/allmydata/web/filenode.py b/src/allmydata/web/filenode.py
index 21ede7e2..7996e757 100644
--- a/src/allmydata/web/filenode.py
+++ b/src/allmydata/web/filenode.py
@@ -157,8 +157,19 @@ class FileNodeHandler(RenderMixin, rend.Page, ReplaceMeMixin):
         t = get_arg(req, "t", "").strip()
         if not t:
             # just get the contents
-            filename = get_arg(req, "filename", self.name) or "unknown"
             save_to_file = boolean_of_arg(get_arg(req, "save", "False"))
+            # the filename arrives as part of the URL or in a form input
+            # element, and will be sent back in a Content-Disposition header.
+            # Different browsers use various character sets for this name,
+            # sometimes depending upon how the language environment is
+            # configured. Firefox sends the equivalent of
+            # urllib.quote(name.encode("utf-8")), while IE7 sometimes does
+            # latin-1. Browsers cannot agree on how to interpret the name
+            # they see in the Content-Disposition header either, despite some
+            # 11-year-old standards (RFC2231) that explain how to do it
+            # properly. So we assume that at least the browser will agree
+            # with itself, and echo back the same bytes that we were given.
+            filename = get_arg(req, "filename", self.name) or "unknown"
             return FileDownloader(self.node, filename, save_to_file)
         if t == "json":
             return FileJSONMetadata(ctx, self.node)
@@ -294,12 +305,13 @@ class WebDownloadTarget:
             self._req.setHeader("content-encoding", self._content_encoding)
         self._req.setHeader("content-length", str(size))
         if self._save_to_filename is not None:
-            # tell the browser to save the file rather display it
-            # TODO: indicate charset of filename= properly
-            filename = self._save_to_filename.encode("utf-8")
+            # tell the browser to save the file rather than display it. We
+            # don't try to encode the filename; instead we echo back the exact
+            # same bytes we were given in the URL. See the comment in
+            # FileNodeHandler.render_GET for the sad details.
+            filename = self._save_to_filename
             self._req.setHeader("content-disposition",
-                                'attachment; filename="%s"'
-                                % filename)
+                                'attachment; filename="%s"' % filename)
 
     def write(self, data):
         self._req.write(data)
-- 
2.45.2