Merge pull request #4785 from Tinche/get_url/uri-content-disposition

Modified the get_url module to respect the content-disposition header if...
2013-11-14 10:02:15 -08:00 · 2013-11-14 10:02:15 -08:00 · 0ee5792849
commit 0ee5792849
parent eccbd21a0d b1fa35ac3d
1 changed files with 60 additions and 19 deletions
--- a/network/get_url
+++ b/network/get_url
@ -49,15 +49,20 @@ options:
  dest:
    description:
      - absolute path of where to download the file to.
-      - If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set.
+      - If C(dest) is a directory, either the server-provided filename or, if
+        none is provided, the base name of the URL on the remote server will
+        be used. If C(dest) is a directory, C(force) has no effect.
    required: true
    default: null
  force:
    description:
-      - If C(yes), will download the file every time and replace the
-        file if the contents change. If C(no), the file will only be downloaded if
-        the destination does not exist. Generally should be C(yes) only for small
-        local files. Prior to 0.6, this module behaved as if C(yes) was the default.
+      - If C(yes) and C(dest) is not a directory, will download the file every
+        time and replace the file if the contents change. If C(no), the file
+        will only be downloaded if the destination does not exist. Generally
+        should be C(yes) only for small local files. Prior to 0.6, this module
+        behaved as if C(yes) was the default.
+        Has no effect if C(dest) is a directory - the file will always be
+        downloaded, but replaced only if the contents changed.
    version_added: "0.7"
    required: false
    choices: [ "yes", "no" ]
@ -125,7 +130,7 @@ def url_filename(url):
        return 'index.html'
    return fn

-def url_do_get(module, url, dest, use_proxy):
+def url_do_get(module, url, dest, use_proxy, last_mod_time):
    """
    Get url and return request and info
    Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@ -171,33 +176,32 @@ def url_do_get(module, url, dest, use_proxy):
    request = urllib2.Request(url)
    request.add_header('User-agent', USERAGENT)

-    if os.path.exists(dest) and not module.params['force']:
-        t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest))
-        tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
+    if last_mod_time:
+        tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
        request.add_header('If-Modified-Since', tstamp)

    try:
        r = urllib2.urlopen(request)
        info.update(r.info())
+        info['url'] = r.geturl()  # The URL goes in too, because of redirects.
        info.update(dict(msg="OK (%s bytes)" % r.headers.get('Content-Length', 'unknown'), status=200))
    except urllib2.HTTPError, e:
        # Must not fail_json() here so caller can handle HTTP 304 unmodified
        info.update(dict(msg=str(e), status=e.code))
-        return r, info
    except urllib2.URLError, e:
        code = getattr(e, 'code', -1)
        module.fail_json(msg="Request failed: %s" % str(e), status_code=code)

    return r, info

-def url_get(module, url, dest, use_proxy):
+def url_get(module, url, dest, use_proxy, last_mod_time):
    """
-    Download url and store at dest.
-    If dest is a directory, determine filename from url.
+    Download data from the url and store in a temporary file.
+
    Return (tempfile, info about the request)
    """

-    req, info = url_do_get(module, url, dest, use_proxy)
+    req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)

    # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
    if info['status'] == 304:
@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
    req.close()
    return tempname, info

+def extract_filename_from_headers(headers):
+    """
+    Extract a filename from the given dict of HTTP headers.
+
+    Looks up the 'content-disposition' key and, when its value is an
+    ``attachment`` disposition carrying a ``filename`` parameter (quoted or
+    a bare token), returns the base name of that filename.
+
+    Returns the filename if successful, else None. None is also returned
+    when the header is missing, is not an attachment disposition, or the
+    filename reduces to an empty name, '.' or '..'.
+    """
+    cont_disp = headers.get('content-disposition')
+    if not cont_disp:
+        return None
+
+    # The disposition type and parameter names are case-insensitive.
+    # The quoted alternative stops at the closing quote so that any
+    # parameters following the filename are not swallowed into the name.
+    cont_disp_regex = (r'\s*attachment\s*;\s*filename\s*=\s*'
+                       r'(?:"([^"]*)"|([^\s;"]+))')
+    match = re.match(cont_disp_regex, cont_disp, re.IGNORECASE)
+    if not match:
+        return None
+
+    res = match.group(1) or match.group(2) or ''
+    # Try preventing any funny business: drop any directory components and
+    # refuse names that would resolve to the destination directory itself
+    # or to its parent.
+    res = os.path.basename(res)
+    if res in ('', '.', '..'):
+        return None
+
+    return res
+
 # ==============================================================
 # main

@ -247,15 +270,33 @@ def main():
    sha256sum = module.params['sha256sum']
    use_proxy = module.params['use_proxy']

-    if os.path.isdir(dest):
-        dest = os.path.join(dest, url_filename(url))
+    dest_is_dir = os.path.isdir(dest)
+    last_mod_time = None

-    if not force:
-        if os.path.exists(dest):
+    if not dest_is_dir and os.path.exists(dest):
+        if not force:
            module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)

+        # If the file already exists, prepare the last modified time for the
+        # request.
+        mtime = os.path.getmtime(dest)
+        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
+
    # download to tmpsrc
-    tmpsrc, info = url_get(module, url, dest, use_proxy)
+    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
+
+    # Now the request has completed, we can finally generate the final
+    # destination file name from the info dict.
+
+    if dest_is_dir:
+        filename = extract_filename_from_headers(info)
+        if not filename:
+            # Fall back to extracting the filename from the URL.
+            # Pluck the URL from the info, since a redirect could have changed
+            # it.
+            filename = url_filename(info['url'])
+        dest = os.path.join(dest, filename)
+
    md5sum_src   = None
    md5sum_dest  = None