Modified the get_url module to respect the content-disposition header if the destination is a directory and the server provides it.

See http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html, section 19.5.1.
This commit is contained in:
Tin Tvrtkovic 2013-11-02 17:23:59 +01:00
parent e939a41bfb
commit 8036cb0cab

View file

@ -49,15 +49,20 @@ options:
dest: dest:
description: description:
- absolute path of where to download the file to. - absolute path of where to download the file to.
- If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set. - If C(dest) is a directory, either the server provided filename or, if
none provided, the base name of the URL on the remote server will be
used. If a directory, C(force) has no effect.
required: true required: true
default: null default: null
force: force:
description: description:
- If C(yes), will download the file every time and replace the - If C(yes) and C(dest) is not a directory, will download the file every
file if the contents change. If C(no), the file will only be downloaded if time and replace the file if the contents change. If C(no), the file
the destination does not exist. Generally should be C(yes) only for small will only be downloaded if the destination does not exist. Generally
local files. Prior to 0.6, this module behaved as if C(yes) was the default. should be C(yes) only for small local files. Prior to 0.6, this module
behaved as if C(yes) was the default.
Has no effect if C(dest) is a directory - the file will always be
downloaded, but replaced only if the contents changed.
version_added: "0.7" version_added: "0.7"
required: false required: false
choices: [ "yes", "no" ] choices: [ "yes", "no" ]
@ -125,7 +130,7 @@ def url_filename(url):
return 'index.html' return 'index.html'
return fn return fn
def url_do_get(module, url, dest, use_proxy): def url_do_get(module, url, dest, use_proxy, last_mod_time):
""" """
Get url and return request and info Get url and return request and info
Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@ -171,9 +176,8 @@ def url_do_get(module, url, dest, use_proxy):
request = urllib2.Request(url) request = urllib2.Request(url)
request.add_header('User-agent', USERAGENT) request.add_header('User-agent', USERAGENT)
if os.path.exists(dest) and not module.params['force']: if last_mod_time:
t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest)) tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
request.add_header('If-Modified-Since', tstamp) request.add_header('If-Modified-Since', tstamp)
try: try:
@ -190,14 +194,14 @@ def url_do_get(module, url, dest, use_proxy):
return r, info return r, info
def url_get(module, url, dest, use_proxy): def url_get(module, url, dest, use_proxy, last_mod_time):
""" """
Download url and store at dest. Download data from the url and store in a temporary file.
If dest is a directory, determine filename from url.
Return (tempfile, info about the request) Return (tempfile, info about the request)
""" """
req, info = url_do_get(module, url, dest, use_proxy) req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)
# TODO: should really handle 304, but how? src file could exist (and be newer) but empty # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
if info['status'] == 304: if info['status'] == 304:
@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
req.close() req.close()
return tempname, info return tempname, info
def extract_filename_from_headers(headers):
    """
    Extracts a filename from the given dict of HTTP headers.

    Looks up the 'content-disposition' key and applies a regex to pull out
    the quoted filename parameter of an "attachment" disposition
    (RFC 2616, section 19.5.1). Any directory components supplied by the
    server are stripped.

    headers: dict of HTTP headers; the lookup uses the lower-case key
             'content-disposition'.

    Returns the bare filename if successful, else None. None is also
    returned when the name reduces to '', '.' or '..', so that the caller
    can fall back to another way of choosing a filename.
    """
    # Stop at the first closing quote so that trailing parameters
    # (e.g. '; size="5"') are not swallowed into the filename.
    cont_disp_regex = r'attachment; ?filename="([^"]+)"'

    cont_disp = headers.get('content-disposition')
    if not cont_disp:
        return None

    match = re.match(cont_disp_regex, cont_disp)
    if not match:
        return None

    # Try preventing any funny business: keep only the last path component.
    res = os.path.basename(match.group(1))

    # An empty name or a directory reference would make the joined
    # destination point at the target directory (or its parent) rather
    # than at a file inside it.
    if res in ('', '.', '..'):
        return None
    return res
# ============================================================== # ==============================================================
# main # main
@ -247,15 +270,30 @@ def main():
sha256sum = module.params['sha256sum'] sha256sum = module.params['sha256sum']
use_proxy = module.params['use_proxy'] use_proxy = module.params['use_proxy']
if os.path.isdir(dest): dest_is_dir = os.path.isdir(dest)
dest = os.path.join(dest, url_filename(url)) last_mod_time = None
if not dest_is_dir and os.path.exists(dest):
if not force: if not force:
if os.path.exists(dest):
module.exit_json(msg="file already exists", dest=dest, url=url, changed=False) module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)
# If the file already exists, prepare the last modified time for the
# request.
mtime = os.path.getmtime(dest)
last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
# download to tmpsrc # download to tmpsrc
tmpsrc, info = url_get(module, url, dest, use_proxy) tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
# Now the request has completed, we can finally generate the final
# destination file name from the info dict.
if dest_is_dir:
filename = extract_filename_from_headers(info)
if not filename:
# Fall back to extracting the filename from the URL.
filename = url_filename(url)
dest = os.path.join(dest, filename)
md5sum_src = None md5sum_src = None
md5sum_dest = None md5sum_dest = None