Modified the get_url module to respect the content-disposition header if the destination is a directory and the server provides it.

See http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html, section 19.5.1.
This commit is contained in:
Tin Tvrtkovic 2013-11-02 17:23:59 +01:00
parent 1cc894f54b
commit c85655f720

View file

@ -49,15 +49,20 @@ options:
dest:
description:
- absolute path of where to download the file to.
- If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set.
- If C(dest) is a directory, either the server provided filename or, if
none provided, the base name of the URL on the remote server will be
used. If a directory, C(force) has no effect.
required: true
default: null
force:
description:
- If C(yes), will download the file every time and replace the
file if the contents change. If C(no), the file will only be downloaded if
the destination does not exist. Generally should be C(yes) only for small
local files. Prior to 0.6, this module behaved as if C(yes) was the default.
- If C(yes) and C(dest) is not a directory, will download the file every
time and replace the file if the contents change. If C(no), the file
will only be downloaded if the destination does not exist. Generally
should be C(yes) only for small local files. Prior to 0.6, this module
behaved as if C(yes) was the default.
Has no effect if C(dest) is a directory - the file will always be
downloaded, but replaced only if the contents changed.
version_added: "0.7"
required: false
choices: [ "yes", "no" ]
@ -125,7 +130,7 @@ def url_filename(url):
return 'index.html'
return fn
def url_do_get(module, url, dest, use_proxy):
def url_do_get(module, url, dest, use_proxy, last_mod_time):
"""
Get url and return request and info
Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@ -171,9 +176,8 @@ def url_do_get(module, url, dest, use_proxy):
request = urllib2.Request(url)
request.add_header('User-agent', USERAGENT)
if os.path.exists(dest) and not module.params['force']:
t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest))
tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
if last_mod_time:
tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
request.add_header('If-Modified-Since', tstamp)
try:
@ -190,14 +194,14 @@ def url_do_get(module, url, dest, use_proxy):
return r, info
def url_get(module, url, dest, use_proxy):
def url_get(module, url, dest, use_proxy, last_mod_time):
"""
Download url and store at dest.
If dest is a directory, determine filename from url.
Download data from the url and store in a temporary file.
Return (tempfile, info about the request)
"""
req, info = url_do_get(module, url, dest, use_proxy)
req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)
# TODO: should really handle 304, but how? src file could exist (and be newer) but empty
if info['status'] == 304:
@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
req.close()
return tempname, info
def extract_filename_from_headers(headers):
"""
Extracts a filename from the given dict of HTTP headers.
Looks for the content-disposition header and applies a regex.
Returns the filename if successful, else None."""
cont_disp_regex = 'attachment; ?filename="(.+)"'
res = None
if 'content-disposition' in headers:
cont_disp = headers['content-disposition']
match = re.match(cont_disp_regex, cont_disp)
if match:
res = match.group(1)
# Try preventing any funny business.
res = os.path.basename(res)
return res
# ==============================================================
# main
@ -247,15 +270,30 @@ def main():
sha256sum = module.params['sha256sum']
use_proxy = module.params['use_proxy']
if os.path.isdir(dest):
dest = os.path.join(dest, url_filename(url))
dest_is_dir = os.path.isdir(dest)
last_mod_time = None
if not force:
if os.path.exists(dest):
if not dest_is_dir and os.path.exists(dest):
if not force:
module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)
# If the file already exists, prepare the last modified time for the
# request.
mtime = os.path.getmtime(dest)
last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
# download to tmpsrc
tmpsrc, info = url_get(module, url, dest, use_proxy)
tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
# Now the request has completed, we can finally generate the final
# destination file name from the info dict.
if dest_is_dir:
filename = extract_filename_from_headers(info)
if not filename:
# Fall back to extracting the filename from the URL.
filename = url_filename(url)
dest = os.path.join(dest, filename)
md5sum_src = None
md5sum_dest = None