Merge pull request #4785 from Tinche/get_url/uri-content-disposition

Modified the get_url module to respect the content-disposition header if...
This commit is contained in:
jctanner 2013-11-14 10:02:15 -08:00
commit 0ee5792849

View file

@ -49,15 +49,20 @@ options:
dest: dest:
description: description:
- absolute path of where to download the file to. - absolute path of where to download the file to.
- If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set. - If C(dest) is a directory, either the server provided filename or, if
none provided, the base name of the URL on the remote server will be
used. If a directory, C(force) has no effect.
required: true required: true
default: null default: null
force: force:
description: description:
- If C(yes), will download the file every time and replace the - If C(yes) and C(dest) is not a directory, will download the file every
file if the contents change. If C(no), the file will only be downloaded if time and replace the file if the contents change. If C(no), the file
the destination does not exist. Generally should be C(yes) only for small will only be downloaded if the destination does not exist. Generally
local files. Prior to 0.6, this module behaved as if C(yes) was the default. should be C(yes) only for small local files. Prior to 0.6, this module
behaved as if C(yes) was the default.
Has no effect if C(dest) is a directory - the file will always be
downloaded, but replaced only if the contents changed.
version_added: "0.7" version_added: "0.7"
required: false required: false
choices: [ "yes", "no" ] choices: [ "yes", "no" ]
@ -125,7 +130,7 @@ def url_filename(url):
return 'index.html' return 'index.html'
return fn return fn
def url_do_get(module, url, dest, use_proxy): def url_do_get(module, url, dest, use_proxy, last_mod_time):
""" """
Get url and return request and info Get url and return request and info
Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
@ -171,33 +176,32 @@ def url_do_get(module, url, dest, use_proxy):
request = urllib2.Request(url) request = urllib2.Request(url)
request.add_header('User-agent', USERAGENT) request.add_header('User-agent', USERAGENT)
if os.path.exists(dest) and not module.params['force']: if last_mod_time:
t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest)) tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
request.add_header('If-Modified-Since', tstamp) request.add_header('If-Modified-Since', tstamp)
try: try:
r = urllib2.urlopen(request) r = urllib2.urlopen(request)
info.update(r.info()) info.update(r.info())
info['url'] = r.geturl() # The URL goes in too, because of redirects.
info.update(dict(msg="OK (%s bytes)" % r.headers.get('Content-Length', 'unknown'), status=200)) info.update(dict(msg="OK (%s bytes)" % r.headers.get('Content-Length', 'unknown'), status=200))
except urllib2.HTTPError, e: except urllib2.HTTPError, e:
# Must not fail_json() here so caller can handle HTTP 304 unmodified # Must not fail_json() here so caller can handle HTTP 304 unmodified
info.update(dict(msg=str(e), status=e.code)) info.update(dict(msg=str(e), status=e.code))
return r, info
except urllib2.URLError, e: except urllib2.URLError, e:
code = getattr(e, 'code', -1) code = getattr(e, 'code', -1)
module.fail_json(msg="Request failed: %s" % str(e), status_code=code) module.fail_json(msg="Request failed: %s" % str(e), status_code=code)
return r, info return r, info
def url_get(module, url, dest, use_proxy): def url_get(module, url, dest, use_proxy, last_mod_time):
""" """
Download url and store at dest. Download data from the url and store in a temporary file.
If dest is a directory, determine filename from url.
Return (tempfile, info about the request) Return (tempfile, info about the request)
""" """
req, info = url_do_get(module, url, dest, use_proxy) req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)
# TODO: should really handle 304, but how? src file could exist (and be newer) but empty # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
if info['status'] == 304: if info['status'] == 304:
@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
req.close() req.close()
return tempname, info return tempname, info
def extract_filename_from_headers(headers):
    """
    Extracts a filename from the given dict of HTTP headers.

    Looks for the content-disposition header and applies a regex that
    expects the form: attachment; filename="<name>". Only the text between
    the first pair of double quotes after "filename=" is used, so any
    further parameters following the filename are ignored.

    The directory part of the name, if any, is stripped so that a server
    cannot steer the download outside of the destination directory.

    Returns the filename if successful, else None. None is also returned
    when the header yields no usable name (empty, '.' or '..').
    """
    # [^"]+ instead of .+ so the match stops at the closing quote of the
    # filename rather than at the last quote anywhere in the header value.
    cont_disp_regex = r'attachment; ?filename="([^"]+)"'

    cont_disp = headers.get('content-disposition')
    if not cont_disp:
        return None

    match = re.match(cont_disp_regex, cont_disp)
    if not match:
        return None

    # Try preventing any funny business.
    res = os.path.basename(match.group(1))
    if res in ('', '.', '..'):
        # Nothing left that could name a regular file; let the caller fall
        # back to another source for the filename.
        return None
    return res
# ============================================================== # ==============================================================
# main # main
@ -247,15 +270,33 @@ def main():
sha256sum = module.params['sha256sum'] sha256sum = module.params['sha256sum']
use_proxy = module.params['use_proxy'] use_proxy = module.params['use_proxy']
if os.path.isdir(dest): dest_is_dir = os.path.isdir(dest)
dest = os.path.join(dest, url_filename(url)) last_mod_time = None
if not force: if not dest_is_dir and os.path.exists(dest):
if os.path.exists(dest): if not force:
module.exit_json(msg="file already exists", dest=dest, url=url, changed=False) module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)
# If the file already exists, prepare the last modified time for the
# request.
mtime = os.path.getmtime(dest)
last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
# download to tmpsrc # download to tmpsrc
tmpsrc, info = url_get(module, url, dest, use_proxy) tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
# Now the request has completed, we can finally generate the final
# destination file name from the info dict.
if dest_is_dir:
filename = extract_filename_from_headers(info)
if not filename:
# Fall back to extracting the filename from the URL.
# Pluck the URL from the info, since a redirect could have changed
# it.
filename = url_filename(info['url'])
dest = os.path.join(dest, filename)
md5sum_src = None md5sum_src = None
md5sum_dest = None md5sum_dest = None