Merge pull request #4785 from Tinche/get_url/uri-content-disposition
Modified the get_url module to respect the content-disposition header if...
commit 3a5e689b80
1 changed file with 60 additions and 19 deletions
@@ -49,15 +49,20 @@ options:
   dest:
     description:
       - absolute path of where to download the file to.
-      - If I(dest) is a directory, the basename of the file on the remote server will be used. If a directory, C(force=yes) must also be set.
+      - If C(dest) is a directory, either the server provided filename or, if
+        none provided, the base name of the URL on the remote server will be
+        used. If a directory, C(force) has no effect.
     required: true
     default: null
   force:
     description:
-      - If C(yes), will download the file every time and replace the
-        file if the contents change. If C(no), the file will only be downloaded if
-        the destination does not exist. Generally should be C(yes) only for small
-        local files. Prior to 0.6, this module behaved as if C(yes) was the default.
+      - If C(yes) and C(dest) is not a directory, will download the file every
+        time and replace the file if the contents change. If C(no), the file
+        will only be downloaded if the destination does not exist. Generally
+        should be C(yes) only for small local files. Prior to 0.6, this module
+        behaved as if C(yes) was the default.
+        Has no effect if C(dest) is a directory - the file will always be
+        downloaded, but replaced only if the contents changed.
     version_added: "0.7"
     required: false
     choices: [ "yes", "no" ]
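To make the documented fallback concrete, here is a rough Python 3 sketch of what the module's url_filename() helper does (illustration only; the example URLs are invented and this is not code from the patch):

    import os
    from urllib.parse import urlsplit

    # Rough stand-in for the module's url_filename() helper: take the base name
    # of the URL path, or fall back to 'index.html' when the path has none.
    def url_basename(url):
        name = os.path.basename(urlsplit(url).path)
        return name or 'index.html'

    print(url_basename('http://example.com/pub/files/archive.tgz'))  # archive.tgz
    print(url_basename('http://example.com/'))                       # index.html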
@@ -125,7 +130,7 @@ def url_filename(url):
         return 'index.html'
     return fn

-def url_do_get(module, url, dest, use_proxy):
+def url_do_get(module, url, dest, use_proxy, last_mod_time):
     """
     Get url and return request and info
     Credits: http://stackoverflow.com/questions/7006574/how-to-download-file-from-ftp
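A hedged illustration of what the new last_mod_time argument is expected to carry (not part of the patch; the helper name is invented): either None, or a UTC datetime derived from an existing file's mtime.

    import datetime
    import os

    # Invented helper, for illustration only: mirror how a caller could derive
    # the last_mod_time value passed into url_do_get()/url_get().
    def last_modified_or_none(path):
        if not os.path.exists(path):
            return None
        return datetime.datetime.utcfromtimestamp(os.path.getmtime(path))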
@@ -171,33 +176,32 @@ def url_do_get(module, url, dest, use_proxy):
     request = urllib2.Request(url)
     request.add_header('User-agent', USERAGENT)

-    if os.path.exists(dest) and not module.params['force']:
-        t = datetime.datetime.utcfromtimestamp(os.path.getmtime(dest))
-        tstamp = t.strftime('%a, %d %b %Y %H:%M:%S +0000')
+    if last_mod_time:
+        tstamp = last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000')
         request.add_header('If-Modified-Since', tstamp)

     try:
         r = urllib2.urlopen(request)
         info.update(r.info())
+        info['url'] = r.geturl() # The URL goes in too, because of redirects.
         info.update(dict(msg="OK (%s bytes)" % r.headers.get('Content-Length', 'unknown'), status=200))
     except urllib2.HTTPError, e:
         # Must not fail_json() here so caller can handle HTTP 304 unmodified
         info.update(dict(msg=str(e), status=e.code))
-        return r, info
     except urllib2.URLError, e:
         code = getattr(e, 'code', -1)
         module.fail_json(msg="Request failed: %s" % str(e), status_code=code)

     return r, info

-def url_get(module, url, dest, use_proxy):
+def url_get(module, url, dest, use_proxy, last_mod_time):
     """
-    Download url and store at dest.
-    If dest is a directory, determine filename from url.
+    Download data from the url and store in a temporary file.
+
     Return (tempfile, info about the request)
     """

-    req, info = url_do_get(module, url, dest, use_proxy)
+    req, info = url_do_get(module, url, dest, use_proxy, last_mod_time)

     # TODO: should really handle 304, but how? src file could exist (and be newer) but empty
     if info['status'] == 304:
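The strftime pattern above produces the RFC 1123-style date that If-Modified-Since expects; a standalone sketch with an arbitrary example timestamp (not taken from the module or its tests):

    import datetime

    # Arbitrary epoch value, formatted the same way url_do_get() formats
    # last_mod_time for the If-Modified-Since header.
    last_mod_time = datetime.datetime.utcfromtimestamp(1385000000)
    print(last_mod_time.strftime('%a, %d %b %Y %H:%M:%S +0000'))
    # -> Thu, 21 Nov 2013 02:13:20 +0000

When the server answers 304 Not Modified, urllib2 raises HTTPError; the handler above records the code in info['status'] instead of failing, so the caller can treat it as "file unchanged".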
@@ -218,6 +222,25 @@ def url_get(module, url, dest, use_proxy):
     req.close()
     return tempname, info

+def extract_filename_from_headers(headers):
+    """
+    Extracts a filename from the given dict of HTTP headers.
+
+    Looks for the content-disposition header and applies a regex.
+    Returns the filename if successful, else None."""
+    cont_disp_regex = 'attachment; ?filename="(.+)"'
+    res = None
+
+    if 'content-disposition' in headers:
+        cont_disp = headers['content-disposition']
+        match = re.match(cont_disp_regex, cont_disp)
+        if match:
+            res = match.group(1)
+            # Try preventing any funny business.
+            res = os.path.basename(res)
+
+    return res
+
 # ==============================================================
 # main

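A quick usage sketch of the regex the new helper applies (the header value is invented for illustration):

    import os
    import re

    cont_disp_regex = 'attachment; ?filename="(.+)"'
    header_value = 'attachment; filename="report-2013.csv"'  # made-up example

    match = re.match(cont_disp_regex, header_value)
    if match:
        # basename() strips any path components a hostile server might send.
        print(os.path.basename(match.group(1)))  # -> report-2013.csv

Note that the pattern only covers the simple quoted attachment form; RFC 6266 extended values such as filename*= would not match, in which case the module falls back to the URL-derived name.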
@@ -247,15 +270,33 @@ def main():
     sha256sum = module.params['sha256sum']
     use_proxy = module.params['use_proxy']

-    if os.path.isdir(dest):
-        dest = os.path.join(dest, url_filename(url))
+    dest_is_dir = os.path.isdir(dest)
+    last_mod_time = None

-    if not force:
-        if os.path.exists(dest):
+    if not dest_is_dir and os.path.exists(dest):
+        if not force:
             module.exit_json(msg="file already exists", dest=dest, url=url, changed=False)

+        # If the file already exists, prepare the last modified time for the
+        # request.
+        mtime = os.path.getmtime(dest)
+        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)
+
     # download to tmpsrc
-    tmpsrc, info = url_get(module, url, dest, use_proxy)
+    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time)
+
+    # Now the request has completed, we can finally generate the final
+    # destination file name from the info dict.
+
+    if dest_is_dir:
+        filename = extract_filename_from_headers(info)
+        if not filename:
+            # Fall back to extracting the filename from the URL.
+            # Pluck the URL from the info, since a redirect could have changed
+            # it.
+            filename = url_filename(info['url'])
+        dest = os.path.join(dest, filename)

     md5sum_src = None
     md5sum_dest = None
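Finally, a condensed sketch of the new destination-resolution flow in main() (hypothetical function and argument names, not the module's code): a filename is appended only when dest is a directory, preferring the server-provided name and falling back to the URL basename.

    import os

    # Hypothetical condensation of the flow above, for illustration only.
    def pick_destination(dest, dest_is_dir, header_filename, url_basename):
        if dest_is_dir:
            # Prefer the Content-Disposition name; fall back to the URL basename.
            dest = os.path.join(dest, header_filename or url_basename)
        return dest

    print(pick_destination('/tmp/downloads', True, 'data.tar.gz', 'index.html'))
    # -> /tmp/downloads/data.tar.gz
    print(pick_destination('/tmp/downloads/file.bin', False, 'data.tar.gz', 'file.bin'))
    # -> /tmp/downloads/file.bin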