Fix to bytes surrogate and nonencodable chars (#21180)

* Add a surrogate_then_replace error strategy to keep to_bytes from tracebacking by default
* Port all code that explicitly used surrogate_or_replace to surrogate_then_replace
2017-02-09 17:13:40 -08:00 · 2017-02-09 17:13:40 -08:00 · 98541b7c8b
commit 98541b7c8b
parent 149dd9ca86
5 changed files with 94 additions and 39 deletions
--- a/lib/ansible/inventory/ini.py
+++ b/lib/ansible/inventory/ini.py
@ -69,7 +69,7 @@ class InventoryParser(object):
            for line in b_data.splitlines():
                if line and line[0] in self.b_COMMENT_MARKERS:
                    # Replace is okay for comment lines
-                    #data.append(to_text(line, errors='surrogate_or_replace'))
+                    #data.append(to_text(line, errors='surrogate_then_replace'))
                    # Currently we only need these lines for accurate lineno in errors
                    data.append(u'')
                else:
--- a/lib/ansible/module_utils/_text.py
+++ b/lib/ansible/module_utils/_text.py
@ -44,6 +44,11 @@ except LookupError:
    HAS_SURROGATEESCAPE = False


+_COMPOSED_ERROR_HANDLERS = frozenset((None, 'surrogate_or_replace',
+                                     'surrogate_or_strict',
+                                     'surrogate_then_replace'))  # pseudo-handlers that to_bytes/to_text map onto real codec error handlers
+
+
 def to_bytes(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
    """Make sure that a string is a byte string

@ -56,22 +61,35 @@ def to_bytes(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
    :kwarg errors: The error handler to use if the text string is not
        encodable using the specified encoding.  Any valid `codecs error
        handler <https://docs.python.org/2/library/codecs.html#codec-base-classes>`_
-        may be specified. There are two additional error strategies
-        specifically aimed at helping people to port code:
+        may be specified. There are three additional error strategies
+        specifically aimed at helping people to port code.  The first two are:

-            :surrogate_or_strict: Will use surrogateescape if it is a valid
-                handler, otherwise it will use strict
-            :surrogate_or_replace: Will use surrogateescape if it is a valid
-                handler, otherwise it will use replace.
+            :surrogate_or_strict: Will use ``surrogateescape`` if it is a valid
+                handler, otherwise it will use ``strict``
+            :surrogate_or_replace: Will use ``surrogateescape`` if it is a valid
+                handler, otherwise it will use ``replace``.

-        Because surrogateescape was added in Python3 this usually means that
-        Python3 will use surrogateescape and Python2 will use the fallback
-        error handler. Note that the code checks for surrogateescape when the
-        module is imported.  If you have a backport of surrogateescape for
-        python2, be sure to register the error handler prior to importing this
+        Because ``surrogateescape`` was added in Python3 this usually means that
+        Python3 will use ``surrogateescape`` and Python2 will use the fallback
+        error handler. Note that the code checks for ``surrogateescape`` when the
+        module is imported.  If you have a backport of ``surrogateescape`` for
+        Python2, be sure to register the error handler prior to importing this
        module.

-        The default is `surrogate_or_replace`
+        The last error handler is:
+
+            :surrogate_then_replace: Will use ``surrogateescape`` if it is a valid
+                handler.  If encoding with ``surrogateescape`` would traceback,
+                surrogates are first replaced with replacement characters
+                and then the string is encoded using ``replace`` (which replaces
+                the rest of the nonencodable bytes).  If ``surrogateescape`` is
+                not present it will simply use ``replace``.  (Added in Ansible 2.3)
+                This strategy is designed to never traceback when it attempts
+                to encode a string.
+
+        The default until Ansible-2.2 was ``surrogate_or_replace``.
+        From Ansible-2.3 onwards, the default is ``surrogate_then_replace``.
+
    :kwarg nonstring: The strategy to use if a nonstring is specified in
        ``obj``.  Default is 'simplerepr'.  Valid values are:

@ -90,23 +108,36 @@ def to_bytes(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
        byte string is in the specified encoding do::

            encoded_string = to_bytes(to_text(input_string, 'latin-1'), 'utf-8')
+
+    .. versionchanged:: 2.3
+
+        Added the ``surrogate_then_replace`` error handler and made it the default error handler.
    """
    if isinstance(obj, binary_type):
        return obj

-    if errors in (None, 'surrogate_or_replace'):
+    # We're given a text string
+    # If it has surrogates, we know because it will decode
+    original_errors = errors
+    if errors in _COMPOSED_ERROR_HANDLERS:
        if HAS_SURROGATEESCAPE:
            errors = 'surrogateescape'
+        elif errors == 'surrogate_or_strict':
+            errors = 'strict'
        else:
            errors = 'replace'
-    elif errors == 'surrogate_or_strict':
-        if HAS_SURROGATEESCAPE:
-            errors = 'surrogateescape'
-        else:
-            errors = 'strict'

    if isinstance(obj, text_type):
-        return obj.encode(encoding, errors)
+        try:
+            # Try this first as it's the fastest
+            return obj.encode(encoding, errors)
+        except UnicodeEncodeError:
+            if original_errors in (None, 'surrogate_then_replace'):
+                # Slow but works
+                return_string = obj.encode('utf-8', 'surrogateescape')
+                return_string = return_string.decode('utf-8', 'replace')
+                return return_string.encode(encoding, 'replace')
+            raise

    # Note: We do these last even though we have to call to_bytes again on the
    # value because we're optimizing the common case
@ -144,8 +175,27 @@ def to_text(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
    :kwarg errors: The error handler to use if the byte string is not
        decodable using the specified encoding.  Any valid `codecs error
        handler <https://docs.python.org/2/library/codecs.html#codec-base-classes>`_
-        may be specified. On Python3 this defaults to 'surrogateescape'.  On
-        Python2, this defaults to 'replace'.
+        may be specified.   We support three additional error strategies
+        specifically aimed at helping people to port code:
+
+            :surrogate_or_strict: Will use surrogateescape if it is a valid
+                handler, otherwise it will use strict
+            :surrogate_or_replace: Will use surrogateescape if it is a valid
+                handler, otherwise it will use replace.
+            :surrogate_then_replace: Does the same as surrogate_or_replace but
+                was added for symmetry with the error handlers in
+                :func:`ansible.module_utils._text.to_bytes` (Added in Ansible 2.3)
+
+        Because surrogateescape was added in Python3 this usually means that
+        Python3 will use `surrogateescape` and Python2 will use the fallback
+        error handler. Note that the code checks for surrogateescape when the
+        module is imported.  If you have a backport of `surrogateescape` for
+        python2, be sure to register the error handler prior to importing this
+        module.
+
+        The default until Ansible-2.2 was `surrogate_or_replace`
+        In Ansible-2.3 this defaults to `surrogate_then_replace` for symmetry
+        with :func:`ansible.module_utils._text.to_bytes` .
    :kwarg nonstring: The strategy to use if a nonstring is specified in
        ``obj``.  Default is 'simplerepr'.  Valid values are:

@ -158,22 +208,27 @@ def to_text(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
    :returns: Typically this returns a text string.  If a nonstring object is
        passed in this may be a different type depending on the strategy
        specified by nonstring.  This will never return a byte string.
+        From Ansible-2.3 onwards, the default is `surrogate_then_replace`.
+
+    .. versionchanged:: 2.3
+
+        Added the surrogate_then_replace error handler and made it the default error handler.
    """
    if isinstance(obj, text_type):
        return obj

-    if errors in (None, 'surrogate_or_replace'):
+    if errors in _COMPOSED_ERROR_HANDLERS:
        if HAS_SURROGATEESCAPE:
            errors = 'surrogateescape'
+        elif errors == 'surrogate_or_strict':
+            errors = 'strict'
        else:
            errors = 'replace'
-    elif errors == 'surrogate_or_strict':
-        if HAS_SURROGATEESCAPE:
-            errors = 'surrogateescape'
-        else:
-            errors = 'strict'

    if isinstance(obj, binary_type):
+        # Note: We don't need special handling for surrogate_then_replace
+        # because all bytes will either be made into surrogates or are valid
+        # to decode.
        return obj.decode(encoding, errors)

    # Note: We do these last even though we have to call to_text again on the
--- a/lib/ansible/module_utils/basic.py
+++ b/lib/ansible/module_utils/basic.py
@ -403,9 +403,9 @@ def remove_values(value, no_log_strings):
            native_str_value = native_str_value.replace(omit_me, '*' * 8)

        if value_is_text and isinstance(native_str_value, binary_type):
-            value = to_text(native_str_value, encoding='utf-8', errors='surrogate_or_replace')
+            value = to_text(native_str_value, encoding='utf-8', errors='surrogate_then_replace')
        elif not value_is_text and isinstance(native_str_value, text_type):
-            value = to_bytes(native_str_value, encoding='utf-8', errors='surrogate_or_replace')
+            value = to_bytes(native_str_value, encoding='utf-8', errors='surrogate_then_replace')
        else:
            value = native_str_value
    elif isinstance(value, SEQUENCETYPE):
--- a/lib/ansible/module_utils/facts.py
+++ b/lib/ansible/module_utils/facts.py
@ -406,7 +406,7 @@ class Facts(object):
    def get_lsb_facts(self):
        lsb_path = self.module.get_bin_path('lsb_release')
        if lsb_path:
-            rc, out, err = self.module.run_command([lsb_path, "-a"], errors='surrogate_or_replace')
+            rc, out, err = self.module.run_command([lsb_path, "-a"], errors='surrogate_then_replace')
            if rc == 0:
                self.facts['lsb'] = {}
                for line in out.splitlines():
@ -484,7 +484,7 @@ class Facts(object):
    def get_caps_facts(self):
        capsh_path = self.module.get_bin_path('capsh')
        if capsh_path:
-            rc, out, err = self.module.run_command([capsh_path, "--print"], errors='surrogate_or_replace')
+            rc, out, err = self.module.run_command([capsh_path, "--print"], errors='surrogate_then_replace')
            enforced_caps = []
            enforced = 'NA'
            for line in out.splitlines():
@ -1329,7 +1329,7 @@ class LinuxHardware(Hardware):
    def _run_findmnt(self, findmnt_path):
        args = ['--list', '--noheadings', '--notruncate']
        cmd = [findmnt_path] + args
-        rc, out, err = self.module.run_command(cmd, errors='surrogate_or_replace')
+        rc, out, err = self.module.run_command(cmd, errors='surrogate_then_replace')
        return rc, out, err

    def _find_bind_mounts(self):
@ -1423,7 +1423,7 @@ class LinuxHardware(Hardware):
        self.facts['devices'] = {}
        lspci = self.module.get_bin_path('lspci')
        if lspci:
-            rc, pcidata, err = self.module.run_command([lspci, '-D'], errors='surrogate_or_replace')
+            rc, pcidata, err = self.module.run_command([lspci, '-D'], errors='surrogate_then_replace')
        else:
            pcidata = None

@ -2482,7 +2482,7 @@ class LinuxNetwork(Network):
                continue
            if v == 'v6' and not socket.has_ipv6:
                continue
-            rc, out, err = self.module.run_command(command[v], errors='surrogate_or_replace')
+            rc, out, err = self.module.run_command(command[v], errors='surrogate_then_replace')
            if not out:
                # v6 routing may result in
                #   RTNETLINK answers: Invalid argument
@ -2647,10 +2647,10 @@ class LinuxNetwork(Network):
            ip_path = self.module.get_bin_path("ip")

            args = [ip_path, 'addr', 'show', 'primary', device]
-            rc, primary_data, stderr = self.module.run_command(args, errors='surrogate_or_replace')
+            rc, primary_data, stderr = self.module.run_command(args, errors='surrogate_then_replace')

            args = [ip_path, 'addr', 'show', 'secondary', device]
-            rc, secondary_data, stderr = self.module.run_command(args, errors='surrogate_or_replace')
+            rc, secondary_data, stderr = self.module.run_command(args, errors='surrogate_then_replace')

            parse_ip_output(primary_data)
            parse_ip_output(secondary_data, secondary=True)
@ -2672,7 +2672,7 @@ class LinuxNetwork(Network):
        ethtool_path = self.module.get_bin_path("ethtool")
        if ethtool_path:
            args = [ethtool_path, '-k', device]
-            rc, stdout, stderr = self.module.run_command(args, errors='surrogate_or_replace')
+            rc, stdout, stderr = self.module.run_command(args, errors='surrogate_then_replace')
            if rc == 0:
                for line in stdout.strip().splitlines():
                    if not line or line.endswith(":"):
--- a/lib/ansible/plugins/action/__init__.py
+++ b/lib/ansible/plugins/action/__init__.py
@ -818,7 +818,7 @@ class ActionBase(with_metaclass(ABCMeta, object)):
                data['rc'] = res['rc']
        return data

-    def _low_level_execute_command(self, cmd, sudoable=True, in_data=None, executable=None, encoding_errors='surrogate_or_replace'):
+    def _low_level_execute_command(self, cmd, sudoable=True, in_data=None, executable=None, encoding_errors='surrogate_then_replace'):
        '''
        This is the function which executes the low level shell command, which
        may be commands to create/remove directories for temporary files, or to