|  | 
| 10 | 10 | import locale | 
| 11 | 11 | import os | 
| 12 | 12 | import sys | 
|  | 13 | +import codecs | 
|  | 14 | + | 
| 13 | 15 | 
 | 
| 14 | 16 | from gitdb.utils.compat import ( | 
| 15 | 17 |     xrange, | 
| @@ -67,7 +69,7 @@ def safe_decode(s): | 
| 67 | 69 |     if isinstance(s, unicode): | 
| 68 | 70 |         return s | 
| 69 | 71 |     elif isinstance(s, bytes): | 
| 70 |  | -        return s.decode(defenc, 'replace') | 
|  | 72 | +        return s.decode(defenc, 'surrogateescape') | 
| 71 | 73 |     elif s is not None: | 
| 72 | 74 |         raise TypeError('Expected bytes or text, but got %r' % (s,)) | 
| 73 | 75 | 
 | 
| @@ -121,3 +123,191 @@ def __str__(self): | 
| 121 | 123 |     else:  # Python 2 | 
| 122 | 124 |         def __str__(self): | 
| 123 | 125 |             return self.__unicode__().encode(defenc) | 
|  | 126 | +             | 
|  | 127 | +             | 
|  | 128 | +""" | 
|  | 129 | +This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error | 
|  | 130 | +handler of Python 3. | 
|  | 131 | +Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc | 
|  | 132 | +""" | 
|  | 133 | + | 
|  | 134 | +# This code is released under the Python license and the BSD 2-clause license | 
|  | 135 | + | 
|  | 136 | + | 
# Name of the codec error handler that the filename helpers pass to
# encode()/decode() and that register_surrogateescape() registers.
FS_ERRORS = "surrogateescape"
|  | 138 | + | 
|  | 139 | +#     # -- Python 2/3 compatibility ------------------------------------- | 
|  | 140 | +#     FS_ERRORS = 'my_surrogateescape' | 
|  | 141 | + | 
def u(text):
    """
    Return `text` as a unicode string on either major Python version.

    On Python 3 the argument is returned untouched. On Python 2 it is decoded
    with the 'unicode_escape' codec, so escape sequences written inside a
    native str literal are interpreted.
    """
    return text if PY3 else text.decode('unicode_escape')
|  | 147 | + | 
def b(data):
    """
    Return `data` as a byte string on either major Python version.

    On Python 2 the argument is returned untouched. On Python 3 it is encoded
    as latin1, which maps each code point below 256 to the byte of the same
    value.
    """
    if not PY3:
        return data
    return data.encode('latin1')
|  | 153 | + | 
# Select the "code point -> one-character unicode string" and
# "integer -> one-byte string" constructors for the running interpreter.
if not PY3:
    bytes_chr = chr
    _unichr = unichr
else:
    bytes_chr = lambda code: bytes((code,))
    _unichr = chr
|  | 160 | + | 
def surrogateescape_handler(exc):
    """
    Pure Python implementation of PEP 383: the "surrogateescape" error
    handler of Python 3.

    Undecodable bytes are replaced by a Unicode character U+DCxx on decoding,
    and these are translated back on encoding.

    `exc` is the UnicodeDecodeError / UnicodeEncodeError handed over by the
    codec machinery. Returns a `(replacement, resume_position)` tuple. The
    original `exc` is re-raised when it is neither of those two types or when
    the offending slice cannot be converted.
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte-string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called: it seems the strict
        # encoder is then applied to the unicode string returned from here.
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        # Not something this handler can escape: report the original error.
        raise exc
    return (replacement, exc.end)
|  | 186 | + | 
|  | 187 | + | 
|  | 188 | +class NotASurrogateError(Exception): | 
|  | 189 | +    pass | 
|  | 190 | + | 
|  | 191 | + | 
|  | 192 | +def replace_surrogate_encode(mystring): | 
|  | 193 | +    """ | 
|  | 194 | +    Returns a (unicode) string, not the more logical bytes, because the codecs | 
|  | 195 | +    register_error functionality expects this. | 
|  | 196 | +    """ | 
|  | 197 | +    decoded = [] | 
|  | 198 | +    for ch in mystring: | 
|  | 199 | +        # if PY3: | 
|  | 200 | +        #     code = ch | 
|  | 201 | +        # else: | 
|  | 202 | +        code = ord(ch) | 
|  | 203 | + | 
|  | 204 | +        # The following magic comes from Py3.3's Python/codecs.c file: | 
|  | 205 | +        if not 0xD800 <= code <= 0xDCFF: | 
|  | 206 | +            # Not a surrogate. Fail with the original exception. | 
|  | 207 | +            raise exc | 
|  | 208 | +        # mybytes = [0xe0 | (code >> 12), | 
|  | 209 | +        #            0x80 | ((code >> 6) & 0x3f), | 
|  | 210 | +        #            0x80 | (code & 0x3f)] | 
|  | 211 | +        # Is this a good idea? | 
|  | 212 | +        if 0xDC00 <= code <= 0xDC7F: | 
|  | 213 | +            decoded.append(_unichr(code - 0xDC00)) | 
|  | 214 | +        elif code <= 0xDCFF: | 
|  | 215 | +            decoded.append(_unichr(code - 0xDC00)) | 
|  | 216 | +        else: | 
|  | 217 | +            raise NotASurrogateError | 
|  | 218 | +    return str().join(decoded) | 
|  | 219 | + | 
|  | 220 | + | 
def replace_surrogate_decode(mybytes):
    """
    Escape a run of bytes into a (unicode) string.

    Values 0x00..0x7F become the character with the same code point, values
    0x80..0xFF become the lone surrogate U+DC80..U+DCFF. Anything above 0xFF
    raises NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Iterating bytes yields ints on Python 3 (and for "newbytes"), but
        # one-character native strings on Python 2.
        code = item if isinstance(item, int) else ord(item)
        if code <= 0x7F:
            pieces.append(_unichr(code))
        elif code <= 0xFF:
            pieces.append(_unichr(0xDC00 + code))
        else:
            raise NotASurrogateError
    return str().join(pieces)
|  | 244 | + | 
|  | 245 | + | 
def encodefilename(fn):
    """
    Encode the unicode filename `fn` to bytes using FS_ENCODING, turning
    escaped surrogates U+DC80..U+DCFF back into the bytes 0x80..0xFF.

    'ascii' and 'utf-8' are handled by hand; every other encoding is delegated
    to `fn.encode(FS_ENCODING, FS_ERRORS)`.

    Raises UnicodeEncodeError for a character that cannot be represented.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for pos, char in enumerate(fn):
            cp = ord(char)
            if 0xDC80 <= cp <= 0xDCFF:
                cp -= 0xDC00
            elif cp >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                    fn, pos, pos+1,
                    'ordinal not in range(128)')
            chunks.append(bytes_chr(cp))
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for pos, char in enumerate(fn):
            cp = ord(char)
            if 0xDC80 <= cp <= 0xDCFF:
                chunks.append(bytes_chr(cp - 0xDC00))
            elif 0xD800 <= cp <= 0xDFFF:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos+1, 'surrogates not allowed')
            else:
                chunks.append(char.encode('utf-8'))
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)
|  | 284 | + | 
def decodefilename(fn):
    """Decode the byte filename `fn` with FS_ENCODING and the FS_ERRORS handler."""
    decoded = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded
|  | 287 | + | 
# Encoding used by encodefilename()/decodefilename(), plus a sample byte
# filename and its escaped unicode form.
# NOTE(review): `fn` and `encoded` look like leftover sample data - confirm
# nothing imports them before removing.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name to the codec registry's canonical
# spelling. For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
|  | 296 | + | 
|  | 297 | + | 
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing on Python 3, and nothing when a handler is already
    registered under the FS_ERRORS name.
    """
    if not PY3:
        try:
            # Raises LookupError when no handler of that name exists yet.
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
|  | 308 | + | 
|  | 309 | + | 
# Import-time probe: decode a byte string containing 0x9f with the
# "surrogateescape" handler. If the codec machinery cannot do that (an unknown
# error handler name raises LookupError), fall back to registering the pure
# Python handler. Only codec-related failures are caught, so that
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except (LookupError, UnicodeError):
    register_surrogateescape()
0 commit comments