| 
10 | 10 | import locale  | 
11 | 11 | import os  | 
12 | 12 | import sys  | 
 | 13 | +import codecs  | 
 | 14 | + | 
13 | 15 | 
 
  | 
14 | 16 | from gitdb.utils.compat import (  | 
15 | 17 |     xrange,  | 
@@ -67,7 +69,7 @@ def safe_decode(s):  | 
67 | 69 |     if isinstance(s, unicode):  | 
68 | 70 |         return s  | 
69 | 71 |     elif isinstance(s, bytes):  | 
70 |  | -        return s.decode(defenc, 'replace')  | 
 | 72 | +        return s.decode(defenc, 'surrogateescape')  | 
71 | 73 |     elif s is not None:  | 
72 | 74 |         raise TypeError('Expected bytes or text, but got %r' % (s,))  | 
73 | 75 | 
 
  | 
@@ -121,3 +123,191 @@ def __str__(self):  | 
121 | 123 |     else:  # Python 2  | 
122 | 124 |         def __str__(self):  | 
123 | 125 |             return self.__unicode__().encode(defenc)  | 
 | 126 | +              | 
 | 127 | +              | 
 | 128 | +"""  | 
 | 129 | +This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error  | 
 | 130 | +handler of Python 3.  | 
 | 131 | +Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc  | 
 | 132 | +"""  | 
 | 133 | + | 
 | 134 | +# This code is released under the Python license and the BSD 2-clause license  | 
 | 135 | + | 
 | 136 | + | 
 | 137 | +FS_ERRORS = 'surrogateescape'  | 
 | 138 | + | 
 | 139 | +#     # -- Python 2/3 compatibility -------------------------------------  | 
 | 140 | +#     FS_ERRORS = 'my_surrogateescape'  | 
 | 141 | + | 
 | 142 | +def u(text):  | 
 | 143 | +    if PY3:  | 
 | 144 | +        return text  | 
 | 145 | +    else:  | 
 | 146 | +        return text.decode('unicode_escape')  | 
 | 147 | + | 
 | 148 | +def b(data):  | 
 | 149 | +    if PY3:  | 
 | 150 | +        return data.encode('latin1')  | 
 | 151 | +    else:  | 
 | 152 | +        return data  | 
 | 153 | + | 
 | 154 | +if PY3:  | 
 | 155 | +    _unichr = chr  | 
 | 156 | +    bytes_chr = lambda code: bytes((code,))  | 
 | 157 | +else:  | 
 | 158 | +    _unichr = unichr  | 
 | 159 | +    bytes_chr = chr  | 
 | 160 | + | 
 | 161 | +def surrogateescape_handler(exc):  | 
 | 162 | +    """  | 
 | 163 | +    Pure Python implementation of the PEP 383: the "surrogateescape" error  | 
 | 164 | +    handler of Python 3. Undecodable bytes will be replaced by a Unicode  | 
 | 165 | +    character U+DCxx on decoding, and these are translated into the  | 
 | 166 | +    original bytes on encoding.  | 
 | 167 | +    """  | 
 | 168 | +    mystring = exc.object[exc.start:exc.end]  | 
 | 169 | + | 
 | 170 | +    try:  | 
 | 171 | +        if isinstance(exc, UnicodeDecodeError):  | 
 | 172 | +            # mystring is a byte-string in this case  | 
 | 173 | +            decoded = replace_surrogate_decode(mystring)  | 
 | 174 | +        elif isinstance(exc, UnicodeEncodeError):  | 
 | 175 | +            # In the case of u'\udcc3'.encode('ascii',  | 
 | 176 | +            # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an  | 
 | 177 | +            # exception anyway after this function is called, even though I think  | 
 | 178 | +            # it's doing what it should. It seems that the strict encoder is called  | 
 | 179 | +            # to encode the unicode string that this function returns ...  | 
 | 180 | +            decoded = replace_surrogate_encode(mystring)  | 
 | 181 | +        else:  | 
 | 182 | +            raise exc  | 
 | 183 | +    except NotASurrogateError:  | 
 | 184 | +        raise exc  | 
 | 185 | +    return (decoded, exc.end)  | 
 | 186 | + | 
 | 187 | + | 
 | 188 | +class NotASurrogateError(Exception):  | 
 | 189 | +    pass  | 
 | 190 | + | 
 | 191 | + | 
 | 192 | +def replace_surrogate_encode(mystring):  | 
 | 193 | +    """  | 
 | 194 | +    Returns a (unicode) string, not the more logical bytes, because the codecs  | 
 | 195 | +    register_error functionality expects this.  | 
 | 196 | +    """  | 
 | 197 | +    decoded = []  | 
 | 198 | +    for ch in mystring:  | 
 | 199 | +        # if PY3:  | 
 | 200 | +        #     code = ch  | 
 | 201 | +        # else:  | 
 | 202 | +        code = ord(ch)  | 
 | 203 | + | 
 | 204 | +        # The following magic comes from Py3.3's Python/codecs.c file:  | 
 | 205 | +        if not 0xD800 <= code <= 0xDCFF:  | 
 | 206 | +            # Not a surrogate. Fail with the original exception.  | 
 | 207 | +            raise exc  | 
 | 208 | +        # mybytes = [0xe0 | (code >> 12),  | 
 | 209 | +        #            0x80 | ((code >> 6) & 0x3f),  | 
 | 210 | +        #            0x80 | (code & 0x3f)]  | 
 | 211 | +        # Is this a good idea?  | 
 | 212 | +        if 0xDC00 <= code <= 0xDC7F:  | 
 | 213 | +            decoded.append(_unichr(code - 0xDC00))  | 
 | 214 | +        elif code <= 0xDCFF:  | 
 | 215 | +            decoded.append(_unichr(code - 0xDC00))  | 
 | 216 | +        else:  | 
 | 217 | +            raise NotASurrogateError  | 
 | 218 | +    return str().join(decoded)  | 
 | 219 | + | 
 | 220 | + | 
 | 221 | +def replace_surrogate_decode(mybytes):  | 
 | 222 | +    """  | 
 | 223 | +    Returns a (unicode) string  | 
 | 224 | +    """  | 
 | 225 | +    decoded = []  | 
 | 226 | +    for ch in mybytes:  | 
 | 227 | +        # We may be parsing newbytes (in which case ch is an int) or a native  | 
 | 228 | +        # str on Py2  | 
 | 229 | +        if isinstance(ch, int):  | 
 | 230 | +            code = ch  | 
 | 231 | +        else:  | 
 | 232 | +            code = ord(ch)  | 
 | 233 | +        if 0x80 <= code <= 0xFF:  | 
 | 234 | +            decoded.append(_unichr(0xDC00 + code))  | 
 | 235 | +        elif code <= 0x7F:  | 
 | 236 | +            decoded.append(_unichr(code))  | 
 | 237 | +        else:  | 
 | 238 | +            # # It may be a bad byte  | 
 | 239 | +            # # Try swallowing it.  | 
 | 240 | +            # continue  | 
 | 241 | +            # print("RAISE!")  | 
 | 242 | +            raise NotASurrogateError  | 
 | 243 | +    return str().join(decoded)  | 
 | 244 | + | 
 | 245 | + | 
 | 246 | +def encodefilename(fn):  | 
 | 247 | +    if FS_ENCODING == 'ascii':  | 
 | 248 | +        # ASCII encoder of Python 2 expects that the error handler returns a  | 
 | 249 | +        # Unicode string encodable to ASCII, whereas our surrogateescape error  | 
 | 250 | +        # handler has to return bytes in 0x80-0xFF range.  | 
 | 251 | +        encoded = []  | 
 | 252 | +        for index, ch in enumerate(fn):  | 
 | 253 | +            code = ord(ch)  | 
 | 254 | +            if code < 128:  | 
 | 255 | +                ch = bytes_chr(code)  | 
 | 256 | +            elif 0xDC80 <= code <= 0xDCFF:  | 
 | 257 | +                ch = bytes_chr(code - 0xDC00)  | 
 | 258 | +            else:  | 
 | 259 | +                raise UnicodeEncodeError(FS_ENCODING,  | 
 | 260 | +                    fn, index, index+1,  | 
 | 261 | +                    'ordinal not in range(128)')  | 
 | 262 | +            encoded.append(ch)  | 
 | 263 | +        return bytes().join(encoded)  | 
 | 264 | +    elif FS_ENCODING == 'utf-8':  | 
 | 265 | +        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF  | 
 | 266 | +        # doesn't go through our error handler  | 
 | 267 | +        encoded = []  | 
 | 268 | +        for index, ch in enumerate(fn):  | 
 | 269 | +            code = ord(ch)  | 
 | 270 | +            if 0xD800 <= code <= 0xDFFF:  | 
 | 271 | +                if 0xDC80 <= code <= 0xDCFF:  | 
 | 272 | +                    ch = bytes_chr(code - 0xDC00)  | 
 | 273 | +                    encoded.append(ch)  | 
 | 274 | +                else:  | 
 | 275 | +                    raise UnicodeEncodeError(  | 
 | 276 | +                        FS_ENCODING,  | 
 | 277 | +                        fn, index, index+1, 'surrogates not allowed')  | 
 | 278 | +            else:  | 
 | 279 | +                ch_utf8 = ch.encode('utf-8')  | 
 | 280 | +                encoded.append(ch_utf8)  | 
 | 281 | +        return bytes().join(encoded)  | 
 | 282 | +    else:  | 
 | 283 | +        return fn.encode(FS_ENCODING, FS_ERRORS)  | 
 | 284 | + | 
 | 285 | +def decodefilename(fn):  | 
 | 286 | +    return fn.decode(FS_ENCODING, FS_ERRORS)  | 
 | 287 | + | 
 | 288 | +FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')  | 
 | 289 | +# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')  | 
 | 290 | +# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')  | 
 | 291 | + | 
 | 292 | + | 
 | 293 | +# normalize the filesystem encoding name.  | 
 | 294 | +# For example, we expect "utf-8", not "UTF8".  | 
 | 295 | +FS_ENCODING = codecs.lookup(FS_ENCODING).name  | 
 | 296 | + | 
 | 297 | + | 
 | 298 | +def register_surrogateescape():  | 
 | 299 | +    """  | 
 | 300 | +    Registers the surrogateescape error handler on Python 2 (only)  | 
 | 301 | +    """  | 
 | 302 | +    if PY3:  | 
 | 303 | +        return  | 
 | 304 | +    try:  | 
 | 305 | +        codecs.lookup_error(FS_ERRORS)  | 
 | 306 | +    except LookupError:  | 
 | 307 | +        codecs.register_error(FS_ERRORS, surrogateescape_handler)  | 
 | 308 | + | 
 | 309 | + | 
 | 310 | +try:  | 
 | 311 | +    "hello".decode(defenc, "surrogateescape")  | 
 | 312 | +except:  | 
 | 313 | +    register_surrogateescape()  | 
0 commit comments