source: trunk/src/allmydata/util/encodingutil.py @ f036dfa

Last change on this file since f036dfa was f036dfa, checked in by david-sarah <david-sarah@…>, at 2010-07-25T01:03:18Z

Fix test failures due to Unicode basedir patches.

  • Property mode set to 100644
File size: 8.6 KB
Line 
1"""
2Functions used to convert inputs from whatever encoding used in the system to
3unicode and back.
4"""
5
6import sys
7import os
8import re
9from allmydata.util.assertutil import precondition
10from twisted.python import usage
11import locale
12from allmydata.util import log
13from allmydata.util.fileutil import abspath_expanduser_unicode
14
15
16def _canonical_encoding(encoding):
17    if encoding is None:
18        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
19        encoding = 'utf-8'
20    encoding = encoding.lower()
21    if encoding == "cp65001":
22        encoding = 'utf-8'
23    elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968":
24        encoding = 'ascii'
25
26    # sometimes Python returns an encoding name that it doesn't support for conversion
27    # fail early if this happens
28    try:
29        u"test".encode(encoding)
30    except (LookupError, AttributeError):
31        raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,))
32
33    return encoding
34
# Module-level encoding settings. The values below are only placeholders;
# _reload() assigns the real ones.
filesystem_encoding = None   # encoding for local filenames
output_encoding = None       # encoding for writing to stdout or stderr
argv_encoding = None         # encoding for command-line arguments
is_unicode_platform = False  # whether the platform handles Unicode filenames natively
39
def _reload():
    """
    (Re)compute the module-level encoding settings.

    Sets filesystem_encoding from sys.getfilesystemencoding(), output_encoding
    from sys.stdout.encoding (or, if that is missing or None, from
    locale.getpreferredencoding()), argv_encoding, and is_unicode_platform.
    Every encoding name is passed through _canonical_encoding().
    """
    global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform

    filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())

    stdout_encoding = None
    if hasattr(sys.stdout, 'encoding'):
        stdout_encoding = sys.stdout.encoding
    if stdout_encoding is None:
        try:
            stdout_encoding = locale.getpreferredencoding()
        except Exception:
            # work around <http://bugs.python.org/issue1443504>
            pass
    output_encoding = _canonical_encoding(stdout_encoding)

    on_windows = (sys.platform == 'win32')
    # Unicode arguments are not supported on Windows yet; see #565 and #1074.
    argv_encoding = 'ascii' if on_windows else output_encoding
    is_unicode_platform = on_windows or sys.platform == "darwin"

# Populate the settings once, at import time.
_reload()
63
64
def get_filesystem_encoding():
    """
    Returns expected encoding for local filenames.

    This is the module-level filesystem_encoding value, which _reload()
    derives from sys.getfilesystemencoding().
    """
    return filesystem_encoding
70
def get_output_encoding():
    """
    Returns expected encoding for writing to stdout or stderr.

    This is the module-level output_encoding value, which _reload() derives
    from sys.stdout.encoding, or from locale.getpreferredencoding() when
    stdout does not report an encoding.
    """
    return output_encoding
76
def get_argv_encoding():
    """
    Returns expected encoding for command-line arguments.

    This is the module-level argv_encoding value: _reload() sets it to 'ascii'
    when sys.platform is 'win32', and to the output encoding otherwise.
    """
    return argv_encoding
82
def argv_to_unicode(s):
    """
    Convert one argv element (a byte string) to unicode using argv_encoding.

    Raises usage.UsageError if the bytes are not valid in that encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        message = "Argument %s cannot be decoded as %s." % (quote_output(s), argv_encoding)
        raise usage.UsageError(message)
    else:
        return decoded
94
def argv_to_abspath(s):
    """
    Decode an argv element and turn it into an absolute path with ~ expanded.

    Decoding is done by argv_to_unicode(), so a UsageError is raised if the
    argument cannot be decoded.
    """
    decoded = argv_to_unicode(s)
    return abspath_expanduser_unicode(decoded)
101
102def unicode_to_argv(s, mangle=False):
103    """
104    Encode the given Unicode argument as a bytestring.
105    If the argument is to be passed to a different process, then the 'mangle' argument
106    should be true; on Windows, this uses a mangled encoding that will be reversed by
107    code in runner.py.
108    """
109    precondition(isinstance(s, unicode), s)
110
111    if mangle and sys.platform == "win32":
112        # This must be the same as 'mangle' in bin/tahoe-script.template.
113        return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
114    else:
115        return s.encode(argv_encoding)
116
def unicode_to_url(s):
    """
    Encode a unicode object used in a URL.

    Currently this delegates to to_str(), so None and str inputs are returned
    unchanged and any other input is encoded as UTF-8.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: the stricter implementation below (which requires a unicode
    # argument) is disabled; to_str() is used instead.
    return to_str(s)
    #precondition(isinstance(s, unicode), s)
    #return s.encode('utf-8')
127
def to_str(s):
    """
    Return s as a str: None and str values are passed through unchanged,
    anything else is encoded as UTF-8.
    """
    needs_encoding = s is not None and not isinstance(s, str)
    if needs_encoding:
        return s.encode('utf-8')
    return s
132
# Whole-string patterns. PRINTABLE_ASCII accepts only newline, carriage return
# and the characters \x20-\x7E; PRINTABLE_8BIT additionally accepts \x80-\xFF.
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if s consists only of newlines, carriage returns and the
    characters \\x20-\\x7E (the empty string qualifies), else False.
    """
    return bool(PRINTABLE_ASCII.search(s))
138
def unicode_to_output(s):
    """
    Encode a unicode object with output_encoding for writing to stdout or
    stderr.

    Raises UnicodeEncodeError if the string cannot be encoded, or if the
    encoded bytes contain anything other than newline, carriage return,
    \\x20-\\x7E or \\x80-\\xFF.
    """
    precondition(isinstance(s, unicode), s)

    try:
        encoded = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        reason = ("A string could not be encoded as %s for output to the terminal:\n%r" %
                  (output_encoding, repr(s)))
        raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)

    if PRINTABLE_8BIT.search(encoded) is not None:
        return encoded

    reason = ("A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
              (output_encoding, repr(s)))
    raise UnicodeEncodeError(output_encoding, s, 0, 0, reason)
157
158
159def _unicode_escape(m):
160    u = m.group(0)
161    if u == '"' or u == '$' or u == '`' or u == '\\':
162        return u'\\' + u
163    if len(u) == 2:
164        codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
165    else:
166        codepoint = ord(u)
167    if codepoint > 0xFFFF:
168        return u'\\U%08x' % (codepoint,)
169    elif codepoint > 0xFF:
170        return u'\\u%04x' % (codepoint,)
171    else:
172        return u'\\x%02x' % (codepoint,)
173
174def _str_escape(m):
175    c = m.group(0)
176    if c == '"' or c == '$' or c == '`' or c == '\\':
177        return '\\' + c
178    else:
179        return '\\x%02x' % (ord(c),)
180
# Matches any character that rules out the single-quoted form in quote_output:
# that is, anything other than \x20-\x7E minus the single quote (\x27), or the
# ranges U+00A0-U+D7FF, U+E000-U+FDCF and U+FDF0-U+FFFC. So control characters,
# ', DEL, U+0080-U+009F, surrogates, U+FDD0-U+FDEF and U+FFFD-U+FFFF all match.
MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# Matches each unit that needs a backslash escape inside double quotes: either
# a surrogate pair (matched as one two-character unit, which _unicode_escape
# combines into a single code point), or one character outside the safe set.
# Of the characters in \x20-\x7E, only ", $, \ and ` are outside the safe set.
ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart, used by quote_output for input that is not valid
# UTF-8: matches ", $, \, `, control bytes, DEL and every byte >= \x80.
ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
189
def quote_output(s, quotemarks=True, encoding=None):
    """
    Produce a quoted, terminal-safe byte string for s, which may be either a
    Unicode string or a UTF-8-encoded bytestring. Errors are tolerated rather
    than raised.

    The result is encoded with 'encoding' if given, else with the module's
    output encoding. If 'quotemarks' is True the result is always quoted;
    otherwise quotes are added only where needed to avoid ambiguity or control
    bytes in the output.

    Three forms are possible:
      - single quotes: all characters stand for themselves, and ' will not
        appear inside;
      - double quotes: Python-compatible backslash escaping is used;
      - b"...": for a bytestring that is not valid UTF-8, with escaping
        applied to the raw bytes.
    """
    precondition(isinstance(s, (str, unicode)), s)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            # not UTF-8: show the raw bytes, escaped
            return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)

    target_encoding = encoding or output_encoding

    if MUST_DOUBLE_QUOTE.search(s) is None:
        try:
            out = s.encode(target_encoding)
        except (UnicodeDecodeError, UnicodeEncodeError):
            # not representable as-is; fall through to the escaped form
            pass
        else:
            if quotemarks or out.startswith('"'):
                return "'%s'" % (out,)
            return out

    escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
    return '"%s"' % (escaped.encode(target_encoding, 'backslashreplace'),)
220
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments: each segment is converted
    with to_str(), the segments are joined with "/", and the result is passed
    to quote_output() with the given 'quotemarks' setting.
    """
    joined = "/".join([to_str(segment) for segment in path])
    return quote_output(joined, quotemarks=quotemarks)
223
224
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Returns the module-level is_unicode_platform flag, which _reload() sets to
    True when sys.platform is "win32" or "darwin".
    """
    return is_unicode_platform
230
# Raised by listdir_unicode_fallback() with the offending path or filename as
# its argument.
# NOTE(review): the '%s' in the docstring suggests it is used elsewhere as a
# message template (filled in with the encoding name) -- confirm before
# rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    pass
237
def listdir_unicode_fallback(path):
    """
    This function emulates a fallback Unicode API similar to one available
    under Windows or MacOS X.

    'path' must be a unicode object. It is encoded with filesystem_encoding,
    listed with the byte-level os.listdir(), and each resulting name is
    decoded back to unicode with the same encoding.

    Returns a list of unicode filenames.

    Raises FilenameEncodingError if 'path' cannot be encoded (the argument is
    the path), or if a badly encoded filename is encountered (the argument is
    that filename).
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # Decode one name at a time so that the failing name is always known
    # explicitly, rather than relying on a list-comprehension variable still
    # being visible in the except handler.
    names = []
    for fn in os.listdir(byte_path):
        try:
            names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return names
256
def listdir_unicode(path):
    """
    List a directory given as a unicode path, returning unicode names whether
    or not the platform provides a Unicode listdir() natively.
    """
    precondition(isinstance(path, unicode), path)

    if not is_unicode_platform:
        # On other platforms (ie. Unix systems), the byte-level API is used
        return listdir_unicode_fallback(path)

    # On Windows and MacOS X, the Unicode API is used
    return os.listdir(path)
Note: See TracBrowser for help on using the repository browser.