Context Navigation

source: trunk/src/allmydata/util/encodingutil.py @ f036dfa

Visit:

Last change on this file since f036dfa was f036dfa, checked in by david-sarah <david-sarah@…>, at 2010-07-25T01:03:18Z
Fix test failures due to Unicode basedir patches.
Property mode set to `100644`
File size: 8.6 KB

Line
1	"""
2	Functions used to convert inputs from whatever encoding used in the system to
3	unicode and back.
4	"""
5
6	import sys
7	import os
8	import re
9	from allmydata.util.assertutil import precondition
10	from twisted.python import usage
11	import locale
12	from allmydata.util import log
13	from allmydata.util.fileutil import abspath_expanduser_unicode
14
15
def _canonical_encoding(encoding):
    """
    Normalise an encoding name reported by the platform.

    None is replaced by 'utf-8' (after logging a warning), the name is
    lower-cased, and a few known aliases are mapped to 'utf-8' or 'ascii'.
    Raises AssertionError if Python cannot actually encode text with the
    resulting name. Returns the normalised name.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        encoding = 'utf-8'

    name = encoding.lower()
    if name == "cp65001":
        name = 'utf-8'
    elif name in ("us-ascii", "646", "ansi_x3.4-1968"):
        name = 'ascii'

    # Python sometimes reports an encoding name that it cannot use for
    # conversion; probe it now so that the failure happens early.
    try:
        u"test".encode(name)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (name,))

    return name
34
# Module-level encoding state. These are only placeholders; _reload() assigns
# the real values.
filesystem_encoding = output_encoding = argv_encoding = None
is_unicode_platform = False
39
def _reload():
    """
    (Re)compute the module-level encoding settings from the running
    interpreter: filesystem_encoding, output_encoding, argv_encoding and
    is_unicode_platform.
    """
    global filesystem_encoding, output_encoding, argv_encoding, is_unicode_platform

    filesystem_encoding = _canonical_encoding(sys.getfilesystemencoding())

    # Prefer the encoding advertised by stdout; fall back to the locale's
    # preferred encoding when stdout has none.
    stdout_encoding = getattr(sys.stdout, 'encoding', None)
    if stdout_encoding is None:
        try:
            stdout_encoding = locale.getpreferredencoding()
        except Exception:
            pass # work around <http://bugs.python.org/issue1443504>
    output_encoding = _canonical_encoding(stdout_encoding)

    if sys.platform != 'win32':
        argv_encoding = output_encoding
    else:
        # Unicode arguments are not supported on Windows yet; see #565 and #1074.
        argv_encoding = 'ascii'

    is_unicode_platform = sys.platform in ("win32", "darwin")

# Populate the settings once, at import time.
_reload()
63
64
def get_filesystem_encoding():
    """
    Return the module's current filesystem_encoding value, i.e. the encoding
    expected for local filenames.
    """
    return filesystem_encoding
70
def get_output_encoding():
    """
    Return the module's current output_encoding value, i.e. the encoding
    expected when writing to stdout or stderr.
    """
    return output_encoding
76
def get_argv_encoding():
    """
    Return the module's current argv_encoding value, i.e. the encoding
    expected for command-line arguments.
    """
    return argv_encoding
82
def argv_to_unicode(s):
    """
    Decode one argv element (a byte string) to unicode using argv_encoding.
    Raises usage.UsageError if the bytes are not valid in that encoding.
    """
    precondition(isinstance(s, str), s)

    try:
        decoded = unicode(s, argv_encoding)
    except UnicodeDecodeError:
        raise usage.UsageError("Argument %s cannot be decoded as %s." %
                               (quote_output(s), argv_encoding))
    return decoded
94
def argv_to_abspath(s):
    """
    Decode an argv element with argv_to_unicode() and pass the result through
    abspath_expanduser_unicode(). A UsageError from the decoding step
    propagates to the caller.
    """
    decoded = argv_to_unicode(s)
    return abspath_expanduser_unicode(decoded)
101
102	def unicode_to_argv(s, mangle=False):
103	"""
104	Encode the given Unicode argument as a bytestring.
105	If the argument is to be passed to a different process, then the 'mangle' argument
106	should be true; on Windows, this uses a mangled encoding that will be reversed by
107	code in runner.py.
108	"""
109	precondition(isinstance(s, unicode), s)
110
111	if mangle and sys.platform == "win32":
112	# This must be the same as 'mangle' in bin/tahoe-script.template.
113	return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s))
114	else:
115	return s.encode(argv_encoding)
116
def unicode_to_url(s):
    """
    Encode a unicode object for use in a URL.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: a stricter version, kept here for reference, would be:
    #     precondition(isinstance(s, unicode), s)
    #     return s.encode('utf-8')
    # For now the work is delegated to to_str().
    encoded = to_str(s)
    return encoded
127
def to_str(s):
    """
    Return None and str values unchanged; anything else is encoded to UTF-8
    via its encode() method.
    """
    if s is not None and not isinstance(s, str):
        return s.encode('utf-8')
    return s
132
# Whole-string patterns: only LF, CR and the printable ASCII range are allowed
# (the 8-bit variant additionally allows the range 0x80-0xFF).
PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if the whole of s matches PRINTABLE_ASCII, otherwise False.
    """
    found = PRINTABLE_ASCII.search(s)
    return found is not None
138
def unicode_to_output(s):
    """
    Encode a unicode object with output_encoding for display on stdout or
    stderr.

    Raises UnicodeEncodeError if the string cannot be encoded, or if the
    encoded bytes do not match PRINTABLE_8BIT.
    """
    precondition(isinstance(s, unicode), s)

    try:
        encoded = s.encode(output_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise UnicodeEncodeError(output_encoding, s, 0, 0,
                                 "A string could not be encoded as %s for output to the terminal:\n%r" %
                                 (output_encoding, repr(s)))

    if PRINTABLE_8BIT.search(encoded) is not None:
        return encoded

    # The encoding succeeded, but the result is not safe to print.
    raise UnicodeEncodeError(output_encoding, s, 0, 0,
                             "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" %
                             (output_encoding, repr(s)))
157
158
def _unicode_escape(m):
    """
    Regex substitution callback: return the backslash-escaped form of the
    matched text.

    The characters double-quote, dollar, backtick and backslash are simply
    prefixed with a backslash. Anything else is rendered as a hexadecimal
    escape whose width (2, 4 or 8 digits) depends on the code point. A
    two-character match is treated as a surrogate pair and combined into a
    single code point first.
    """
    matched = m.group(0)
    if matched in (u'"', u'$', u'`', u'\\'):
        return u'\\' + matched

    if len(matched) == 2:
        lead, trail = ord(matched[0]), ord(matched[1])
        codepoint = (lead - 0xD800) * 0x400 + trail - 0xDC00 + 0x10000
    else:
        codepoint = ord(matched)

    if codepoint <= 0xFF:
        return u'\\x%02x' % (codepoint,)
    if codepoint <= 0xFFFF:
        return u'\\u%04x' % (codepoint,)
    return u'\\U%08x' % (codepoint,)
173
def _str_escape(m):
    """
    Regex substitution callback for byte strings: double-quote, dollar,
    backtick and backslash get a backslash prefix; any other matched
    character becomes a two-digit hexadecimal escape.
    """
    char = m.group(0)
    if char not in ('"', '$', '`', '\\'):
        return '\\x%02x' % (ord(char),)
    return '\\' + char
180
# The unicode patterns below are written as non-raw u'' literals with the regex
# backslashes doubled. That way the \uXXXX escapes are resolved by the string
# literal itself (Python 2's re module does not understand \u escapes), and the
# source also parses on Python 3, which has no ur'' prefix.

# Matches any character that cannot appear inside single quotes: everything
# except printable ASCII other than ' (\x27) and the listed non-ASCII BMP ranges.
# A string containing such a character has to be double-quoted.
MUST_DOUBLE_QUOTE = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# (backslash itself is escaped as well). The two alternatives are joined with a
# plain '|' (alternation); an escaped '\|' would only match a literal pipe and
# the pattern would then never match a lone escapable character.
ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Byte-string counterpart of the second alternative above: anything outside the
# safe printable-ASCII subset.
ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
189
def quote_output(s, quotemarks=True, encoding=None):
    """
    Render a Unicode string, or a UTF-8-encoded bytestring, for display on
    stdout or stderr without raising on awkward input.

    With quotemarks=True the result is always quoted. With quotemarks=False
    quotes are added only where needed to avoid ambiguity or control bytes in
    the output. Either quote style may be used: inside single quotes every
    character stands for itself and ' never appears; inside double quotes,
    Python-compatible backslash escaping is used.

    'encoding' overrides output_encoding when given.
    """
    precondition(isinstance(s, (str, unicode)), s)

    if isinstance(s, str):
        try:
            s = s.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: show the escaped raw bytes with a b prefix.
            escaped_bytes = ESCAPABLE_8BIT.sub(_str_escape, s)
            return 'b"%s"' % (escaped_bytes,)

    target_encoding = encoding or output_encoding

    # Simple case: nothing in the string forces double quotes, so try to emit
    # it verbatim (single-quoted if requested or needed).
    if MUST_DOUBLE_QUOTE.search(s) is None:
        try:
            encoded = s.encode(target_encoding)
            if not quotemarks and not encoded.startswith('"'):
                return encoded
            return "'%s'" % (encoded,)
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Not representable in the target encoding; use the escaped form.
            pass

    escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
    body = escaped.encode(target_encoding, 'backslashreplace')
    return '"%s"' % (body,)
220
def quote_path(path, quotemarks=True):
    """
    Join the segments of 'path' with "/" (each converted with to_str()) and
    return the result of quote_output() on the joined string.
    """
    segments = [to_str(segment) for segment in path]
    return quote_output("/".join(segments), quotemarks=quotemarks)
223
224
def unicode_platform():
    """
    Return the module's is_unicode_platform flag: whether the current platform
    handles Unicode filenames natively.
    """
    return is_unicode_platform
230
# NOTE(review): the docstring below contains a '%s' placeholder, so it is
# presumably used as a message template somewhere -- keep its text unchanged.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
237
def listdir_unicode_fallback(path):
    """
    This function emulates a fallback Unicode API similar to one available
    under Windows or MacOS X.

    'path' must be a unicode object. It is encoded with filesystem_encoding,
    listed with the byte-level os.listdir(), and each returned name is decoded
    back to unicode with the same encoding.

    Raises FilenameEncodingError, carrying the offending name, if 'path' cannot
    be encoded or if a badly encoded filename is encountered.
    """
    precondition(isinstance(path, unicode), path)

    try:
        byte_path = path.encode(filesystem_encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        raise FilenameEncodingError(path)

    # Decode in an explicit loop so that the failing name is a real local
    # variable in the handler, rather than a loop variable leaked out of a
    # list comprehension.
    names = []
    for fn in os.listdir(byte_path):
        try:
            names.append(unicode(fn, filesystem_encoding))
        except UnicodeDecodeError:
            raise FilenameEncodingError(fn)
    return names
256
def listdir_unicode(path):
    """
    List a directory given as a unicode path, returning the Unicode API's
    results even on platforms that do not provide that API natively.
    """
    precondition(isinstance(path, unicode), path)

    # On other platforms (ie. Unix systems), the byte-level API is used
    if not is_unicode_platform:
        return listdir_unicode_fallback(path)

    # On Windows and MacOS X, the Unicode API is used
    return os.listdir(path)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: