1 | Sat Apr 24 01:56:43 CEST 2010 Francois Deppierraz <francois@ctrlaltdel.ch> |
---|
2 | * stringutils.py: Unicode helper functions + associated tests |
---|
3 | |
---|
4 | This file contains a bunch of helper functions which convert |
---|
5 | Unicode strings to and from argv, filenames and stdout. |
---|
6 | diff -rN old-tahoe-534/src/allmydata/test/test_stringutils.py new-tahoe-534/src/allmydata/test/test_stringutils.py |
---|
7 | 0a1,158 |
---|
8 | > # coding=utf-8 |
---|
9 | > |
---|
# Filenames (one of them non-ASCII) used both by the generator script below
# and by the listdir tests.
TEST_FILENAMES = (
    u'Ärtonwall.mp3',
    u'test_file',
    u'Blah blah.txt',
)

# The following main helps to generate a test class for other operating
# systems.  Run it with the single argument "lumière" and paste its output
# into this file.

if __name__ == "__main__":
    import sys, os
    import tempfile
    import shutil
    import platform

    if len(sys.argv) != 2:
        print("Usage: %s lumière" % sys.argv[0])
        sys.exit(1)

    # Single-argument print(...) behaves the same as the print statement
    # under Python 2; print("") emits the same empty line as a bare print.
    print("")
    print("class MyWeirdOS(StringUtils, unittest.TestCase):")
    print(" uname = '%s'" % ' '.join(platform.uname()))
    print(" argv = %s" % repr(sys.argv[1]))
    print(" platform = '%s'" % sys.platform)
    print(" filesystemencoding = '%s'" % sys.getfilesystemencoding())
    print(" stdoutencoding = '%s'" % sys.stdout.encoding)

    # Create the directory outside the try block so that the cleanup below
    # never refers to an unbound name.
    tmpdir = tempfile.mkdtemp()
    try:
        try:
            for fname in TEST_FILENAMES:
                open(os.path.join(tmpdir, fname), 'w').close()

            # Use Unicode API under Windows or MacOS X
            if sys.platform in ('win32', 'darwin'):
                dirlist = os.listdir(unicode(tmpdir))
            else:
                dirlist = os.listdir(tmpdir)

            print(" dirlist = %s" % repr(dirlist))
        except (UnicodeError, EnvironmentError):
            # Only encoding and filesystem failures are expected here;
            # anything else (e.g. KeyboardInterrupt) must propagate.
            print(" # Oops, I cannot write filenames containing non-ascii characters")
        print("")
    finally:
        # Always remove the scratch directory, even on unexpected errors.
        shutil.rmtree(tmpdir)

    sys.exit(0)
---|
55 | > |
---|
56 | > from twisted.trial import unittest |
---|
57 | > from mock import patch |
---|
58 | > import sys |
---|
59 | > |
---|
60 | > from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \ |
---|
61 | > unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode |
---|
62 | > |
---|
class StringUtils():
    """
    Mixin with the platform-independent tests for allmydata.util.stringutils.

    Concrete subclasses combine it with unittest.TestCase and supply the
    recorded values of one platform as class attributes: platform, argv,
    stdoutencoding, filesystemencoding and dirlist.
    """

    def setUp(self):
        # Mock sys.platform because unicode_platform() uses it
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        # Undo the sys.platform override installed by setUp().
        sys.platform = self.original_platform

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        # argv_to_unicode() decodes using the stdout encoding.
        mock.encoding = self.stdoutencoding

        argu = u'lumière'
        argv = self.argv

        self.failUnlessEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # failUnlessEqual, not failUnless: with failUnless the second
        # argument is only the failure message, so nothing was compared.
        self.failUnlessEqual(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))

    @patch('sys.stdout')
    def test_unicode_to_stdout(self, mock):
        mock.encoding = self.stdoutencoding
        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)

    def test_unicode_platform(self):
        # Expected unicode_platform() result for each sys.platform value.
        matrix = {
            'linux2': False,
            'win32': True,
            'darwin': True,
        }

        self.failUnlessEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        # Decorators apply bottom-up, so the os.listdir mock comes first.
        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystemencoding

        filenames = listdir_unicode(u'/dummy')

        for fname in TEST_FILENAMES:
            self.failUnless(isinstance(fname, unicode))

            if fname not in filenames:
                self.fail("Cannot find %r in %r" % (fname, filenames))

    @patch('os.open')
    def test_open_unicode(self, mock):
        # Placeholder: open_unicode() has no assertions yet.
        pass
---|
116 | > |
---|
class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
    """Recorded values for the Linux host in `uname`, UTF-8 encodings."""
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    stdoutencoding = 'UTF-8'
    filesystemencoding = 'UTF-8'
    # Byte strings, as returned by os.listdir() for a byte-string path.
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
    argv = 'lumi\xc3\xa8re'
---|
124 | > |
---|
125 | > |
---|
class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
    """Recorded values for the Linux host in `uname`, ISO-8859-1 encodings."""
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    stdoutencoding = 'ISO-8859-1'
    filesystemencoding = 'ISO-8859-1'
    # Byte strings, as returned by os.listdir() for a byte-string path.
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
    argv = 'lumi\xe8re'
---|
133 | > |
---|
class WindowsXP(StringUtils, unittest.TestCase):
    """Recorded values for the Windows XP host described by `uname`."""
    platform = 'win32'
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    stdoutencoding = 'cp850'
    filesystemencoding = 'mbcs'
    # Unicode strings: the listing was taken with a unicode path.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
    argv = 'lumi\xe8re'
---|
141 | > |
---|
142 | > #class WindowsXP_UTF8(StringUtils, unittest.TestCase): |
---|
143 | > # uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' |
---|
144 | > # argv = 'lumi\xe8re' |
---|
145 | > # platform = 'win32' |
---|
146 | > # filesystemencoding = 'mbcs' |
---|
147 | > # stdoutencoding = 'cp65001' |
---|
148 | > # dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] |
---|
149 | > |
---|
class WindowsVista(StringUtils, unittest.TestCase):
    """Recorded values for the Windows Vista host described by `uname`."""
    platform = 'win32'
    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
    stdoutencoding = 'cp850'
    filesystemencoding = 'mbcs'
    # Unicode strings: the listing was taken with a unicode path.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
    argv = 'lumi\xe8re'
---|
157 | > |
---|
class MacOSXLeopard(StringUtils, unittest.TestCase):
    """Recorded values for the Darwin host described by `uname`."""
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    stdoutencoding = 'UTF-8'
    filesystemencoding = 'utf-8'
    # The first name is stored decomposed: 'A' followed by U+0308.
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
    argv = 'lumi\xc3\xa8re'
---|
165 | > |
---|
166 | diff -rN old-tahoe-534/src/allmydata/util/stringutils.py new-tahoe-534/src/allmydata/util/stringutils.py |
---|
167 | 0a1,144 |
---|
168 | > """ |
---|
169 | > Functions used to convert inputs from whatever encoding used in the system to |
---|
170 | > unicode and back. |
---|
171 | > """ |
---|
172 | > |
---|
173 | > import sys |
---|
174 | > import os |
---|
175 | > import unicodedata |
---|
176 | > from allmydata.util.assertutil import precondition |
---|
177 | > from twisted.python import usage |
---|
178 | > |
---|
def get_stdout_encoding():
    """
    Return the name of the encoding to use when writing to stdout.

    The value is sys.stdout.encoding, except that a missing encoding (None)
    and 'cp65001' are both reported as 'utf-8', and 'cp850' is reported as
    'ISO-8859-1'.
    """
    # If you force Windows cmd.exe set to use UTF-8 by typing 'chcp 65001',
    # sys.stdin.encoding and sys.stdout.encoding will be set to 'cp65001',
    # which is not recognized as being the same as UTF-8.
    #
    # http://msdn.microsoft.com/en-us/library/dd317756%28VS.85%29.aspx
    # Codepage 65001 -> Unicode (UTF-8)
    # Codepage 850 -> OEM Multilingual Latin 1; Western European (DOS)
    reported = sys.stdout.encoding

    if reported is None or reported == 'cp65001':
        return 'utf-8'

    # NOTE(review): cp850 and ISO-8859-1 are distinct encodings; confirm
    # that this substitution is intended.
    if reported == 'cp850':
        return 'ISO-8859-1'

    return reported
---|
201 | > |
---|
def argv_to_unicode(s):
    """
    Decode given argv element to unicode.

    s must be a byte string (checked with precondition()). It is decoded
    using the encoding returned by get_stdout_encoding().

    Raises usage.UsageError if s cannot be decoded with that encoding.
    """
    # sys.argv encoding detection in Python is not trivial, so the stdout
    # encoding is used as the best available guess and an informative error
    # message is given if the argument cannot be correctly decoded.

    precondition(isinstance(s, str), s)

    encoding = get_stdout_encoding()
    try:
        return unicode(s, encoding)
    except UnicodeDecodeError:
        # Decoding raises UnicodeDecodeError (not UnicodeEncodeError); report
        # the encoding that was actually tried.
        raise usage.UsageError("Argument '%s' cannot be decoded as %s." % (s, encoding))
---|
216 | > |
---|
def unicode_to_url(s):
    """
    Return the UTF-8 byte string for a unicode object destined for a URL.

    s must be a unicode object (checked with precondition()).
    """
    # According to RFC 2718, non-ascii characters in url's must be UTF-8 encoded.
    is_unicode = isinstance(s, unicode)
    precondition(is_unicode, s)

    utf8_bytes = s.encode('utf-8')
    return utf8_bytes
---|
225 | > |
---|
def unicode_to_stdout(s):
    """
    Return s encoded as a byte string suitable for writing to stdout.

    s must be a unicode object (checked with precondition()). Characters the
    target encoding cannot represent are substituted ('replace' error mode).
    """
    precondition(isinstance(s, unicode), s)

    try:
        target = get_stdout_encoding()
        encoded = s.encode(target, 'replace')
    except LookupError:
        # The reported encoding name is unknown to Python: fall back to UTF-8.
        encoded = s.encode('utf-8', 'replace') # maybe
    return encoded
---|
237 | > |
---|
def unicode_platform():
    """
    Tell whether sys.platform is one that handles Unicode filenames
    natively, i.e. 'win32' or 'darwin'.
    """
    current = sys.platform
    return current == 'win32' or current == 'darwin'
---|
244 | > |
---|
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    # The docstring above carries a %s placeholder, so its text is left
    # untouched in case it is used as a message template.
---|
252 | > |
---|
def listdir_unicode_unix(path):
    """
    Emulate, under Unix, a Unicode listdir() API similar to the one
    available under Windows or MacOS X.

    path must be a unicode object (checked with precondition()). It is
    encoded with the filesystem encoding, listed at byte level, and every
    entry is decoded back to unicode with that same encoding.

    Raises FilenameEncodingError if path cannot be encoded or if a badly
    encoded filename is encountered in the listing.
    """
    precondition(isinstance(path, unicode), path)

    encoding = sys.getfilesystemencoding()
    try:
        byte_path = path.encode(encoding)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)

    decoded = []
    try:
        for fn in os.listdir(byte_path):
            decoded.append(unicode(fn, encoding))
    except UnicodeDecodeError:
        # fn is the entry whose decoding just failed.
        raise FilenameEncodingError(fn)
    return decoded
---|
272 | > |
---|
def listdir_unicode(path, encoding = None):
    """
    List a directory through a Unicode API on every platform and return the
    entries as NFC-normalized unicode strings.

    path must be a unicode object (checked with precondition()). The
    encoding argument is accepted but not used by this function.
    """
    precondition(isinstance(path, unicode), path)

    if unicode_platform():
        # On Windows and MacOS X, the Unicode API is used
        list_entries = os.listdir
    else:
        # On other platforms (ie. Unix systems), the byte-level API is used
        list_entries = listdir_unicode_unix

    # Normalize the resulting unicode filenames
    #
    # This prevents different OS from generating non-equal unicode strings for
    # the same filename representation
    normalized = []
    for entry in list_entries(path):
        normalized.append(unicodedata.normalize('NFC', entry))
    return normalized
---|
294 | > |
---|
def open_unicode(path, mode='r'):
    """
    Open a file given by a unicode path on every platform, as a wrapper
    around open().

    path must be a unicode object (checked with precondition()). Where
    unicode_platform() is true the path is passed to open() unchanged;
    elsewhere it is first encoded with the filesystem encoding.

    Raises FilenameEncodingError if the path cannot be encoded.
    """
    precondition(isinstance(path, unicode), path)

    if unicode_platform():
        return open(path, mode)

    fs_encoding = sys.getfilesystemencoding()
    try:
        return open(path.encode(fs_encoding), mode)
    except UnicodeEncodeError:
        raise FilenameEncodingError(path)
---|