Ticket #534: unicode-helper-functions.diff

File unicode-helper-functions.diff, 10.1 KB (added by francois, at 2010-04-24T00:11:27Z)

This patch contains Unicode helper functions (stringutils.py) and associated tests

Line 
1Sat Apr 24 01:56:43 CEST 2010  Francois Deppierraz <francois@ctrlaltdel.ch>
2  * stringutils.py: Unicode helper functions + associated tests
3 
4  This file contains a bunch of helper functions which convert
5  unicode strings to and from argv, filenames and stdout.
6diff -rN old-tahoe-534/src/allmydata/test/test_stringutils.py new-tahoe-534/src/allmydata/test/test_stringutils.py
70a1,158
8> # coding=utf-8
9>
10> TEST_FILENAMES = (
11>   u'Ärtonwall.mp3',
12>   u'test_file',
13>   u'Blah blah.txt',
14> )
15>
16> # The following main helps to generate a test class for other operating
17> # systems.
18>
19> if __name__ == "__main__":
20>     import sys, os
21>     import tempfile
22>     import shutil
23>     import platform
24>     
25>     if len(sys.argv) != 2:
26>         print "Usage: %s lumière" % sys.argv[0]
27>         sys.exit(1)
28>     
29>     print
30>     print "class MyWeirdOS(StringUtils, unittest.TestCase):"
31>     print "    uname = '%s'" % ' '.join(platform.uname())
32>     print "    argv = %s" % repr(sys.argv[1])
33>     print "    platform = '%s'" % sys.platform
34>     print "    filesystemencoding = '%s'" % sys.getfilesystemencoding()
35>     print "    stdoutencoding = '%s'" % sys.stdout.encoding
36>
37>     try:
38>         tmpdir = tempfile.mkdtemp()
39>         for fname in TEST_FILENAMES:
40>             open(os.path.join(tmpdir, fname), 'w').close()
41>
42>         # Use Unicode API under Windows or MacOS X
43>         if sys.platform in ('win32', 'darwin'):
44>             dirlist = os.listdir(unicode(tmpdir))
45>         else:
46>             dirlist = os.listdir(tmpdir)
47>
48>         print "    dirlist = %s" % repr(dirlist)
49>     except:
50>         print "    # Oops, I cannot write filenames containing non-ascii characters"
51>     print
52>
53>     shutil.rmtree(tmpdir)
54>     sys.exit(0)
55>
56> from twisted.trial import unittest
57> from mock import patch
58> import sys
59>
60> from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
61>     unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode
62>
63> class StringUtils():
64>     def setUp(self):
65>         # Mock sys.platform because unicode_platform() uses it
66>         self.original_platform = sys.platform
67>         sys.platform = self.platform
68>
69>     def tearDown(self):
70>         sys.platform = self.original_platform
71>
72>     @patch('sys.stdout')
73>     def test_argv_to_unicode(self, mock):
74>         mock.encoding = self.stdoutencoding
75>
76>         argu = u'lumière'
77>         argv = self.argv
78>
79>         self.failUnlessEqual(argv_to_unicode(argv), argu)
80>
81>     def test_unicode_to_url(self):
82>         self.failUnless(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))
83>
84>     @patch('sys.stdout')
85>     def test_unicode_to_stdout(self, mock):
86>         mock.encoding = self.stdoutencoding
87>         self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)
88>
89>     def test_unicode_platform(self):
90>         matrix = {
91>           'linux2': False,
92>           'win32':  True,
93>           'darwin': True,
94>         }
95>
96>         self.failUnlessEqual(unicode_platform(), matrix[self.platform])
97
98>     @patch('sys.getfilesystemencoding')
99>     @patch('os.listdir')
100>     def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
101>
102>         mock_listdir.return_value = self.dirlist
103>         mock_getfilesystemencoding.return_value = self.filesystemencoding
104>       
105>         filenames = listdir_unicode(u'/dummy')
106>
107>         for fname in TEST_FILENAMES:
108>             self.failUnless(isinstance(fname, unicode))
109>
110>             if fname not in filenames:
111>                 self.fail("Cannot find %r in %r" % (fname, filenames))
112>
113>     @patch('os.open')
114>     def test_open_unicode(self, mock):
115>         pass
116>
117> class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
118>     uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
119>     argv = 'lumi\xc3\xa8re'
120>     platform = 'linux2'
121>     filesystemencoding = 'UTF-8'
122>     stdoutencoding = 'UTF-8'
123>     dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
124>
125>
126> class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
127>     uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
128>     argv = 'lumi\xe8re'
129>     platform = 'linux2'
130>     filesystemencoding = 'ISO-8859-1'
131>     stdoutencoding = 'ISO-8859-1'
132>     dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
133>
134> class WindowsXP(StringUtils, unittest.TestCase):
135>     uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
136>     argv = 'lumi\xe8re'
137>     platform = 'win32'
138>     filesystemencoding = 'mbcs'
139>     stdoutencoding = 'cp850'
140>     dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
141>
142> #class WindowsXP_UTF8(StringUtils, unittest.TestCase):
143> #    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
144> #    argv = 'lumi\xe8re'
145> #    platform = 'win32'
146> #    filesystemencoding = 'mbcs'
147> #    stdoutencoding = 'cp65001'
148> #    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
149>
150> class WindowsVista(StringUtils, unittest.TestCase):
151>     uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
152>     argv = 'lumi\xe8re'
153>     platform = 'win32'
154>     filesystemencoding = 'mbcs'
155>     stdoutencoding = 'cp850'
156>     dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
157>
158> class MacOSXLeopard(StringUtils, unittest.TestCase):
159>     uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
160>     argv = 'lumi\xc3\xa8re'
161>     platform = 'darwin'
162>     filesystemencoding = 'utf-8'
163>     stdoutencoding = 'UTF-8'
164>     dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
165>
166diff -rN old-tahoe-534/src/allmydata/util/stringutils.py new-tahoe-534/src/allmydata/util/stringutils.py
1670a1,144
168> """
169> Functions used to convert inputs from whatever encoding used in the system to
170> unicode and back.
171> """
172>
173> import sys
174> import os
175> import unicodedata
176> from allmydata.util.assertutil import precondition
177> from twisted.python import usage
178>
179> def get_stdout_encoding():
180>     """
181>     Returns the encoding expected for writing to stdout. If no valid encoding
182>     could be found, use UTF-8.
183>     """
184>     # If you force Windows cmd.exe to use UTF-8 by typing 'chcp 65001',
185>     # sys.stdin.encoding and sys.stdout.encoding will be set to 'cp65001',
186>     # which is not recognized as being the same as UTF-8.
187>     #
188>     # http://msdn.microsoft.com/en-us/library/dd317756%28VS.85%29.aspx
189>     # Codepage 65001 -> Unicode (UTF-8)
190>     # Codepage 850   -> OEM Multilingual Latin 1; Western European (DOS)
191>
192>     enc = sys.stdout.encoding
193>
194>     if enc is None or enc == 'cp65001':
195>         enc = 'utf-8'
196>
197>     if enc == 'cp850':
198>         enc = 'ISO-8859-1'
199>
200>     return enc
201>
202> def argv_to_unicode(s):
203>     """
204>     Decode given argv element to unicode.
205>     """
206>     # sys.argv encoding detection in Python is not trivial, so the encoding
207>     # returned by get_stdout_encoding() is currently used and an informative
208>     # error message is given if the argument cannot be correctly decoded.
209>
210>     precondition(isinstance(s, str), s)
211>
212>     try:
213>         return unicode(s, get_stdout_encoding())
214>     except UnicodeEncodeError:
215>         raise usageError("Argument '%s' cannot be decoded as UTF-8." % s)
216>
217> def unicode_to_url(s):
218>     """
219>     Encode a unicode object for use in a URL.
220>     """
221>     # According to RFC 2718, non-ASCII characters in URLs must be UTF-8 encoded.
222>
223>     precondition(isinstance(s, unicode), s)
224>     return s.encode('utf-8')
225>
226> def unicode_to_stdout(s):
227>     """
228>     Encode a unicode object for representation on stdout.
229>     """
230>
231>     precondition(isinstance(s, unicode), s)
232>
233>     try:
234>         return s.encode(get_stdout_encoding(), 'replace')
235>     except LookupError:
236>         return s.encode('utf-8', 'replace')  # maybe
237>
238> def unicode_platform():
239>     """
240>     Does the current platform handle Unicode filenames natively?
241>     """
242>
243>     return sys.platform in ('win32', 'darwin')
244>
245> class FilenameEncodingError(Exception):
246>     """
247>     Filename cannot be encoded using the current encoding of your filesystem
248>     (%s). Please configure your locale correctly or rename this file.
249>     """
250>
251>     pass
252>
253> def listdir_unicode_unix(path):
254>     """
255>     This function emulates a Unicode API under Unix similar to the one
256>     available under Windows or MacOS X.
257>
258>     If badly encoded filenames are encountered, an exception is raised.
259>     """
260>     precondition(isinstance(path, unicode), path)
261>
262>     encoding = sys.getfilesystemencoding()
263>     try:
264>         byte_path = path.encode(encoding)
265>     except UnicodeEncodeError:
266>         raise FilenameEncodingError(path)
267>
268>     try:
269>         return [unicode(fn, encoding) for fn in os.listdir(byte_path)]
270>     except UnicodeDecodeError:
271>         raise FilenameEncodingError(fn)
272>
273> def listdir_unicode(path, encoding = None):
274>     """
275>     Wrapper around listdir() which provides safe access to the convenient
276>     Unicode API even under Unix.
277>     """
278>
279>     precondition(isinstance(path, unicode), path)
280>
281>     # On Windows and MacOS X, the Unicode API is used
282>     if unicode_platform():
283>         dirlist = os.listdir(path)
284>
285>     # On other platforms (i.e. Unix systems), the byte-level API is used
286>     else:
287>         dirlist = listdir_unicode_unix(path)
288>
289>     # Normalize the resulting unicode filenames
290>     #
291>     # This prevents different operating systems from generating unequal
292>     # unicode strings for the same filename
293>     return [unicodedata.normalize('NFC', fname) for fname in dirlist]
294>
295> def open_unicode(path, mode='r'):
296>     """
297>     Wrapper around open() which provides safe access to the convenient Unicode
298>     API even under Unix.
299>     """
300>
301>     precondition(isinstance(path, unicode), path)
302>
303>     if unicode_platform():
304>         return open(path, mode)
305>     else:
306>         encoding = sys.getfilesystemencoding()
307>
308>         try:
309>             return open(path.encode(encoding), mode)
310>         except UnicodeEncodeError:
311>             raise FilenameEncodingError(path)