Ticket #1135: encodingutil-reduce-quote-escaping.dpatch

File encodingutil-reduce-quote-escaping.dpatch, 10.3 KB (added by davidsarah, at 2010-07-23T08:17:42Z)

util.encodingutil: change quote_output to do less unnecessary escaping, and to use double quotes more consistently when needed. This version avoids \u-escaping characters that are representable in the output encoding when double quotes are used, and includes tests. Fixes #1135.

Line 
1Fri Jul 23 08:53:14 GMT Daylight Time 2010  david-sarah@jacaranda.org
2  * util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135
3
4New patches:
5
6[util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135
7david-sarah@jacaranda.org**20100723075314
8 Ignore-this: b82205834d17db61612dd16436b7c5a2
9] {
10hunk ./src/allmydata/test/test_encodingutil.py 60
11 
12 from allmydata.test.common_util import ReallyEqualMixin
13 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
14-    unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \
15-    get_output_encoding, get_filesystem_encoding, _reload
16+    unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
17+    FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload
18 from allmydata.dirnode import normalize
19 
20 from twisted.python import usage
21hunk ./src/allmydata/test/test_encodingutil.py 289
22             self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
23 
24 
25+class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
26+    def _check(self, inp, out, enc, optional_quotes):
27+        out2 = out
28+        if optional_quotes:
29+            out2 = out2[1:-1]
30+        self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out)
31+        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2)
32+        if out[0:2] != 'b"':
33+            if isinstance(inp, str):
34+                self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out)
35+                self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2)
36+            else:
37+                self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out)
38+                self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2)
39+
40+    def _test_quote_output_all(self, enc):
41+        def check(inp, out, optional_quotes=False):
42+            self._check(inp, out, enc, optional_quotes)
43+
44+        # optional single quotes
45+        check("foo",  "'foo'",  True)
46+        check("\\",   "'\\'",   True)
47+        check("$\"`", "'$\"`'", True)
48+
49+        # mandatory single quotes
50+        check("\"",   "'\"'")
51+
52+        # double quotes
53+        check("'",    "\"'\"")
54+        check("\n",   "\"\\x0a\"")
55+        check("\x00", "\"\\x00\"")
56+
57+        # invalid Unicode and astral planes
58+        check(u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"")
59+        check(u"\uDC00\uD800",       "\"\\udc00\\ud800\"")
60+        check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
61+        check(u"\uD800\uDC00",       "\"\\U00010000\"")
62+        check(u"\uD800\uDC01",       "\"\\U00010001\"")
63+        check(u"\uD801\uDC00",       "\"\\U00010400\"")
64+        check(u"\uDBFF\uDFFF",       "\"\\U0010ffff\"")
65+        check(u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"")
66+        check(u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"")
67+
68+        # invalid UTF-8
69+        check("\xFF",                "b\"\\xff\"")
70+        check("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")
71+
72+    def test_quote_output_ascii(self, enc='ascii'):
73+        def check(inp, out, optional_quotes=False):
74+            self._check(inp, out, enc, optional_quotes)
75+
76+        self._test_quote_output_all(enc)
77+        check(u"\u00D7",   "\"\\xd7\"")
78+        check(u"'\u00D7",  "\"'\\xd7\"")
79+        check(u"\"\u00D7", "\"\\\"\\xd7\"")
80+        check(u"\u2621",   "\"\\u2621\"")
81+        check(u"'\u2621",  "\"'\\u2621\"")
82+        check(u"\"\u2621", "\"\\\"\\u2621\"")
83+
84+    def test_quote_output_latin1(self, enc='latin1'):
85+        def check(inp, out, optional_quotes=False):
86+            self._check(inp, out.encode('latin1'), enc, optional_quotes)
87+
88+        self._test_quote_output_all(enc)
89+        check(u"\u00D7",   u"'\u00D7'", True)
90+        check(u"'\u00D7",  u"\"'\u00D7\"")
91+        check(u"\"\u00D7", u"'\"\u00D7'")
92+        check(u"\u00D7\"", u"'\u00D7\"'", True)
93+        check(u"\u2621",   u"\"\\u2621\"")
94+        check(u"'\u2621",  u"\"'\\u2621\"")
95+        check(u"\"\u2621", u"\"\\\"\\u2621\"")
96+
97+    def test_quote_output_utf8(self, enc='utf-8'):
98+        def check(inp, out, optional_quotes=False):
99+            self._check(inp, out.encode('utf-8'), enc, optional_quotes)
100+
101+        self._test_quote_output_all(enc)
102+        check(u"\u2621",   u"'\u2621'", True)
103+        check(u"'\u2621",  u"\"'\u2621\"")
104+        check(u"\"\u2621", u"'\"\u2621'")
105+        check(u"\u2621\"", u"'\u2621\"'", True)
106+
107+    @patch('sys.stdout')
108+    def test_quote_output_mock(self, mock_stdout):
109+        mock_stdout.encoding = 'ascii'
110+        _reload()
111+        self.test_quote_output_ascii(None)
112+
113+        mock_stdout.encoding = 'latin1'
114+        _reload()
115+        self.test_quote_output_latin1(None)
116+
117+        mock_stdout.encoding = 'utf-8'
118+        _reload()
119+        self.test_quote_output_utf8(None)
120+
121+
122 class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
123     uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
124     output = 'lumi\xc3\xa8re'
125hunk ./src/allmydata/util/encodingutil.py 115
126         return s
127     return s.encode(argv_encoding)
128 
129-PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL)
130-PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL)
131+PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$',          re.DOTALL)
132+PRINTABLE_8BIT  = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)
133 
134 def is_printable_ascii(s):
135     return PRINTABLE_ASCII.search(s) is not None
136hunk ./src/allmydata/util/encodingutil.py 140
137                                  (output_encoding, repr(s)))
138     return out
139 
140+
141+def _unicode_escape(m):
142+    u = m.group(0)
143+    if u == '"' or u == '$' or u == '`' or u == '\\':
144+        return u'\\' + u
145+    if len(u) == 2:
146+        codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
147+    else:
148+        codepoint = ord(u)
149+    if codepoint > 0xFFFF:
150+        return u'\\U%08x' % (codepoint,)
151+    elif codepoint > 0xFF:
152+        return u'\\u%04x' % (codepoint,)
153+    else:
154+        return u'\\x%02x' % (codepoint,)
155+
156+def _str_escape(m):
157+    c = m.group(0)
158+    if c == '"' or c == '$' or c == '`' or c == '\\':
159+        return '\\' + c
160+    else:
161+        return '\\x%02x' % (ord(c),)
162+
163+MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
164+
165+# if we must double-quote, then we have to escape ", $ and `, but need not escape '
166+ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
167+                               ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
168+                               re.DOTALL)
169+
170+ESCAPABLE_8BIT    = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
171+
172 def quote_output(s, quotemarks=True, encoding=None):
173     """
174     Encode either a Unicode string or a UTF-8-encoded bytestring for representation
175hunk ./src/allmydata/util/encodingutil.py 176
176     on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
177-    always surrounded by single quotes; otherwise, it is quoted only if necessary to
178-    avoid ambiguity or control bytes in the output.
179+    always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
180+    control bytes in the output.
181+    Quoting may use either single or double quotes. Within single quotes, all
182+    characters stand for themselves, and ' will not appear. Within double quotes,
183+    Python-compatible backslash escaping is used.
184     """
185     precondition(isinstance(s, (str, unicode)), s)
186 
187hunk ./src/allmydata/util/encodingutil.py 188
188         try:
189             s = s.decode('utf-8')
190         except UnicodeDecodeError:
191-            return 'b' + repr(s)
192-
193-    try:
194-        out = s.encode(encoding or output_encoding)
195-    except (UnicodeEncodeError, UnicodeDecodeError):
196-        return repr(s)
197+            return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),)
198 
199hunk ./src/allmydata/util/encodingutil.py 190
200-    if PRINTABLE_8BIT.search(out) is None:
201-        return repr(out)
202+    if MUST_DOUBLE_QUOTE.search(s) is None:
203+        try:
204+            out = s.encode(encoding or output_encoding)
205+            if quotemarks or out.startswith('"'):
206+                return "'%s'" % (out,)
207+            else:
208+                return out
209+        except (UnicodeDecodeError, UnicodeEncodeError):
210+            pass
211 
212hunk ./src/allmydata/util/encodingutil.py 200
213-    if quotemarks:
214-        return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'"
215-    else:
216-        return out
217+    escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s)
218+    return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),)
219 
220 def quote_path(path, quotemarks=True):
221     return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks)
222}
223
224Context:
225
226[docs/specifications/dirnodes.txt: 'mesh'->'grid'.
227david-sarah@jacaranda.org**20100723061616
228 Ignore-this: 887bcf921ef00afba8e05e9239035bca
229] 
230[docs: use current cap to Zooko's wiki page in example text
231zooko@zooko.com**20100721010543
232 Ignore-this: 4f36f36758f9fdbaf9eb73eac23b6652
233 fixes #1134
234] 
235[docs/specifications/dirnodes.txt: bring layer terminology up-to-date with architecture.txt, and a few other updates (e.g. note that the MAC is no longer verified, and that URIs can be unknown). Also 'Tahoe'->'Tahoe-LAFS'.
236david-sarah@jacaranda.org**20100723054703
237 Ignore-this: f3b98183e7d0a0f391225b8b93ac6c37
238] 
239[__init__.py: silence DeprecationWarning about BaseException.message globally. fixes #1129
240david-sarah@jacaranda.org**20100720011939
241 Ignore-this: 38808986ba79cb2786b010504a22f89
242] 
243[test_runner: test that 'tahoe --version' outputs no noise (e.g. DeprecationWarnings).
244david-sarah@jacaranda.org**20100720011345
245 Ignore-this: dd358b7b2e5d57282cbe133e8069702e
246] 
247[TAG allmydata-tahoe-1.7.1
248zooko@zooko.com**20100719131352
249 Ignore-this: 6942056548433dc653a746703819ad8c
250] 
251Patch bundle hash:
252d4aa6ac35c5dba44996999385ca90717c2525a3e