Fri Jul 23 08:53:14 GMT Daylight Time 2010 david-sarah@jacaranda.org * util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135 New patches: [util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135 david-sarah@jacaranda.org**20100723075314 Ignore-this: b82205834d17db61612dd16436b7c5a2 ] { hunk ./src/allmydata/test/test_encodingutil.py 60 from allmydata.test.common_util import ReallyEqualMixin from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \ - unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \ - get_output_encoding, get_filesystem_encoding, _reload + unicode_to_output, quote_output, unicode_platform, listdir_unicode, \ + FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload from allmydata.dirnode import normalize from twisted.python import usage hunk ./src/allmydata/test/test_encodingutil.py 289 self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb') +class QuoteOutput(ReallyEqualMixin, unittest.TestCase): + def _check(self, inp, out, enc, optional_quotes): + out2 = out + if optional_quotes: + out2 = out2[1:-1] + self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out) + self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2) + if out[0:2] != 'b"': + if isinstance(inp, str): + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out) + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2) + else: + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out) + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2) + + def _test_quote_output_all(self, enc): + def check(inp, out, optional_quotes=False): + self._check(inp, out, enc, optional_quotes) + + # optional single quotes + check("foo", "'foo'", True) + check("\\", "'\\'", True) + check("$\"`", "'$\"`'", True) + + # mandatory single quotes + check("\"", "'\"'") + + # double quotes + check("'", "\"'\"") + check("\n", "\"\\x0a\"") + check("\x00", "\"\\x00\"") + + # invalid Unicode and astral planes + check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"") + check(u"\uDC00\uD800", "\"\\udc00\\ud800\"") + check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"") + check(u"\uD800\uDC00", "\"\\U00010000\"") + check(u"\uD800\uDC01", "\"\\U00010001\"") + check(u"\uD801\uDC00", "\"\\U00010400\"") + check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"") + check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"") + check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"") + + # invalid UTF-8 + check("\xFF", "b\"\\xff\"") + check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") + + def test_quote_output_ascii(self, enc='ascii'): + def check(inp, out, optional_quotes=False): + self._check(inp, out, enc, optional_quotes) + + self._test_quote_output_all(enc) + check(u"\u00D7", "\"\\xd7\"") + check(u"'\u00D7", "\"'\\xd7\"") + check(u"\"\u00D7", "\"\\\"\\xd7\"") + check(u"\u2621", "\"\\u2621\"") + check(u"'\u2621", "\"'\\u2621\"") + check(u"\"\u2621", "\"\\\"\\u2621\"") + + def test_quote_output_latin1(self, enc='latin1'): + def check(inp, out, optional_quotes=False): + self._check(inp, out.encode('latin1'), enc, optional_quotes) + + self._test_quote_output_all(enc) + check(u"\u00D7", u"'\u00D7'", True) + check(u"'\u00D7", u"\"'\u00D7\"") + check(u"\"\u00D7", u"'\"\u00D7'") + check(u"\u00D7\"", u"'\u00D7\"'", True) + check(u"\u2621", u"\"\\u2621\"") + check(u"'\u2621", u"\"'\\u2621\"") + check(u"\"\u2621", u"\"\\\"\\u2621\"") + + def test_quote_output_utf8(self, enc='utf-8'): + def check(inp, out, optional_quotes=False): + self._check(inp, out.encode('utf-8'), enc, optional_quotes) + + self._test_quote_output_all(enc) + check(u"\u2621", u"'\u2621'", True) + check(u"'\u2621", u"\"'\u2621\"") + check(u"\"\u2621", u"'\"\u2621'") + check(u"\u2621\"", u"'\u2621\"'", True) + + @patch('sys.stdout') + def test_quote_output_mock(self, mock_stdout): + mock_stdout.encoding = 'ascii' + _reload() + self.test_quote_output_ascii(None) + + mock_stdout.encoding = 'latin1' + _reload() + self.test_quote_output_latin1(None) + + mock_stdout.encoding = 'utf-8' + _reload() + self.test_quote_output_utf8(None) + + class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' output = 'lumi\xc3\xa8re' hunk ./src/allmydata/util/encodingutil.py 115 return s return s.encode(argv_encoding) -PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL) -PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL) +PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL) +PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) def is_printable_ascii(s): return PRINTABLE_ASCII.search(s) is not None hunk ./src/allmydata/util/encodingutil.py 140 (output_encoding, repr(s))) return out + +def _unicode_escape(m): + u = m.group(0) + if u == '"' or u == '$' or u == '`' or u == '\\': + return u'\\' + u + if len(u) == 2: + codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 + else: + codepoint = ord(u) + if codepoint > 0xFFFF: + return u'\\U%08x' % (codepoint,) + elif codepoint > 0xFF: + return u'\\u%04x' % (codepoint,) + else: + return u'\\x%02x' % (codepoint,) + +def _str_escape(m): + c = m.group(0) + if c == '"' or c == '$' or c == '`' or c == '\\': + return '\\' + c + else: + return '\\x%02x' % (ord(c),) + +MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) + +# if we must double-quote, then we have to escape ", $ and `, but need not escape ' +ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs + ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', + re.DOTALL) + +ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) + def quote_output(s, quotemarks=True, encoding=None): """ Encode either a Unicode string or a UTF-8-encoded bytestring for representation hunk ./src/allmydata/util/encodingutil.py 176 on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is - always surrounded by single quotes; otherwise, it is quoted only if necessary to - avoid ambiguity or control bytes in the output. + always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or + control bytes in the output. + Quoting may use either single or double quotes. Within single quotes, all + characters stand for themselves, and ' will not appear. Within double quotes, + Python-compatible backslash escaping is used. """ precondition(isinstance(s, (str, unicode)), s) hunk ./src/allmydata/util/encodingutil.py 188 try: s = s.decode('utf-8') except UnicodeDecodeError: - return 'b' + repr(s) - - try: - out = s.encode(encoding or output_encoding) - except (UnicodeEncodeError, UnicodeDecodeError): - return repr(s) + return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),) hunk ./src/allmydata/util/encodingutil.py 190 - if PRINTABLE_8BIT.search(out) is None: - return repr(out) + if MUST_DOUBLE_QUOTE.search(s) is None: + try: + out = s.encode(encoding or output_encoding) + if quotemarks or out.startswith('"'): + return "'%s'" % (out,) + else: + return out + except (UnicodeDecodeError, UnicodeEncodeError): + pass hunk ./src/allmydata/util/encodingutil.py 200 - if quotemarks: - return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'" - else: - return out + escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s) + return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),) def quote_path(path, quotemarks=True): return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks) } Context: [docs/specifications/dirnodes.txt: 'mesh'->'grid'. david-sarah@jacaranda.org**20100723061616 Ignore-this: 887bcf921ef00afba8e05e9239035bca ] [docs: use current cap to Zooko's wiki page in example text zooko@zooko.com**20100721010543 Ignore-this: 4f36f36758f9fdbaf9eb73eac23b6652 fixes #1134 ] [docs/specifications/dirnodes.txt: bring layer terminology up-to-date with architecture.txt, and a few other updates (e.g. note that the MAC is no longer verified, and that URIs can be unknown). Also 'Tahoe'->'Tahoe-LAFS'. david-sarah@jacaranda.org**20100723054703 Ignore-this: f3b98183e7d0a0f391225b8b93ac6c37 ] [__init__.py: silence DeprecationWarning about BaseException.message globally. fixes #1129 david-sarah@jacaranda.org**20100720011939 Ignore-this: 38808986ba79cb2786b010504a22f89 ] [test_runner: test that 'tahoe --version' outputs no noise (e.g. DeprecationWarnings). david-sarah@jacaranda.org**20100720011345 Ignore-this: dd358b7b2e5d57282cbe133e8069702e ] [TAG allmydata-tahoe-1.7.1 zooko@zooko.com**20100719131352 Ignore-this: 6942056548433dc653a746703819ad8c ] Patch bundle hash: d4aa6ac35c5dba44996999385ca90717c2525a3e