Sat Apr 24 01:56:43 CEST 2010 Francois Deppierraz * stringutils.py: Unicode helper functions + associated tests This file contains a bunch of helper functions which converts unicode string from and to argv, filenames and stdout. diff -rN old-tahoe-534/src/allmydata/test/test_stringutils.py new-tahoe-534/src/allmydata/test/test_stringutils.py 0a1,158 > # coding=utf-8 > > TEST_FILENAMES = ( > u'Ärtonwall.mp3', > u'test_file', > u'Blah blah.txt', > ) > > # The following main helps to generate a test class for other operating > # systems. > > if __name__ == "__main__": > import sys, os > import tempfile > import shutil > import platform > > if len(sys.argv) != 2: > print "Usage: %s lumière" % sys.argv[0] > sys.exit(1) > > print > print "class MyWeirdOS(StringUtils, unittest.TestCase):" > print " uname = '%s'" % ' '.join(platform.uname()) > print " argv = %s" % repr(sys.argv[1]) > print " platform = '%s'" % sys.platform > print " filesystemencoding = '%s'" % sys.getfilesystemencoding() > print " stdoutencoding = '%s'" % sys.stdout.encoding > > try: > tmpdir = tempfile.mkdtemp() > for fname in TEST_FILENAMES: > open(os.path.join(tmpdir, fname), 'w').close() > > # Use Unicode API under Windows or MacOS X > if sys.platform in ('win32', 'darwin'): > dirlist = os.listdir(unicode(tmpdir)) > else: > dirlist = os.listdir(tmpdir) > > print " dirlist = %s" % repr(dirlist) > except: > print " # Oops, I cannot write filenames containing non-ascii characters" > print > > shutil.rmtree(tmpdir) > sys.exit(0) > > from twisted.trial import unittest > from mock import patch > import sys > > from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \ > unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode > > class StringUtils(): > def setUp(self): > # Mock sys.platform because unicode_platform() uses it > self.original_platform = sys.platform > sys.platform = self.platform > > def tearDown(self): > sys.platform = self.original_platform > > @patch('sys.stdout') > def test_argv_to_unicode(self, mock): > mock.encoding = self.stdoutencoding > > argu = u'lumière' > argv = self.argv > > self.failUnlessEqual(argv_to_unicode(argv), argu) > > def test_unicode_to_url(self): > self.failUnless(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8')) > > @patch('sys.stdout') > def test_unicode_to_stdout(self, mock): > mock.encoding = self.stdoutencoding > self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv) > > def test_unicode_platform(self): > matrix = { > 'linux2': False, > 'win32': True, > 'darwin': True, > } > > self.failUnlessEqual(unicode_platform(), matrix[self.platform]) > > @patch('sys.getfilesystemencoding') > @patch('os.listdir') > def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding): > > mock_listdir.return_value = self.dirlist > mock_getfilesystemencoding.return_value = self.filesystemencoding > > filenames = listdir_unicode(u'/dummy') > > for fname in TEST_FILENAMES: > self.failUnless(isinstance(fname, unicode)) > > if fname not in filenames: > self.fail("Cannot find %r in %r" % (fname, filenames)) > > @patch('os.open') > def test_open_unicode(self, mock): > pass > > class UbuntuKarmicUTF8(StringUtils, unittest.TestCase): > uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' > argv = 'lumi\xc3\xa8re' > platform = 'linux2' > filesystemencoding = 'UTF-8' > stdoutencoding = 'UTF-8' > dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt'] > > > class UbuntuKarmicLatin1(StringUtils, unittest.TestCase): > uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' > argv = 'lumi\xe8re' > platform = 'linux2' > filesystemencoding = 'ISO-8859-1' > stdoutencoding = 'ISO-8859-1' > dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3'] > > class WindowsXP(StringUtils, unittest.TestCase): > uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' > argv = 'lumi\xe8re' > platform = 'win32' > filesystemencoding = 'mbcs' > stdoutencoding = 'cp850' > dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] > > #class WindowsXP_UTF8(StringUtils, unittest.TestCase): > # uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' > # argv = 'lumi\xe8re' > # platform = 'win32' > # filesystemencoding = 'mbcs' > # stdoutencoding = 'cp65001' > # dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] > > class WindowsVista(StringUtils, unittest.TestCase): > uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel' > argv = 'lumi\xe8re' > platform = 'win32' > filesystemencoding = 'mbcs' > stdoutencoding = 'cp850' > dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] > > class MacOSXLeopard(StringUtils, unittest.TestCase): > uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc' > argv = 'lumi\xc3\xa8re' > platform = 'darwin' > filesystemencoding = 'utf-8' > stdoutencoding = 'UTF-8' > dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file'] > diff -rN old-tahoe-534/src/allmydata/util/stringutils.py new-tahoe-534/src/allmydata/util/stringutils.py 0a1,144 > """ > Functions used to convert inputs from whatever encoding used in the system to > unicode and back. > """ > > import sys > import os > import unicodedata > from allmydata.util.assertutil import precondition > from twisted.python import usage > > def get_stdout_encoding(): > """ > Returns the encoding expected for writing to stdout. If valid encoding > could be found, use UTF-8. > """ > # If you force Windows cmd.exe set to use UTF-8 by typing 'chcp 65001', > # sys.stdin.encoding and sys.stdout.encoding will be set to 'cp65001', > # which is not recognized as being the same as UTF-8. > # > # http://msdn.microsoft.com/en-us/library/dd317756%28VS.85%29.aspx > # Codepage 65001 -> Unicode (UTF-8) > # Codepage 850 -> OEM Multilingual Latin 1; Western European (DOS) > > enc = sys.stdout.encoding > > if enc is None or enc == 'cp65001': > enc = 'utf-8' > > if enc == 'cp850': > enc = 'ISO-8859-1' > > return enc > > def argv_to_unicode(s): > """ > Decode given argv element to unicode. > """ > # sys.argv encoding detection in Python is not trivial so utf-8 is > # currently used by default and an informative error message is given if > # the argument cannot be correctly decoded. > > precondition(isinstance(s, str), s) > > try: > return unicode(s, get_stdout_encoding()) > except UnicodeEncodeError: > raise usageError("Argument '%s' cannot be decoded as UTF-8." % s) > > def unicode_to_url(s): > """ > Encode an unicode object used in an URL. > """ > # According to RFC 2718, non-ascii characters in url's must be UTF-8 encoded. > > precondition(isinstance(s, unicode), s) > return s.encode('utf-8') > > def unicode_to_stdout(s): > """ > Encode an unicode object for representation on stdout. > """ > > precondition(isinstance(s, unicode), s) > > try: > return s.encode(get_stdout_encoding(), 'replace') > except LookupError: > return s.encode('utf-8', 'replace') # maybe > > def unicode_platform(): > """ > Does the current platform handle Unicode filenames natively ? > """ > > return sys.platform in ('win32', 'darwin') > > class FilenameEncodingError(Exception): > """ > Filename cannot be encoded using the current encoding of your filesystem > (%s). Please configure your locale correctly or rename this file. > """ > > pass > > def listdir_unicode_unix(path): > """ > This function emulates an Unicode API under Unix similar to one available > under Windows or MacOS X. > > If badly encoded filenames are encoutered, an exception is raised. > """ > precondition(isinstance(path, unicode), path) > > encoding = sys.getfilesystemencoding() > try: > byte_path = path.encode(encoding) > except UnicodeEncodeError: > raise FilenameEncodingError(path) > > try: > return [unicode(fn, encoding) for fn in os.listdir(byte_path)] > except UnicodeDecodeError: > raise FilenameEncodingError(fn) > > def listdir_unicode(path, encoding = None): > """ > Wrapper around listdir() which provides safe access to the convenient > Unicode API even under Unix. > """ > > precondition(isinstance(path, unicode), path) > > # On Windows and MacOS X, the Unicode API is used > if unicode_platform(): > dirlist = os.listdir(path) > > # On other platforms (ie. Unix systems), the byte-level API is used > else: > dirlist = listdir_unicode_unix(path) > > # Normalize the resulting unicode filenames > # > # This prevents different OS from generating non-equal unicode strings for > # the same filename representation > return [unicodedata.normalize('NFC', fname) for fname in dirlist] > > def open_unicode(path, mode='r'): > """ > Wrapper around open() which provides safe access to the convenient Unicode > API even under Unix. > """ > > precondition(isinstance(path, unicode), path) > > if unicode_platform(): > return open(path, mode) > else: > encoding = sys.getfilesystemencoding() > > try: > return open(path.encode(encoding), mode) > except UnicodeEncodeError: > raise FilenameEncodingError(path)