source: trunk/src/allmydata/test/test_encodingutil.py

Last change on this file was dfdb6c60, checked in by Itamar Turner-Trauring <itamar@…>, at 2024-03-01T17:17:11Z

Fix lints.

  • Property mode set to 100644
File size: 17.7 KB
Line 
1
# "lumi" + U+00E8 (e with grave accent) + "re", in precomposed form.
lumiere_nfc = u"lumi\u00E8re"

# The same visible filename spelled two ways: with the single precomposed
# code point U+00C4, and as "A" followed by the combining diaeresis U+0308.
Artonwall_nfc = u"\u00C4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# Filenames used by the generator script and by the listdir tests below.
TEST_FILENAMES = (
    Artonwall_nfc,
    u'test_file',
    u'Blah blah.txt',
)
11
12# The following main helps to generate a test class for other operating
13# systems.
14
if __name__ == "__main__":
    # Run as a script, this prints the source of an EncodingUtil subclass
    # describing the current platform, then exits without running any test.
    import os
    import platform
    import shutil
    import sys
    import tempfile

    if len(sys.argv) != 2:
        print("Usage: %s lumi<e-grave>re" % sys.argv[0])
        sys.exit(1)

    if sys.platform == "win32":
        try:
            from allmydata.windows.fixups import initialize
        except ImportError:
            print("set PYTHONPATH to the src directory")
            sys.exit(1)
        initialize()

    print()
    print("class MyWeirdOS(EncodingUtil, unittest.TestCase):")
    print("    uname = '%s'" % ' '.join(platform.uname()))
    print("    argv = %s" % repr(sys.argv[1]))
    print("    platform = '%s'" % sys.platform)
    print("    filesystem_encoding = '%s'" % sys.getfilesystemencoding())
    print("    io_encoding = '%s'" % sys.stdout.encoding)

    # tmpdir stays None if mkdtemp() itself fails, so the cleanup below never
    # touches an unbound name.
    tmpdir = None
    try:
        tmpdir = tempfile.mkdtemp()
        for fname in TEST_FILENAMES:
            # Create an empty file; only the resulting name matters.
            with open(os.path.join(tmpdir, fname), 'w'):
                pass

        dirlist = os.listdir(tmpdir)

        print("    dirlist = %s" % repr(dirlist))
    except (OSError, UnicodeError):
        # Only filesystem and encoding failures are treated as "this platform
        # cannot do it"; anything else (e.g. KeyboardInterrupt) propagates.
        print("    # Oops, I cannot write filenames containing non-ascii characters")
    finally:
        if tmpdir is not None:
            shutil.rmtree(tmpdir)
    print()

    sys.exit(0)
54
55
56import os, sys
57
58from twisted.trial import unittest
59
60from twisted.python.filepath import FilePath
61
62from allmydata.test.common_util import (
63    ReallyEqualMixin, skip_if_cannot_represent_filename,
64)
65from allmydata.util import encodingutil, fileutil
66from allmydata.util.encodingutil import unicode_to_url, \
67    unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
68    quote_filepath, unicode_platform, listdir_unicode, \
69    get_filesystem_encoding, to_bytes, from_utf8_or_none, _reload, \
70    to_filepath, extend_filepath, unicode_from_filepath, unicode_segments_from, \
71    unicode_to_argv
72
class MockStdout:
    """Empty attribute holder; it defines nothing of its own.

    NOTE(review): presumably a stand-in for ``sys.stdout`` -- nothing in the
    visible part of this module uses it.
    """
75
76
class EncodingUtil(ReallyEqualMixin):
    """
    Mixin of encodingutil checks that run against a simulated platform.

    Concrete subclasses also inherit from ``unittest.TestCase`` and provide
    the class attributes read here: ``platform`` (patched over
    ``sys.platform``), ``filesystem_encoding`` and, optionally, ``dirlist``
    (a recorded directory listing; without it ``test_listdir_unicode`` does
    nothing).
    """

    def setUp(self):
        # NOTE(review): registered before the patch, presumably so that
        # _reload() runs after sys.platform has been restored -- confirm
        # against trial's cleanup ordering.
        self.addCleanup(_reload)
        self.patch(sys, "platform", self.platform)

    def test_unicode_to_url(self):
        """unicode_to_url() yields the UTF-8 encoded bytes of its argument."""
        # failUnless(value, expected) would use ``expected`` as the failure
        # message and compare nothing, so do a real equality check instead.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), b"lumi\xc3\xa8re")

    def test_unicode_to_output_py3(self):
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), lumiere_nfc)

    def test_unicode_to_argv(self):
        """
        unicode_to_argv() returns a value equal to, and of the same type as,
        the unicode string it was given.
        """
        result = unicode_to_argv(lumiere_nfc)
        expected_value = lumiere_nfc

        self.assertIsInstance(result, type(expected_value))
        self.assertEqual(result, expected_value)

    def test_unicode_platform_py3(self):
        _reload()
        self.failUnlessReallyEqual(unicode_platform(), True)

    def test_listdir_unicode(self):
        """
        listdir_unicode() returns names that normalize to TEST_FILENAMES when
        os.listdir is replaced by the recorded ``dirlist`` of the subclass.
        """
        if not hasattr(self, 'dirlist'):
            # No recorded listing for this platform: nothing to check.
            return

        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        def call_os_listdir(path):
            # Python 3 always lists unicode filenames:
            return [d.decode(self.filesystem_encoding) if isinstance(d, bytes)
                    else d
                    for d in self.dirlist]

        self.patch(os, 'listdir', call_os_listdir)

        def call_sys_getfilesystemencoding():
            return self.filesystem_encoding
        self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding)

        # Called after both patches are installed and before exercising
        # listdir_unicode().
        _reload()
        filenames = listdir_unicode(u'/dummy')

        self.failUnlessEqual({encodingutil.normalize(fname) for fname in filenames},
                             set(TEST_FILENAMES))
131
132
class StdlibUnicode(unittest.TestCase):
    """Exercise stdlib filesystem functions with Unicode paths, and check that
    listdir_unicode handles valid filenames."""

    def test_mkdir_open_exists_abspath_listdir_expanduser(self):
        """Create a non-ASCII directory and file, then look them up in several ways."""
        skip_if_cannot_represent_filename(lumiere_nfc)

        try:
            os.mkdir(lumiere_nfc)
        except EnvironmentError as e:
            raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
                                    "does not support Unicode, even though the platform does." % (e,))

        file_name = lumiere_nfc + u'.txt'
        relative_path = lumiere_nfc + u'/' + file_name
        # Create an empty file inside the new directory.
        with open(relative_path, 'wb'):
            pass
        self.failUnless(os.path.exists(relative_path))
        self.failUnless(os.path.exists(os.path.join(os.getcwd(), relative_path)))

        # We only require that the listing includes a filename that is canonically equivalent
        # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
        listed = listdir_unicode(lumiere_nfc)
        normalized_names = {encodingutil.normalize(name) for name in listed}
        self.failUnlessIn(file_name, normalized_names)

        home_relative = fileutil.expanduser(u"~/" + lumiere_nfc)
        self.failIfIn(u"~", home_relative)
        self.failUnless(home_relative.endswith(lumiere_nfc), home_relative)

    def test_open_unrepresentable(self):
        """open() raises UnicodeEncodeError for a name the filesystem encoding cannot encode."""
        if unicode_platform():
            raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")

        fs_encoding = get_filesystem_encoding()
        candidate = u'\u2621.txt'
        try:
            candidate.encode(fs_encoding)
        except UnicodeEncodeError:
            self.failUnlessRaises(UnicodeEncodeError, open, candidate, 'wb')
        else:
            # The name turned out to be encodable, so it cannot serve as the
            # unrepresentable example this test needs.
            raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
172
173
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    """Table-driven tests for quote_output() under several output encodings."""

    def tearDown(self):
        _reload()

    def _check(self, inp, out, enc, optional_quotes, quote_newlines):
        """
        Assert that quote_output(inp) produces ``out`` for encoding ``enc``.

        :param inp: bytes or unicode input handed to quote_output().
        :param out: expected result; bytes are first decoded with ``enc`` (or
            encodingutil.io_encoding when ``enc`` is falsy).
        :param enc: value passed as the ``encoding`` keyword.
        :param optional_quotes: when true, the result expected with
            ``quotemarks=False`` is ``out`` minus its first and last character.
        :param quote_newlines: passed through as the ``quote_newlines`` keyword.

        Unless the expected output starts with ``b"``, the same expectations
        are re-checked with the input converted to the other string type
        (bytes <-> unicode via UTF-8), when that conversion is possible.
        """
        if isinstance(out, bytes):
            out = out.decode(enc or encodingutil.io_encoding)
        out2 = out
        if optional_quotes:
            out2 = out2[1:-1]
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out)
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
        if out[0:2] == 'b"':
            # Expected output is in b"..." form: no cross-type re-check.
            pass
        elif isinstance(inp, bytes):
            try:
                unicode_inp = inp.decode("utf-8")
            except UnicodeDecodeError:
                # Some things decode on Python 2, but not Python 3...
                return
            self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quote_newlines=quote_newlines), out)
            self.failUnlessReallyEqual(quote_output(unicode_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
        else:
            try:
                bytes_inp = inp.encode('utf-8')
            except UnicodeEncodeError:
                # Some things encode on Python 2, but not Python 3, e.g.
                # surrogates like u"\uDC00\uD800"...
                return
            self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quote_newlines=quote_newlines), out)
            self.failUnlessReallyEqual(quote_output(bytes_inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)

    def _test_quote_output_all(self, enc):
        """Run the expectations that are the same for every encoding ``enc``."""
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            # Every expected value in this table is pure ASCII bytes.
            out = out.decode("ascii")
            self._check(inp, out, enc, optional_quotes, quote_newlines)

        # optional single quotes
        check(b"foo",  b"'foo'",  True)
        check(b"\\",   b"'\\'",   True)
        check(b"$\"`", b"'$\"`'", True)
        check(b"\n",   b"'\n'",   True, quote_newlines=False)

        # mandatory single quotes
        check(b"\"",   b"'\"'")

        # double quotes
        check(b"'",    b"\"'\"")
        check(b"\n",   b"\"\\x0a\"", quote_newlines=True)
        check(b"\x00", b"\"\\x00\"")

        # invalid Unicode and astral planes
        check(u"\uFDD0\uFDEF",       b"\"\\ufdd0\\ufdef\"")
        check(u"\uDC00\uD800",       b"\"\\udc00\\ud800\"")
        check(u"\uDC00\uD800\uDC00", b"\"\\udc00\\U00010000\"")
        check(u"\uD800\uDC00",       b"\"\\U00010000\"")
        check(u"\uD800\uDC01",       b"\"\\U00010001\"")
        check(u"\uD801\uDC00",       b"\"\\U00010400\"")
        check(u"\uDBFF\uDFFF",       b"\"\\U0010ffff\"")
        check(u"'\uDBFF\uDFFF",      b"\"'\\U0010ffff\"")
        check(u"\"\uDBFF\uDFFF",     b"\"\\\"\\U0010ffff\"")

        # invalid UTF-8
        check(b"\xFF",                b"b\"\\xff\"")
        check(b"\x00\"$\\`\x80\xFF",  b"b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")

    def test_quote_output_ascii(self, enc='ascii'):
        """Non-ASCII characters are expected as backslash escapes under ascii."""
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            self._check(inp, out, enc, optional_quotes, quote_newlines)

        self._test_quote_output_all(enc)
        check(u"\u00D7",   b"\"\\xd7\"")
        check(u"'\u00D7",  b"\"'\\xd7\"")
        check(u"\"\u00D7", b"\"\\\"\\xd7\"")
        check(u"\u2621",   b"\"\\u2621\"")
        check(u"'\u2621",  b"\"'\\u2621\"")
        check(u"\"\u2621", b"\"\\\"\\u2621\"")
        check(u"\n",       b"'\n'",      True, quote_newlines=False)
        check(u"\n",       b"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_latin1(self, enc='latin1'):
        """U+00D7 is expected verbatim under latin1; U+2621 is expected escaped."""
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            # Expectations are written as unicode and handed over as latin1
            # bytes, which _check() decodes again using ``enc``.
            self._check(inp, out.encode('latin1'), enc, optional_quotes, quote_newlines)

        self._test_quote_output_all(enc)
        check(u"\u00D7",   u"'\u00D7'", True)
        check(u"'\u00D7",  u"\"'\u00D7\"")
        check(u"\"\u00D7", u"'\"\u00D7'")
        check(u"\u00D7\"", u"'\u00D7\"'", True)
        check(u"\u2621",   u"\"\\u2621\"")
        check(u"'\u2621",  u"\"'\\u2621\"")
        check(u"\"\u2621", u"\"\\\"\\u2621\"")
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_utf8(self, enc='utf-8'):
        """U+2621 is expected verbatim under utf-8."""
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            self._check(inp, out, enc, optional_quotes, quote_newlines)

        self._test_quote_output_all(enc)
        check(u"\u2621",   u"'\u2621'", True)
        check(u"'\u2621",  u"\"'\u2621\"")
        check(u"\"\u2621", u"'\"\u2621'")
        check(u"\u2621\"", u"'\u2621\"'", True)
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_default(self):
        """Default is the encoding of sys.stdout if known, otherwise utf-8."""
        # The None default covers a replaced sys.stdout that has no
        # ``encoding`` attribute at all, not only one whose encoding is None.
        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
        self.assertEqual(quote_output(u"\u2621"),
                         quote_output(u"\u2621", encoding=encoding))
286
287
def win32_other(win32, other):
    """Return ``win32`` when sys.platform is "win32", otherwise ``other``."""
    if sys.platform == "win32":
        return win32
    return other
290
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
    """Tests for quote_path(), quote_local_unicode_path() and quote_filepath()."""

    def assertPathsEqual(self, actual, expected):
        """Compare ``actual`` with ``expected`` (ASCII bytes) decoded to text."""
        self.failUnlessReallyEqual(actual, expected.decode("ascii"))

    def test_quote_path(self):
        # Path segments without and with a newline in them.
        single_quoted = b"'foo/bar'"
        newline_escaped = b'"foo/\\x0abar"'
        self.assertPathsEqual(quote_path([u'foo', u'bar']), single_quoted)
        self.assertPathsEqual(quote_path([u'foo', u'bar'], quotemarks=True), single_quoted)
        self.assertPathsEqual(quote_path([u'foo', u'bar'], quotemarks=False), b"foo/bar")
        self.assertPathsEqual(quote_path([u'foo', u'\nbar']), newline_escaped)
        self.assertPathsEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), newline_escaped)
        self.assertPathsEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), newline_escaped)

        # A "\\?\"-prefixed drive path: the expectation differs on win32.
        drive_path = u"\\\\?\\C:\\foo"
        drive_quoted = win32_other(b"'C:\\foo'", b"'\\\\?\\C:\\foo'")
        drive_bare = win32_other(b"C:\\foo", b"\\\\?\\C:\\foo")
        self.assertPathsEqual(quote_local_unicode_path(drive_path), drive_quoted)
        self.assertPathsEqual(quote_local_unicode_path(drive_path, quotemarks=True), drive_quoted)
        self.assertPathsEqual(quote_local_unicode_path(drive_path, quotemarks=False), drive_bare)

        # The same for a "\\?\UNC\"-prefixed path.
        unc_path = u"\\\\?\\UNC\\foo\\bar"
        unc_quoted = win32_other(b"'\\\\foo\\bar'", b"'\\\\?\\UNC\\foo\\bar'")
        unc_bare = win32_other(b"\\\\foo\\bar", b"\\\\?\\UNC\\foo\\bar")
        self.assertPathsEqual(quote_local_unicode_path(unc_path), unc_quoted)
        self.assertPathsEqual(quote_local_unicode_path(unc_path, quotemarks=True), unc_quoted)
        self.assertPathsEqual(quote_local_unicode_path(unc_path, quotemarks=False), unc_bare)

    def test_quote_filepath(self):
        foo_bar_fp = FilePath(win32_other(u'C:\\foo\\bar', u'/foo/bar'))
        with_quotes = win32_other(b"'C:\\foo\\bar'", b"'/foo/bar'")
        without_quotes = win32_other(b"C:\\foo\\bar", b"/foo/bar")
        self.assertPathsEqual(quote_filepath(foo_bar_fp), with_quotes)
        self.assertPathsEqual(quote_filepath(foo_bar_fp, quotemarks=True), with_quotes)
        self.assertPathsEqual(quote_filepath(foo_bar_fp, quotemarks=False), without_quotes)

        if sys.platform != "win32":
            return

        # win32 only: a "\\?\"-prefixed FilePath.
        foo_longfp = FilePath(u'\\\\?\\C:\\foo')
        self.assertPathsEqual(quote_filepath(foo_longfp), b"'C:\\foo'")
        self.assertPathsEqual(quote_filepath(foo_longfp, quotemarks=True), b"'C:\\foo'")
        self.assertPathsEqual(quote_filepath(foo_longfp, quotemarks=False), b"C:\\foo")
335
336
class FilePaths(ReallyEqualMixin, unittest.TestCase):
    """Tests for the FilePath helpers: to_filepath(), extend_filepath(),
    unicode_from_filepath() and unicode_segments_from()."""

    def test_to_filepath(self):
        base = win32_other(u'C:\\foo', u'/foo')

        # Convert the path both without and with a trailing separator.
        converted = [to_filepath(text) for text in (base, base + os.path.sep)]

        for fp in converted:
            self.failUnlessReallyEqual(fp, FilePath(base))
            if encodingutil.use_unicode_filepath:
                self.failUnlessReallyEqual(fp.path, base)

        if sys.platform != "win32":
            return

        # win32 only: a "\\?\"-prefixed path with a trailing backslash.
        long_u = u'\\\\?\\C:\\foo'
        longfp = to_filepath(long_u + u'\\')
        self.failUnlessReallyEqual(longfp, FilePath(long_u))
        self.failUnlessReallyEqual(longfp.path, long_u)

    def test_extend_filepath(self):
        bytes_base = FilePath(win32_other(b'C:\\foo', b'/foo'))
        text_base = FilePath(win32_other(u'C:\\foo', u'/foo'))
        wanted = win32_other(u'C:\\foo\\bar\\baz', u'/foo/bar/baz')

        # Bytes-based and unicode-based FilePaths must extend alike.
        for start in (bytes_base, text_base):
            extended = extend_filepath(start, [u'bar', u'baz'])
            self.failUnlessReallyEqual(extended, FilePath(wanted))
            if encodingutil.use_unicode_filepath:
                self.failUnlessReallyEqual(extended.path, wanted)

    def test_unicode_from_filepath(self):
        bytes_fp = FilePath(win32_other(b'C:\\foo', b'/foo'))
        text_fp = FilePath(win32_other(u'C:\\foo', u'/foo'))
        wanted = win32_other(u'C:\\foo', u'/foo')

        for fp in (bytes_fp, text_fp):
            self.failUnlessReallyEqual(unicode_from_filepath(fp), wanted)

    def test_unicode_segments_from(self):
        ancestors = (
            FilePath(win32_other(b'C:\\foo', b'/foo')),
            FilePath(win32_other(u'C:\\foo', u'/foo')),
        )
        descendants = (
            FilePath(win32_other(b'C:\\foo\\bar\\baz', b'/foo/bar/baz')),
            FilePath(win32_other(u'C:\\foo\\bar\\baz', u'/foo/bar/baz')),
        )

        # Every bytes/unicode combination of ancestor and descendant.
        for ancestor in ancestors:
            for descendant in descendants:
                self.failUnlessReallyEqual(unicode_segments_from(descendant, ancestor),
                                           [u'bar', u'baz'])
384
385
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    """EncodingUtil checks run against values recorded for a Linux host."""

    # Recorded platform description and command-line argument bytes.
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    argv = b'lumi\xc3\xa8re'

    # Values the mixin patches in / decodes with.
    platform = 'linux2'
    filesystem_encoding = 'UTF-8'
    io_encoding = 'UTF-8'

    # Recorded listing as bytes, decoded with filesystem_encoding by the test.
    dirlist = [
        b'test_file',
        b'\xc3\x84rtonwall.mp3',
        b'Blah blah.txt',
    ]
393
class Windows(EncodingUtil, unittest.TestCase):
    """EncodingUtil checks run against values recorded for a Windows host."""

    # Recorded platform description and command-line argument bytes.
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    argv = b'lumi\xc3\xa8re'

    # Values the mixin patches in / decodes with.
    platform = 'win32'
    filesystem_encoding = 'mbcs'
    io_encoding = 'utf-8'

    # Recorded listing, already unicode (precomposed U+00C4).
    dirlist = [
        u'Blah blah.txt',
        u'test_file',
        u'\xc4rtonwall.mp3',
    ]
401
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    """EncodingUtil checks run against values recorded for a Mac OS X host."""

    # Recorded platform description.
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    # The sibling platform classes and the generator script name this
    # attribute ``argv``; ``output`` is kept so existing references still work.
    argv = b'lumi\xc3\xa8re'
    output = b'lumi\xc3\xa8re'

    # Values the mixin patches in / decodes with.
    platform = 'darwin'
    filesystem_encoding = 'utf-8'
    io_encoding = 'UTF-8'

    # Recorded listing, already unicode; the first name is in decomposed form
    # ("A" + combining diaeresis U+0308).
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
409
410
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
    """Tests for to_bytes() and from_utf8_or_none()."""

    def test_to_bytes(self):
        cases = [
            (b"foo", b"foo"),
            (b"lumi\xc3\xa8re", b"lumi\xc3\xa8re"),
            (b"\xFF", b"\xFF"),  # passes through invalid UTF-8 -- is this what we want?
            (u"lumi\u00E8re", b"lumi\xc3\xa8re"),
            (None, None),
        ]
        for given, wanted in cases:
            self.failUnlessReallyEqual(to_bytes(given), wanted)

    def test_from_utf8_or_none(self):
        # Unicode input is rejected with an AssertionError.
        self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo")

        decoded = from_utf8_or_none(b"lumi\xc3\xa8re")
        self.failUnlessReallyEqual(decoded, u"lumi\u00E8re")

        passthrough = from_utf8_or_none(None)
        self.failUnlessReallyEqual(passthrough, None)

        # Bytes that are not valid UTF-8 raise rather than being replaced.
        self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, b"\xFF")
Note: See TracBrowser for help on using the repository browser.