diff -rN -u old-unicode/docs/frontends/CLI.txt new-unicode/docs/frontends/CLI.txt --- old-unicode/docs/frontends/CLI.txt 2009-04-08 22:04:16.000000000 -0600 +++ new-unicode/docs/frontends/CLI.txt 2009-04-08 22:04:24.000000000 -0600 @@ -91,9 +91,21 @@ These commands also use a table of "aliases" to figure out which directory they ought to use a starting point. This is explained in more detail below. -In Tahoe v1.3.0, passing non-ascii characters to the cli is not guaranteed to -work, although it might work on your platform, especially if your platform -uses utf-8 encoding. +As of Tahoe v1.3.1, filenames containing non-ascii characters are +supported on the command line if your terminal is correctly configured +for UTF-8 support. This is usually the case on modern GNU/Linux +distributions. + +If your terminal doesn't support UTF-8, you will still be able to list +directories but non-ascii characters will be replaced by a question mark +(?) on display. + +Reading from and writing to files whose names contain non-ascii +characters is also supported when your system correctly understands them. +Under Unix, this is usually handled by locale settings. If Tahoe cannot +correctly decode a filename, it will raise an error. In such a case, +you'll need to correct the name of your file, possibly with help from +tools such as convmv.
=== Starting Directories === diff -rN -u old-unicode/src/allmydata/scripts/common.py new-unicode/src/allmydata/scripts/common.py --- old-unicode/src/allmydata/scripts/common.py 2009-04-08 22:04:19.000000000 -0600 +++ new-unicode/src/allmydata/scripts/common.py 2009-04-08 22:04:25.000000000 -0600 @@ -1,7 +1,8 @@ import os, sys, urllib +import codecs from twisted.python import usage - +from allmydata.util.stringutils import unicode_to_url class BaseOptions: # unit tests can override these to point at StringIO instances @@ -107,7 +108,7 @@ continue name, cap = line.split(":", 1) # normalize it: remove http: prefix, urldecode - cap = cap.strip() + cap = cap.strip().encode('ascii') aliases[name] = uri.from_string_dirnode(cap).to_string() except EnvironmentError: pass @@ -163,4 +164,4 @@ def escape_path(path): segments = path.split("/") - return "/".join([urllib.quote(s) for s in segments]) + return "/".join([urllib.quote(unicode_to_url(s)) for s in segments]) diff -rN -u old-unicode/src/allmydata/scripts/tahoe_backup.py new-unicode/src/allmydata/scripts/tahoe_backup.py --- old-unicode/src/allmydata/scripts/tahoe_backup.py 2009-04-08 22:04:19.000000000 -0600 +++ new-unicode/src/allmydata/scripts/tahoe_backup.py 2009-04-08 22:04:25.000000000 -0600 @@ -4,11 +4,15 @@ import urllib import simplejson import datetime +import sys from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS from allmydata.scripts.common_http import do_http from allmydata import uri from allmydata.util import time_format from allmydata.scripts import backupdb +from allmydata.util.stringutils import fs_to_unicode, unicode_to_fs +from allmydata.util.assertutil import precondition +from twisted.python import usage class HTTPError(Exception): pass @@ -248,6 +252,7 @@ print >>self.options.stdout, msg def process(self, localpath, olddircap): + precondition(isinstance(localpath, unicode), localpath) # returns newdircap self.verboseprint("processing %s, olddircap %s" % (localpath, 
olddircap)) @@ -256,7 +261,8 @@ olddircontents = self.readdir(olddircap) newdircontents = {} # childname -> (type, rocap, metadata) - for child in self.options.filter_listdir(os.listdir(localpath)): + for child in self.options.filter_listdir(os.listdir(unicode_to_fs(localpath))): + child = fs_to_unicode(child) childpath = os.path.join(localpath, child) if os.path.isdir(childpath): metadata = get_local_metadata(childpath) @@ -342,6 +348,8 @@ return contents def upload(self, childpath): + precondition(isinstance(childpath, unicode), childpath) + #self.verboseprint("uploading %s.." % childpath) metadata = get_local_metadata(childpath) @@ -350,7 +358,7 @@ if must_upload: self.verboseprint("uploading %s.." % childpath) - infileobj = open(os.path.expanduser(childpath), "rb") + infileobj = open(unicode_to_fs(os.path.expanduser(childpath)), "rb") url = self.options['node-url'] + "uri" resp = do_http("PUT", url, infileobj) if resp.status not in (200, 201): diff -rN -u old-unicode/src/allmydata/scripts/tahoe_cp.py new-unicode/src/allmydata/scripts/tahoe_cp.py --- old-unicode/src/allmydata/scripts/tahoe_cp.py 2009-04-08 22:04:19.000000000 -0600 +++ new-unicode/src/allmydata/scripts/tahoe_cp.py 2009-04-08 22:04:25.000000000 -0600 @@ -4,9 +4,13 @@ import simplejson from cStringIO import StringIO from twisted.python.failure import Failure +import sys from allmydata.scripts.common import get_alias, escape_path, DefaultAliasMarker from allmydata.scripts.common_http import do_http from allmydata import uri +from twisted.python import usage +from allmydata.util.stringutils import fs_to_unicode, unicode_to_fs, unicode_to_url +from allmydata.util.assertutil import precondition def ascii_or_none(s): if s is None: @@ -69,6 +73,7 @@ class LocalFileSource: def __init__(self, pathname): + precondition(isinstance(pathname, unicode), pathname) self.pathname = pathname def need_to_copy_bytes(self): @@ -79,6 +84,7 @@ class LocalFileTarget: def __init__(self, pathname): + 
precondition(isinstance(pathname, unicode), pathname) self.pathname = pathname def put_file(self, inf): outf = open(self.pathname, "wb") @@ -91,6 +97,7 @@ class LocalMissingTarget: def __init__(self, pathname): + precondition(isinstance(pathname, unicode), pathname) self.pathname = pathname def put_file(self, inf): @@ -104,6 +111,8 @@ class LocalDirectorySource: def __init__(self, progressfunc, pathname): + precondition(isinstance(pathname, unicode), pathname) + self.progressfunc = progressfunc self.pathname = pathname self.children = None @@ -112,8 +121,9 @@ if self.children is not None: return self.children = {} - children = os.listdir(self.pathname) + children = os.listdir(unicode_to_fs(self.pathname)) for i,n in enumerate(children): + n = fs_to_unicode(n) self.progressfunc("examining %d of %d" % (i, len(children))) pn = os.path.join(self.pathname, n) if os.path.isdir(pn): @@ -129,6 +139,8 @@ class LocalDirectoryTarget: def __init__(self, progressfunc, pathname): + precondition(isinstance(pathname, unicode), pathname) + self.progressfunc = progressfunc self.pathname = pathname self.children = None @@ -137,8 +149,9 @@ if self.children is not None: return self.children = {} - children = os.listdir(self.pathname) + children = os.listdir(unicode_to_fs(self.pathname)) for i,n in enumerate(children): + n = fs_to_unicode(n) self.progressfunc("examining %d of %d" % (i, len(children))) pn = os.path.join(self.pathname, n) if os.path.isdir(pn): @@ -160,8 +173,9 @@ return LocalDirectoryTarget(self.progressfunc, pathname) def put_file(self, name, inf): + precondition(isinstance(name, unicode), name) pathname = os.path.join(self.pathname, name) - outf = open(pathname, "wb") + outf = open(unicode_to_fs(pathname), "wb") while True: data = inf.read(32768) if not data: @@ -350,7 +364,7 @@ if self.writecap: url = self.nodeurl + "/".join(["uri", urllib.quote(self.writecap), - urllib.quote(name.encode('utf-8'))]) + urllib.quote(unicode_to_url(name))]) self.children[name] = 
TahoeFileTarget(self.nodeurl, mutable, writecap, readcap, url) else: diff -rN -u old-unicode/src/allmydata/scripts/tahoe_ls.py new-unicode/src/allmydata/scripts/tahoe_ls.py --- old-unicode/src/allmydata/scripts/tahoe_ls.py 2009-04-08 22:04:19.000000000 -0600 +++ new-unicode/src/allmydata/scripts/tahoe_ls.py 2009-04-08 22:04:25.000000000 -0600 @@ -82,17 +82,17 @@ if childtype == "dirnode": t0 = "d" size = "-" - classify = "/" + classify = u"/" elif childtype == "filenode": t0 = "-" size = str(child[1]['size']) - classify = "" + classify = u"" if rw_uri: - classify = "*" + classify = u"*" else: t0 = "?" size = "?" - classify = "?" + classify = u"?" t1 = "-" if ro_uri: t1 = "r" @@ -111,7 +111,7 @@ line.append(size) line.append(ctime_s) if not options["classify"]: - classify = "" + classify = u"" line.append(name + classify) if options["uri"]: line.append(uri) @@ -135,13 +135,13 @@ left_justifys[0] = True fmt_pieces = [] for i in range(len(max_widths)): - piece = "%" + piece = u"%" if left_justifys[i]: - piece += "-" + piece += u"-" piece += str(max_widths[i]) - piece += "s" + piece += u"s" fmt_pieces.append(piece) - fmt = " ".join(fmt_pieces) + fmt = u" ".join(fmt_pieces) for row in rows: print >>stdout, (fmt % tuple(row)).rstrip() diff -rN -u old-unicode/src/allmydata/scripts/tahoe_manifest.py new-unicode/src/allmydata/scripts/tahoe_manifest.py --- old-unicode/src/allmydata/scripts/tahoe_manifest.py 2009-04-08 22:04:19.000000000 -0600 +++ new-unicode/src/allmydata/scripts/tahoe_manifest.py 2009-04-08 22:04:25.000000000 -0600 @@ -78,10 +78,15 @@ print >>stdout, vc else: try: - print >>stdout, d["cap"], "/".join(d["path"]) + print >>stdout, d["cap"], u"/".join(d["path"]) except UnicodeEncodeError: - print >>stdout, d["cap"], "/".join([p.encode("utf-8") - for p in d["path"]]) + # Perhaps python and/or the local system is misconfigured + # and actually it should have used utf-8. 
See ticket #534 + # about the questionable practice of second-guessing + # python+system-config like this. (And how 'utf-16le' + # might be a better second-guess on Windows.) + print >>stdout, d["cap"].encode('utf-8'), \ + "/".join([p.encode('utf-8') for p in d["path"]]) def manifest(options): return ManifestStreamer().run(options) diff -rN -u old-unicode/src/allmydata/scripts/tahoe_mkdir.py new-unicode/src/allmydata/scripts/tahoe_mkdir.py --- old-unicode/src/allmydata/scripts/tahoe_mkdir.py 2009-04-08 22:04:19.000000000 -0600 +++ new-unicode/src/allmydata/scripts/tahoe_mkdir.py 2009-04-08 22:04:25.000000000 -0600 @@ -2,6 +2,7 @@ import urllib from allmydata.scripts.common_http import do_http, check_http_error from allmydata.scripts.common import get_alias, DEFAULT_ALIAS +from allmydata.util.stringutils import unicode_to_url def mkdir(options): nodeurl = options['node-url'] @@ -31,7 +32,7 @@ path = path[:-1] # path (in argv) must be "/".join([s.encode("utf-8") for s in segments]) url = nodeurl + "uri/%s/%s?t=mkdir" % (urllib.quote(rootcap), - urllib.quote(path)) + urllib.quote(unicode_to_url(path))) resp = do_http("POST", url) check_http_error(resp, stderr) new_uri = resp.read().strip() diff -rN -u old-unicode/src/allmydata/test/test_cli.py new-unicode/src/allmydata/test/test_cli.py --- old-unicode/src/allmydata/test/test_cli.py 2009-04-08 22:04:20.000000000 -0600 +++ new-unicode/src/allmydata/test/test_cli.py 2009-04-08 22:04:25.000000000 -0600 @@ -1,5 +1,6 @@ # coding=utf-8 +import sys import os.path from twisted.trial import unittest from cStringIO import StringIO @@ -518,6 +519,41 @@ self._test_webopen(["two:"], self.two_url) d.addCallback(_test_urls) + d.addCallback(lambda res: self.do_cli("create-alias", "études")) + def _check_create_unicode((rc,stdout,stderr)): + self.failUnlessEqual(rc, 0) + self.failIf(stderr) + + # If stdout only supports ascii, accented characters are + # being replaced by '?'
+ if sys.stdout.encoding == "ANSI_X3.4-1968": + self.failUnless("Alias '?tudes' created" in stdout) + else: + self.failUnless("Alias 'études' created" in stdout) + + aliases = get_aliases(self.get_clientdir()) + self.failUnless(aliases[u"études"].startswith("URI:DIR2:")) + d.addCallback(_check_create_unicode) + + d.addCallback(lambda res: self.do_cli("ls", "études:")) + def _check_ls1((rc, stdout, stderr)): + self.failUnlessEqual(rc, 0) + self.failIf(stderr) + + self.failUnlessEqual(stdout, "") + d.addCallback(_check_ls1) + + d.addCallback(lambda res: self.do_cli("put", "-", "études:uploaded.txt", + stdin="Blah blah blah")) + + d.addCallback(lambda res: self.do_cli("ls", "études:")) + def _check_ls2((rc, stdout, stderr)): + self.failUnlessEqual(rc, 0) + self.failIf(stderr) + + self.failUnlessEqual(stdout, "uploaded.txt\n") + d.addCallback(_check_ls2) + return d class Put(GridTestMixin, CLITestMixin, unittest.TestCase): @@ -739,6 +775,37 @@ d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA2)) return d + def test_immutable_from_file_unicode(self): + # tahoe put file.txt "à trier.txt" + self.basedir = os.path.dirname(self.mktemp()) + self.set_up_grid() + + rel_fn = os.path.join(self.basedir, "DATAFILE") + abs_fn = os.path.abspath(rel_fn) + # we make the file small enough to fit in a LIT file, for speed + DATA = "short file" + f = open(rel_fn, "w") + f.write(DATA) + f.close() + + d = self.do_cli("create-alias", "tahoe") + + d.addCallback(lambda res: + self.do_cli("put", rel_fn, "à trier.txt")) + def _uploaded((rc,stdout,stderr)): + readcap = stdout.strip() + self.failUnless(readcap.startswith("URI:LIT:")) + self.failUnless("201 Created" in stderr, stderr) + self.readcap = readcap + d.addCallback(_uploaded) + + d.addCallback(lambda res: + self.do_cli("get", "tahoe:à trier.txt")) + d.addCallback(lambda (rc,stdout,stderr): + self.failUnlessEqual(stdout, DATA)) + + return d + class List(GridTestMixin, CLITestMixin, unittest.TestCase): def test_list(self): 
self.basedir = "cli/List/list" @@ -795,30 +862,37 @@ def test_unicode_filename(self): self.basedir = "cli/Cp/unicode_filename" self.set_up_grid() + d = self.do_cli("create-alias", "tahoe") + + # Use unicode strings when calling os functions + if sys.getfilesystemencoding() == "ANSI_X3.4-1968": + fn1 = os.path.join(self.basedir, u"Artonwall") + else: + fn1 = os.path.join(self.basedir, u"Ärtonwall") - fn1 = os.path.join(self.basedir, "Ärtonwall") DATA1 = "unicode file content" open(fn1, "wb").write(DATA1) + d.addCallback(lambda res: self.do_cli("cp", fn1.encode('utf-8'), "tahoe:Ärtonwall")) + + d.addCallback(lambda res: self.do_cli("get", "tahoe:Ärtonwall")) + d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA1)) - fn2 = os.path.join(self.basedir, "Metallica") + + fn2 = os.path.join(self.basedir, u"Metallica") DATA2 = "non-unicode file content" open(fn2, "wb").write(DATA2) # Bug #534 # Assure that uploading a file whose name contains unicode character doesn't # prevent further uploads in the same directory - d = self.do_cli("create-alias", "tahoe") - d.addCallback(lambda res: self.do_cli("cp", fn1, "tahoe:")) - d.addCallback(lambda res: self.do_cli("cp", fn2, "tahoe:")) - - d.addCallback(lambda res: self.do_cli("get", "tahoe:Ärtonwall")) - d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA1)) + d.addCallback(lambda res: self.do_cli("cp", fn2.encode('utf-8'), "tahoe:")) d.addCallback(lambda res: self.do_cli("get", "tahoe:Metallica")) d.addCallback(lambda (rc,out,err): self.failUnlessEqual(out, DATA2)) + d.addCallback(lambda res: self.do_cli("ls", "tahoe:")) + return d - test_unicode_filename.todo = "This behavior is not yet supported, although it does happen to work (for reasons that are ill-understood) on many platforms. See issue ticket #534." 
def test_dangling_symlink_vs_recursion(self): if not hasattr(os, 'symlink'): @@ -837,6 +911,17 @@ dn, "tahoe:")) return d +class Mkdir(GridTestMixin, CLITestMixin, unittest.TestCase): + def test_unicode_mkdir(self): + self.basedir = os.path.dirname(self.mktemp()) + self.set_up_grid() + + d = self.do_cli("create-alias", "tahoe") + d.addCallback(lambda res: self.do_cli("mkdir", "tahoe:Motörhead")) + + return d + + class Backup(GridTestMixin, CLITestMixin, StallMixin, unittest.TestCase): def writeto(self, path, data): @@ -871,6 +956,11 @@ self.writeto("parent/subdir/bar.txt", "bar\n" * 1000) self.writeto("parent/blah.txt", "blah") + if sys.getfilesystemencoding() == "ANSI_X3.4-1968": + self.writeto(u"parent/artonwall.txt", "Marmelade Jacuzzi") + else: + self.writeto(u"parent/ärtonwall.txt", "Marmelade Jacuzzi") + def do_backup(use_backupdb=True, verbose=False): cmd = ["backup"] if not have_bdb or not use_backupdb: @@ -895,8 +985,8 @@ self.failUnlessEqual(err, "") self.failUnlessEqual(rc, 0) fu, fr, dc, dr = self.count_output(out) - # foo.txt, bar.txt, blah.txt - self.failUnlessEqual(fu, 3) + # foo.txt, bar.txt, blah.txt, ärtonwall.txt + self.failUnlessEqual(fu, 4) self.failUnlessEqual(fr, 0) # empty, home, home/parent, home/parent/subdir self.failUnlessEqual(dc, 4) @@ -945,9 +1035,9 @@ self.failUnlessEqual(rc, 0) if have_bdb: fu, fr, dc, dr = self.count_output(out) - # foo.txt, bar.txt, blah.txt + # foo.txt, bar.txt, blah.txt, ärtonwall.txt self.failUnlessEqual(fu, 0) - self.failUnlessEqual(fr, 3) + self.failUnlessEqual(fr, 4) # empty, home, home/parent, home/parent/subdir self.failUnlessEqual(dc, 0) self.failUnlessEqual(dr, 4) @@ -975,9 +1065,9 @@ self.failUnlessEqual(rc, 0) fu, fr, dc, dr = self.count_output(out) fchecked, dchecked, dread = self.count_output2(out) - self.failUnlessEqual(fchecked, 3) + self.failUnlessEqual(fchecked, 4) self.failUnlessEqual(fu, 0) - self.failUnlessEqual(fr, 3) + self.failUnlessEqual(fr, 4) # TODO: backupdb doesn't do dirs yet; when it 
does, this will # change to dchecked=4, and maybe dread=0 self.failUnlessEqual(dchecked, 0) @@ -1023,8 +1113,8 @@ fu, fr, dc, dr = self.count_output(out) # new foo.txt, surprise file, subfile, empty self.failUnlessEqual(fu, 4) - # old bar.txt - self.failUnlessEqual(fr, 1) + # old bar.txt, ärtonwall.txt + self.failUnlessEqual(fr, 2) # home, parent, subdir, blah.txt, surprisedir self.failUnlessEqual(dc, 5) self.failUnlessEqual(dr, 0) @@ -1063,7 +1153,7 @@ self.failUnlessEqual(err, "") self.failUnlessEqual(rc, 0) fu, fr, dc, dr = self.count_output(out) - self.failUnlessEqual(fu, 5) + self.failUnlessEqual(fu, 6) self.failUnlessEqual(fr, 0) self.failUnlessEqual(dc, 0) self.failUnlessEqual(dr, 5) diff -rN -u old-unicode/src/allmydata/util/stringutils.py new-unicode/src/allmydata/util/stringutils.py --- old-unicode/src/allmydata/util/stringutils.py 1969-12-31 17:00:00.000000000 -0700 +++ new-unicode/src/allmydata/util/stringutils.py 2009-04-08 22:04:25.000000000 -0600 @@ -0,0 +1,48 @@ +""" +Functions used to convert inputs from whatever encoding used in the system to +unicode and back. + +TODO: + * Accept two cli arguments --argv-encoding and --filesystem-encoding +""" + +import sys +from allmydata.util.assertutil import precondition +from twisted.python import usage + +def fs_to_unicode(s): + """ + Decode a filename (or a directory name) to unicode using the same encoding + as the filesystem. + """ + # Filename encoding detection is a little bit better thanks to + # getfilesystemencoding() in the sys module. However, filenames can be + # encoded using another encoding than the one used on the filesystem. + + precondition(isinstance(s, str), s) + encoding = sys.getfilesystemencoding() + try: + return unicode(s, encoding) + except UnicodeDecodeError: + raise usage.UsageError("Filename '%s' cannot be decoded using the current encoding of your filesystem (%s). Please rename this file." 
% (s, encoding)) + +def unicode_to_fs(s): + """ + Encode a unicode object used as a file or directory name. + """ + + precondition(isinstance(s, unicode), s) + encoding = sys.getfilesystemencoding() + try: + return s.encode(encoding) + except UnicodeEncodeError: + raise usage.UsageError("Filename '%s' cannot be encoded using the current encoding of your filesystem (%s). Please configure your locale correctly or rename this file." % (s, encoding)) + +def unicode_to_url(s): + """ + Encode a unicode object used in a URL. + """ + # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded. + + precondition(isinstance(s, unicode), s) + return s.encode('utf-8')