diff --git a/docs/frontends/webapi.rst b/docs/frontends/webapi.rst index df2a864..a6c9485 100644 --- a/docs/frontends/webapi.rst +++ b/docs/frontends/webapi.rst @@ -36,6 +36,7 @@ The Tahoe REST-ful Web API 8. `Static Files in /public_html`_ 9. `Safety and Security Issues -- Names vs. URIs`_ 10. `Concurrency Issues`_ +11. `Access Blacklist`_ Enabling the web-API port @@ -1933,6 +1934,51 @@ For more details, please see the "Consistency vs Availability" and "The Prime Coordination Directive" sections of `mutable.rst <../specifications/mutable.rst>`_. +Access Blacklist +================ + +Gateway nodes may find it necessary to prohibit access to certain files. The +web-API has a facility to block access to filecaps by their storage index, +returning a 403 "Forbidden" error instead of the original file. + +This blacklist is recorded in $NODEDIR/access.blacklist, and contains one +blocked file per line. Comment lines (starting with ``#``) are ignored. Each +line consists of the storage-index (in the usual base32 format as displayed +by the "More Info" page, or by the "tahoe debug dump-cap" command), followed +by whitespace, followed by a reason string, which will be included in the 403 +error message. This could hold a URL to a page that explains why the file is +blocked, for example. 
+ +So for example, if you found a need to block access to a file with filecap +``URI:CHK:n7r3m6wmomelk4sep3kw5cvduq:os7ijw5c3maek7pg65e5254k2fzjflavtpejjyhshpsxuqzhcwwq:3:20:14861``, +you could do the following:: + + tahoe debug dump-cap URI:CHK:n7r3m6wmomelk4sep3kw5cvduq:os7ijw5c3maek7pg65e5254k2fzjflavtpejjyhshpsxuqzhcwwq:3:20:14861 + -> storage index: whpepioyrnff7orecjolvbudeu + echo "whpepioyrnff7orecjolvbudeu my puppy told me to" >>$NODEDIR/access.blacklist + tahoe restart $NODEDIR + tahoe get URI:CHK:n7r3m6wmomelk4sep3kw5cvduq:os7ijw5c3maek7pg65e5254k2fzjflavtpejjyhshpsxuqzhcwwq:3:20:14861 + -> error, 403 Access Prohibited: my puppy told me to + +The ``access.blacklist`` file will be checked each time a file or directory +is accessed: the file's ``mtime`` is used to decide whether it needs to be +reloaded. Therefore no node restart is necessary when creating the initial +blacklist, nor when adding second, third, or additional entries to the list. +When modifying the file, be careful to update it atomically, otherwise a +request may arrive while the file is only halfway written, and the partial +file may be incorrectly parsed. + +The blacklist is applied to all access paths (including FTP, SFTP, and CLI +operations), not just the web-API. The blacklist also applies to directories. +If a directory is blacklisted, the gateway will refuse access to both that +directory and any child files/directories underneath it, when accessed via +"DIRCAP/SUBDIR/FILENAME" -style URLs. Users who go directly to the child +file/dir will bypass the blacklist. + +The node will log the SI of the file being blocked, and the reason code, into +the ``logs/twistd.log`` file. + + ..
[1] URLs and HTTP and UTF-8, Oh My HTTP does not provide a mechanism to specify the character set used to diff --git a/src/allmydata/blacklist.py b/src/allmydata/blacklist.py new file mode 100755 index 0000000..e23eec6 --- /dev/null +++ b/src/allmydata/blacklist.py @@ -0,0 +1,48 @@ + +import os +from twisted.python import log as twisted_log +from allmydata.util import base32 + +class FileProhibited(Exception): + """This client has been configured to prohibit access to this object.""" + def __init__(self, reason): + self.reason = reason + + +class Blacklist: + def __init__(self, blacklist_fn): + self.blacklist_fn = blacklist_fn + self.last_mtime = None + self.entries = {} + self.read_blacklist() # sets .last_mtime and .entries + + def read_blacklist(self): + try: + current_mtime = os.stat(self.blacklist_fn).st_mtime + except EnvironmentError: + # unreadable blacklist file means no blacklist + self.entries.clear() + return + try: + if self.last_mtime is None or current_mtime > self.last_mtime: + self.entries.clear() + for line in open(self.blacklist_fn, "r").readlines(): + line = line.lstrip() + if not line or line.startswith("#"): + continue + si_s, reason = line.split(None, 1) + si = base32.a2b(si_s) # must be valid base32 + self.entries[si] = reason + self.last_mtime = current_mtime + except Exception, e: + twisted_log.err(e, "unparseable blacklist file") + raise + + def check_storageindex(self, si): + self.read_blacklist() + reason = self.entries.get(si, None) + if reason: + # log this to logs/twistd.log, since web logs go there too + twisted_log.msg("blacklist prohibited access to SI %s: %s" % + (base32.b2a(si), reason)) + raise FileProhibited(reason) diff --git a/src/allmydata/client.py b/src/allmydata/client.py index d007b8b..2b0797e 100644 --- a/src/allmydata/client.py +++ b/src/allmydata/client.py @@ -24,6 +24,7 @@ from allmydata.stats import StatsProvider from allmydata.history import History from allmydata.interfaces import IStatsProducer, RIStubClient 
from allmydata.nodemaker import NodeMaker +from allmydata.blacklist import Blacklist KiB=1024 @@ -278,6 +279,7 @@ class Client(node.Node, pollmixin.PollMixin): self.terminator.setServiceParent(self) self.add_service(Uploader(helper_furl, self.stats_provider)) self.init_stub_client() + self.init_blacklist() self.init_nodemaker() def init_client_storage_broker(self): @@ -330,6 +332,10 @@ class Client(node.Node, pollmixin.PollMixin): d.addErrback(log.err, facility="tahoe.init", level=log.BAD, umid="OEHq3g") + def init_blacklist(self): + fn = os.path.join(self.basedir, "access.blacklist") + self.blacklist = Blacklist(fn) + def init_nodemaker(self): self.nodemaker = NodeMaker(self.storage_broker, self._secret_holder, @@ -337,7 +343,8 @@ class Client(node.Node, pollmixin.PollMixin): self.getServiceNamed("uploader"), self.terminator, self.get_encoding_parameters(), - self._key_generator) + self._key_generator, + self.blacklist) def get_history(self): return self.history @@ -479,11 +486,15 @@ class Client(node.Node, pollmixin.PollMixin): # dirnodes. The first takes a URI and produces a filenode or (new-style) # dirnode. The other three create brand-new filenodes/dirnodes. - def create_node_from_uri(self, write_uri, read_uri=None, deep_immutable=False, name=""): + def create_node_from_uri(self, write_uri, read_uri=None, + deep_immutable=False, name=""): # This returns synchronously. - # Note that it does *not* validate the write_uri and read_uri; instead we - # may get an opaque node if there were any problems. - return self.nodemaker.create_from_cap(write_uri, read_uri, deep_immutable=deep_immutable, name=name) + # Note that it does *not* validate the write_uri and read_uri; + # instead we may get an opaque node if there were any problems. 
+ n = self.nodemaker.create_from_cap(write_uri, read_uri, + deep_immutable=deep_immutable, + name=name) + return n def create_dirnode(self, initial_children={}): d = self.nodemaker.create_new_mutable_directory(initial_children) diff --git a/src/allmydata/nodemaker.py b/src/allmydata/nodemaker.py index 3b74d90..0aa70d5 100644 --- a/src/allmydata/nodemaker.py +++ b/src/allmydata/nodemaker.py @@ -14,7 +14,8 @@ class NodeMaker: def __init__(self, storage_broker, secret_holder, history, uploader, terminator, - default_encoding_parameters, key_generator): + default_encoding_parameters, key_generator, + blacklist=None): self.storage_broker = storage_broker self.secret_holder = secret_holder self.history = history @@ -22,6 +23,7 @@ class NodeMaker: self.terminator = terminator self.default_encoding_parameters = default_encoding_parameters self.key_generator = key_generator + self.blacklist = blacklist self._node_cache = weakref.WeakValueDictionary() # uri -> node @@ -60,14 +62,20 @@ class NodeMaker: else: memokey = "M" + bigcap if memokey in self._node_cache: - return self._node_cache[memokey] - cap = uri.from_string(bigcap, deep_immutable=deep_immutable, name=name) - node = self._create_from_single_cap(cap) - if node: - self._node_cache[memokey] = node # note: WeakValueDictionary + node = self._node_cache[memokey] else: - # don't cache UnknownNode - node = UnknownNode(writecap, readcap, deep_immutable=deep_immutable, name=name) + cap = uri.from_string(bigcap, deep_immutable=deep_immutable, + name=name) + node = self._create_from_single_cap(cap) + if node: + self._node_cache[memokey] = node # note: WeakValueDictionary + else: + # don't cache UnknownNode + node = UnknownNode(writecap, readcap, + deep_immutable=deep_immutable, name=name) + if self.blacklist: + si = node.get_storage_index() + self.blacklist.check_storageindex(si) # may raise FileProhibited return node def _create_from_single_cap(self, cap): diff --git a/src/allmydata/test/no_network.py 
b/src/allmydata/test/no_network.py index 535cac1..95bafa9 100644 --- a/src/allmydata/test/no_network.py +++ b/src/allmydata/test/no_network.py @@ -206,6 +206,7 @@ class NoNetworkGrid(service.MultiService): self.basedir = basedir fileutil.make_dirs(basedir) + self._client_config_hooks = client_config_hooks self.servers_by_number = {} # maps to StorageServer instance self.wrappers_by_id = {} # maps to wrapped StorageServer instance self.proxies_by_id = {} # maps to IServer on which .rref is a wrapped @@ -229,20 +230,24 @@ class NoNetworkGrid(service.MultiService): f.write("[storage]\n") f.write("enabled = false\n") f.close() - c = None - if i in client_config_hooks: - # this hook can either modify tahoe.cfg, or return an - # entirely new Client instance - c = client_config_hooks[i](clientdir) - if not c: - c = NoNetworkClient(clientdir) - c.set_default_mutable_keysize(522) - c.nodeid = clientid - c.short_nodeid = b32encode(clientid).lower()[:8] - c._servers = self.all_servers # can be updated later - c.setServiceParent(self) + c = self._create_client(i, clientdir, clientid) self.clients.append(c) + def _create_client(self, i, clientdir, clientid): + c = None + if i in self._client_config_hooks: + # this hook can either modify tahoe.cfg, or return an + # entirely new Client instance + c = self._client_config_hooks[i](clientdir) + if not c: + c = NoNetworkClient(clientdir) + c.set_default_mutable_keysize(522) + c.nodeid = clientid + c.short_nodeid = b32encode(clientid).lower()[:8] + c._servers = self.all_servers # can be updated later + c.setServiceParent(self) + return c + def make_server(self, i, readonly=False): serverid = hashutil.tagged_hash("serverid", str(i))[:20] serverdir = os.path.join(self.basedir, "servers", @@ -274,6 +279,17 @@ class NoNetworkGrid(service.MultiService): for c in self.clients: c._servers = self.all_servers + def restart_client(self, i): + # we must remove the client, then build a new one with the same id + # and basedir + old_client = 
self.clients[i] + d = defer.maybeDeferred(old_client.disownServiceParent) + def _then(ign): + c = self._create_client(i, old_client.basedir, old_client.nodeid) + self.clients[i] = c + d.addCallback(_then) + return d + def remove_server(self, serverid): # it's enough to remove the server from c._servers (we don't actually # have to detach and stopService it) @@ -336,6 +352,15 @@ class GridTestMixin: ss = self.g.servers_by_number[i] yield (i, ss, ss.storedir) + def restart_client(self, i=0): + d = self.g.restart_client(i) + def _then(ign): + c = self.g.clients[i] + self.client_webports[i] = c.getServiceNamed("webish").getPortnum() + self.client_baseurls[i] = c.getServiceNamed("webish").getURL() + d.addCallback(_then) + return d + def find_uri_shares(self, uri): si = tahoe_uri.from_string(uri).get_storage_index() prefixdir = storage_index_to_dir(si) diff --git a/src/allmydata/test/test_web.py b/src/allmydata/test/test_web.py index 28c1323..1fbaf87 100644 --- a/src/allmydata/test/test_web.py +++ b/src/allmydata/test/test_web.py @@ -161,6 +161,7 @@ class FakeClient(Client): self.history = FakeHistory() self.uploader = FakeUploader() self.uploader.setServiceParent(self) + self.blacklist = None self.nodemaker = FakeNodeMaker(None, self._secret_holder, None, self.uploader, None, None, None) @@ -4450,6 +4451,90 @@ class Grid(GridTestMixin, WebErrorMixin, ShouldFailMixin, testutil.ReallyEqualMi return d + def test_blacklist(self): + # download from a blacklisted URI, get an error + self.basedir = "web/Grid/blacklist" + self.set_up_grid() + c0 = self.g.clients[0] + c0_basedir = c0.basedir + fn = os.path.join(c0_basedir, "access.blacklist") + self.uris = {} + DATA = "off-limits " * 50 + d = c0.upload(upload.Data(DATA, convergence="")) + def _stash_uri(ur): + self.uri = ur.uri + self.url = "uri/"+self.uri + u = uri.from_string_filenode(self.uri) + self.si = u.get_storage_index() + d.addCallback(_stash_uri) + d.addCallback(lambda ign: self.GET(self.url)) + def _blacklist(ign): + 
f = open(fn, "w") + f.write(" # this is a comment\n") + f.write(" \n") + f.write("\n") # also exercise blank lines + f.write("%s %s\n" % (base32.b2a(self.si), "off-limits to you")) + f.close() + # clients should be checking the blacklist each time, so we don't + # need to restart the client + d.addCallback(_blacklist) + d.addCallback(lambda ign: + self.shouldHTTPError("_get_from_blacklisted_uri", + 403, "Forbidden", + "Access Prohibited: off-limits", + self.GET, "uri/" + self.uri)) + def _unblacklist(ign): + open(fn, "w").close() + # the Blacklist object watches mtime to tell when the file has + # changed, but on windows this test will run faster than the + # filesystem's mtime resolution. So we edit Blacklist.last_mtime + # to force a reload. + self.g.clients[0].blacklist.last_mtime -= 2.0 + d.addCallback(_unblacklist) + # now a read should work + d.addCallback(lambda ign: self.GET(self.url)) + # read again to exercise the blacklist-is-unchanged logic + d.addCallback(lambda ign: self.GET(self.url)) + + # now add a blacklisted directory, and make sure files under it are + # refused too + def _add_dir(ign): + childnode = c0.create_node_from_uri(self.uri, None) + return c0.create_dirnode({u"child": (childnode,{}) }) + d.addCallback(_add_dir) + def _get_dircap(dn): + self.dir_si_b32 = base32.b2a(dn.get_storage_index()) + self.dir_url_rw = "uri/"+dn.get_write_uri()+"/?t=json" + self.dir_url_ro = "uri/"+dn.get_readonly_uri()+"/?t=json" + self.child_url = "uri/"+dn.get_readonly_uri()+"/child" + d.addCallback(_get_dircap) + d.addCallback(lambda ign: self.GET(self.dir_url_rw)) + d.addCallback(lambda ign: self.GET(self.dir_url_ro)) + d.addCallback(lambda ign: self.GET(self.child_url)) + def _block_dir(ign): + f = open(fn, "w") + f.write("%s %s\n" % (base32.b2a(self.si), "dir-off-limits to you")) + f.close() + self.g.clients[0].blacklist.last_mtime -= 2.0 + d.addCallback(_block_dir) + d.addCallback(lambda ign: + self.shouldHTTPError("_get_from_blacklisted_uri 2", + 403, 
"Forbidden", + "Access Prohibited: dir-off-limits", + self.GET, self.dir_url_rw)) + d.addCallback(lambda ign: + self.shouldHTTPError("_get_from_blacklisted_uri 3", + 403, "Forbidden", + "Access Prohibited: dir-off-limits", + self.GET, self.dir_url_ro)) + d.addCallback(lambda ign: + self.shouldHTTPError("_get_from_blacklisted_uri 4", + 403, "Forbidden", + "Access Prohibited: dir-off-limits", + self.GET, self.child_url)) + + return d + class CompletelyUnhandledError(Exception): pass class ErrorBoom(rend.Page): diff --git a/src/allmydata/web/common.py b/src/allmydata/web/common.py index 22083a5..07ea487 100644 --- a/src/allmydata/web/common.py +++ b/src/allmydata/web/common.py @@ -6,6 +6,7 @@ from zope.interface import Interface from nevow import loaders, appserver from nevow.inevow import IRequest from nevow.util import resource_filename +from allmydata import blacklist from allmydata.interfaces import ExistingChildError, NoSuchChildError, \ FileTooLargeError, NotEnoughSharesError, NoSharesError, \ EmptyPathnameComponentError, MustBeDeepImmutableError, \ @@ -231,6 +232,9 @@ def humanize_failure(f): "The cap is being passed in a read slot (ro_uri), or was retrieved " "from a read slot as an unknown cap.") % quoted_name return (t, http.BAD_REQUEST) + if f.check(blacklist.FileProhibited): + t = "Access Prohibited: %s" % f.value.reason + return (t, http.FORBIDDEN) if f.check(WebError): return (f.value.text, f.value.code) if f.check(FileTooLargeError): diff --git a/src/allmydata/webish.py b/src/allmydata/webish.py index 03ca3ba..a8e0bff 100644 --- a/src/allmydata/webish.py +++ b/src/allmydata/webish.py @@ -129,7 +129,7 @@ class WebishServer(service.MultiService): name = "webish" def __init__(self, client, webport, nodeurl_path=None, staticdir=None, - clock=None): + clock=None): service.MultiService.__init__(self) # the 'data' argument to all render() methods default to the Client # the 'clock' argument to root.Root is, if set, a